CNTK now supports CUDA 9/cuDNN 7. This requires an update of the build environment to Ubuntu 16/GCC 5 for Linux, and Visual Studio 2017/VCTools 14.11 for Windows. With CUDA 9, CNTK also adds a preview of 16-bit floating-point (a.k.a. FP16) computation.

Please check out the FP16 example for ResNet50 at /Examples/Image/Classification/ResNet/Python/TrainResNet_ImageNet_Distributed.py

Notes on the FP16 preview:
* The FP16 implementation on CPU is not optimized and is not intended for direct CPU inference. Convert the model to 32-bit floating point before running it on the CPU.
* The loss/criterion for FP16 training needs to be 32-bit so that it can accumulate without overflow; use the cast function (see the example above and the sketch after these notes).
* Readers do not produce FP16 output; unless you feed data through numpy, a cast from FP32 to FP16 is needed. Please check the example above.
* FP16 gradient aggregation is currently implemented only on GPUs using NCCL2. Distributed FP16 training with MPI is not supported.
* FP16 math is a subset of the current FP32 implementation. Some models may raise a Feature Not Implemented exception when using FP16.
* FP16 is currently not supported in BrainScript. Please use Python for FP16.
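
The casting pattern used in the example above, condensed into a minimal sketch (the small Dense model, input shapes, and variable names here are illustrative assumptions, not taken from the example script):

    import numpy as np
    import cntk as C

    # Readers and numpy feeds stay in FP32; declare the graph inputs as usual.
    input_var = C.input_variable((784,), name='features')   # illustrative shape
    label_var = C.input_variable((10,), name='labels')

    # Cast the inputs to FP16 and build the model under an FP16 default dtype.
    graph_input = C.cast(input_var, dtype=np.float16)
    graph_label = C.cast(label_var, dtype=np.float16)
    with C.default_options(dtype=np.float16):
        z = C.layers.Dense(10)(graph_input)   # stand-in for a real network such as ResNet

    # Cast the loss/metric back to FP32 so accumulation does not overflow.
    ce = C.cast(C.cross_entropy_with_softmax(z, graph_label), dtype=np.float32)
    pe = C.cast(C.classification_error(z, graph_label), dtype=np.float32)

The cast loss/metric are then passed to the trainer as usual, as in the modified ResNet scripts in this commit.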

To set up the build and runtime environment on Windows:
* Install [Visual Studio 2017](https://www.visualstudio.com/downloads/) with the following workloads and components. From the command line (using the Community edition installer as an example):
    vs_community.exe --add Microsoft.VisualStudio.Workload.NativeDesktop --add Microsoft.VisualStudio.Workload.ManagedDesktop --add Microsoft.VisualStudio.Workload.Universal --add Microsoft.Component.PythonTools --add Microsoft.VisualStudio.Component.VC.Tools.14.11
* Install [NVIDIA CUDA 9](https://developer.nvidia.com/cuda-90-download-archive?target_os=Windows&target_arch=x86_64)
* From PowerShell, run:
    /Tools/devInstall/Windows/DevInstall.ps1
* Start the VCTools 14.11 command line by running:
    cmd /k "%VS2017INSTALLDIR%\VC\Auxiliary\Build\vcvarsall.bat" x64 --vcvars_ver=14.11
* Open /CNTK.sln from the VCTools 14.11 command line. Note that opening CNTK.sln from anywhere other than the VCTools 14.11 command line causes a CUDA 9 [build error](https://developercommunity.visualstudio.com/content/problem/163758/vs-2017-155-doesnt-support-cuda-9.html).

To set up the build and runtime environment on Linux using Docker, please build an Ubuntu 16.04 Docker image using the Dockerfiles under /Tools/docker. For other Linux systems, please refer to the Dockerfiles to set up the dependent libraries for CNTK.
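
Once the environment is built, a quick sanity check from Python (a minimal convenience sketch using the standard CNTK device API) confirms that the library and the GPU are visible:

    import cntk as C

    # Print the installed CNTK version and every device the library can see
    # (the CPU plus any GPUs exposed by the CUDA 9 runtime).
    print(C.__version__)
    for dev in C.device.all_devices():
        print(dev.type(), dev.id())
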
Author: KeDengMS, 2018-01-22 16:58:56 -08:00
Parent: 3765da9ae7
Commit: 3cf3af5df6
297 changed files: 154699 additions and 135575 deletions

.gitattributes (vendored): 1 change
View File

@ -65,6 +65,7 @@ Makefile text
*.asax text
*.h text
*.hpp text
*.cpp text
*.cc text
*.cu text

View File

@ -3,19 +3,10 @@
<Import Project="$(SolutionDir)\CNTK.Common.props" />
<PropertyGroup>
<CudaVersion />
<CudaVersion Condition="Exists('$(CUDA_PATH_V8_0)') And '$(CudaVersion)' == ''">8.0</CudaVersion>
<CudaVersion Condition="Exists('$(CUDA_PATH_V7_5)') And '$(CudaVersion)' == ''">7.5</CudaVersion>
<NvmlInclude />
<NvmlInclude Condition="'$(CudaVersion)' == '7.5'">"c:\Program Files\NVIDIA Corporation\GDK\gdk_win7_amd64_release\nvml\include"</NvmlInclude>
<NvmlInclude Condition="'$(CudaVersion)' == '8.0'" />
<NvmlLibPath />
<NvmlLibPath Condition="'$(CudaVersion)' == '7.5'">"c:\Program Files\NVIDIA Corporation\GDK\gdk_win7_amd64_release\nvml\lib"</NvmlLibPath>
<NvmlLibPath Condition="'$(CudaVersion)' == '8.0'" />
<CudaVersion Condition="Exists('$(CUDA_PATH_V9_0)') And '$(CudaVersion)' == ''">9.0</CudaVersion>
<NvmlDll>%ProgramW6432%\NVIDIA Corporation\NVSMI\nvml.dll</NvmlDll>
<NvmlDll Condition="Exists('c:\local\bindrop\NVSMI\nvml.dll')">c:\local\bindrop\NVSMI\nvml.dll</NvmlDll>
<NvmlDll Condition="Exists('c:\local\nvsmi9\NVSMI\nvml.dll')">c:\local\nvsmi9\NVSMI\nvml.dll</NvmlDll>
<HasOpenCv>false</HasOpenCv>
<HasOpenCv Condition="Exists('$(OPENCV_PATH)') Or Exists('$(OPENCV_PATH_V31)')">true</HasOpenCv>
@ -65,16 +56,20 @@
<PropertyGroup Condition="!$(IsUWP)">
<MathLibrary>MKL</MathLibrary>
<MathIncludePath>$(MKLML_PATH)\include</MathIncludePath>
<MathIncludePath>$(MKL_PATH)\include</MathIncludePath>
<MathDefine>USE_MKL</MathDefine>
<!-- Only non-UWP configurations consume PerformanceProfiler -->
<ReaderLibs>Cntk.PerformanceProfiler-$(CntkComponentVersion).lib;$(ReaderLibs)</ReaderLibs>
<MathLibraryName>MKL-ML Library</MathLibraryName>
<MathLibraryPath>$(MKLML_PATH)\lib</MathLibraryPath>
<MathLibraryName>MKL Library</MathLibraryName>
<MathLibraryPath>$(MKL_PATH)\lib</MathLibraryPath>
<MathLinkLibrary>mklml.lib</MathLinkLibrary>
<MathDelayLoad>mklml.dll</MathDelayLoad>
<MathPostBuildCopyPattern>$(MathLibraryPath)\*.dll</MathPostBuildCopyPattern>
<UnitTestDlls>$(OutDir)mklml.lib;$(OutDir)libiomp5md.dll;</UnitTestDlls>
<HasMklDnn>false</HasMklDnn>
<HasMklDnn Condition="Exists('$(MKL_PATH)\include\mkldnn.h')">true</HasMklDnn>
<MathDefine Condition="$(HasMklDnn)">$(MathDefine);USE_MKLDNN</MathDefine>
<MathLinkLibrary Condition="$(HasMklDnn)">$(MathLinkLibrary);mkldnn.lib</MathLinkLibrary>
<MathDelayLoad Condition="$(HasMklDnn)">$(MathDelayLoad);mkldnn.dll</MathDelayLoad>
</PropertyGroup>
<PropertyGroup Condition="$(UseZip)">
<ZipInclude>$(ZLIB_PATH)\include;$(ZLIB_PATH)\lib\libzip\include;</ZipInclude>
@ -109,31 +104,19 @@
<ProtobufLib Condition="$(DebugBuild)">libprotobufd.lib</ProtobufLib>
</PropertyGroup>
<PropertyGroup Condition="'$(CudaVersion)' == '8.0'">
<CudaPath>$(CUDA_PATH_V8_0)</CudaPath>
<CudaRuntimeDll>cudart64_80.dll</CudaRuntimeDll>
<CudaDlls>cublas64_80.dll;cusparse64_80.dll;curand64_80.dll;$(CudaRuntimeDll)</CudaDlls>
<PropertyGroup Condition="'$(CudaVersion)' == '9.0'">
<CudaPath>$(CUDA_PATH_V9_0)</CudaPath>
<CudaRuntimeDll>cudart64_90.dll</CudaRuntimeDll>
<CudaDlls>cublas64_90.dll;cusparse64_90.dll;curand64_90.dll;$(CudaRuntimeDll)</CudaDlls>
<!-- Use NvidiaCompute to define nvcc target architectures (will generate code to support them all, i.e. fat-binary, in release mode)
In debug mode we only include cubin/PTX for 30 and rely on PTX / JIT to generate the required native cubin format
http://docs.nvidia.com/cuda/pascal-compatibility-guide/index.html#building-applications-with-pascal-support -->
<NvidiaCompute Condition="$(DebugBuild)">$(CNTK_CUDA_CODEGEN_DEBUG)</NvidiaCompute>
<NvidiaCompute Condition="$(DebugBuild) And '$(NvidiaCompute)'==''">compute_30,sm_30</NvidiaCompute>
<NvidiaCompute Condition="$(ReleaseBuild)">$(CNTK_CUDA_CODEGEN_RELEASE)</NvidiaCompute>
<NvidiaCompute Condition="$(ReleaseBuild) And '$(NvidiaCompute)'==''">compute_30,sm_30;compute_35,sm_35;compute_50,sm_50;compute_60,sm_60;compute_61,sm_61</NvidiaCompute>
</PropertyGroup>
<PropertyGroup Condition="'$(CudaVersion)' == '7.5'">
<CudaPath>$(CUDA_PATH_V7_5)</CudaPath>
<CudaRuntimeDll>cudart64_75.dll</CudaRuntimeDll>
<CudaDlls>cublas64_75.dll;cusparse64_75.dll;curand64_75.dll;$(CudaRuntimeDll)</CudaDlls>
<NvidiaCompute Condition="$(DebugBuild)">$(CNTK_CUDA_CODEGEN_DEBUG)</NvidiaCompute>
<NvidiaCompute Condition="$(DebugBuild) And '$(NvidiaCompute)'==''">compute_30,sm_30</NvidiaCompute>
<NvidiaCompute Condition="$(ReleaseBuild)">$(CNTK_CUDA_CODEGEN_RELEASE)</NvidiaCompute>
<NvidiaCompute Condition="$(ReleaseBuild) And '$(NvidiaCompute)'==''">compute_30,sm_30;compute_35,sm_35;compute_50,sm_50</NvidiaCompute>
<NvidiaCompute Condition="$(ReleaseBuild) And '$(NvidiaCompute)'==''">compute_30,sm_30;compute_35,sm_35;compute_50,sm_50;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70</NvidiaCompute>
</PropertyGroup>
<PropertyGroup>
@ -144,11 +127,14 @@
<CudaMsbuildPath Condition="'$(CudaMsbuildPath)' == ''">$(VCTargetsPath)\BuildCustomizations</CudaMsbuildPath>
</PropertyGroup>
<PropertyGroup>
<PlatformToolset>v141</PlatformToolset>
</PropertyGroup>
<!-- TODO warn if ConfigurationType not (yet) defined -->
<PropertyGroup Condition="'$(ConfigurationType)' == 'StaticLibrary'">
<UseDebugLibraries>$(DebugBuild)</UseDebugLibraries>
<PlatformToolset>v140</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
<WholeProgramOptimization>$(ReleaseBuild)</WholeProgramOptimization>
<LinkIncremental>$(DebugBuild)</LinkIncremental>
@ -159,6 +145,7 @@
<PreprocessorDefinitions>CNTK_COMPONENT_VERSION="$(CntkComponentVersion)"</PreprocessorDefinitions>
<!-- UWP does not use MPI -->
<PreprocessorDefinitions Condition="!$(IsUWP)">%(PreprocessorDefinitions);HAS_MPI=1</PreprocessorDefinitions>
<PreprocessorDefinitions Condition="'$(CudaVersion)' == '9.0'">%(PreprocessorDefinitions);CUDA_NO_HALF;__CUDA_NO_HALF_OPERATORS__</PreprocessorDefinitions>
</ClCompile>
</ItemDefinitionGroup>

CNTK.sln: 780 changes
The diff for this file is not shown because it is too large.

View File

@ -1,5 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
@ -39,13 +39,11 @@
<PropertyGroup Condition="$(DebugBuild)" Label="Configuration">
<ConfigurationType>DynamicLibrary</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>v140</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="$(ReleaseBuild)" Label="Configuration">
<ConfigurationType>DynamicLibrary</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>v140</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>

View File

@ -1,5 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
@ -39,13 +39,11 @@
<PropertyGroup Condition="$(DebugBuild)" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>v140</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="$(ReleaseBuild)" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>v140</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>

View File

@ -1,5 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
@ -39,13 +39,11 @@
<PropertyGroup Condition="$(DebugBuild)" Label="Configuration">
<ConfigurationType>DynamicLibrary</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>v140</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="$(ReleaseBuild)" Label="Configuration">
<ConfigurationType>DynamicLibrary</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>v140</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>

View File

@ -1,5 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
@ -42,13 +42,11 @@
<PropertyGroup Condition="$(DebugBuild)" Label="Configuration">
<ConfigurationType>DynamicLibrary</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>v140</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="$(ReleaseBuild)" Label="Configuration">
<ConfigurationType>DynamicLibrary</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>v140</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>

View File

@ -56,7 +56,7 @@ def create_image_mb_source(map_file, mean_file, train, total_number_of_samples):
# Train and evaluate the network.
def train_and_evaluate(reader_train, reader_test, network_name, epoch_size, max_epochs, profiler_dir=None,
model_dir=None, log_dir=None, tensorboard_logdir=None, gen_heartbeat=False):
model_dir=None, log_dir=None, tensorboard_logdir=None, gen_heartbeat=False, fp16=False):
set_computation_network_trace_level(0)
@ -64,19 +64,32 @@ def train_and_evaluate(reader_train, reader_test, network_name, epoch_size, max_
input_var = C.input_variable((num_channels, image_height, image_width), name='features')
label_var = C.input_variable((num_classes))
# create model, and configure learning parameters
if network_name == 'resnet20':
z = create_cifar10_model(input_var, 3, num_classes)
lr_per_mb = [1.0]*80 + [0.1]*40 + [0.01]
elif network_name == 'resnet110':
z = create_cifar10_model(input_var, 18, num_classes)
lr_per_mb = [0.1]*1 + [1.0]*80 + [0.1]*40 + [0.01]
dtype = np.float16 if fp16 else np.float32
if fp16:
graph_input = C.cast(input_var, dtype=np.float16)
graph_label = C.cast(label_var, dtype=np.float16)
else:
raise RuntimeError("Unknown model name!")
graph_input = input_var
graph_label = label_var
# loss and metric
ce = cross_entropy_with_softmax(z, label_var)
pe = classification_error(z, label_var)
with C.default_options(dtype=dtype):
# create model, and configure learning parameters
if network_name == 'resnet20':
z = create_cifar10_model(graph_input, 3, num_classes)
lr_per_mb = [1.0]*80 + [0.1]*40 + [0.01]
elif network_name == 'resnet110':
z = create_cifar10_model(graph_input, 18, num_classes)
lr_per_mb = [0.1]*1 + [1.0]*80 + [0.1]*40 + [0.01]
else:
raise RuntimeError("Unknown model name!")
# loss and metric
ce = cross_entropy_with_softmax(z, graph_label)
pe = classification_error(z, graph_label)
if fp16:
ce = C.cast(ce, dtype=np.float32)
pe = C.cast(pe, dtype=np.float32)
# shared training parameters
minibatch_size = 128
@ -168,6 +181,7 @@ if __name__=='__main__':
parser.add_argument('-outputdir', '--outputdir', help='Output directory for checkpoints and models', required=False, default=None)
parser.add_argument('-logdir', '--logdir', help='Log file', required=False, default=None)
parser.add_argument('-genheartbeat', '--genheartbeat', help="Turn on heart-beat for philly", action='store_true', default=False)
parser.add_argument('-fp16', '--fp16', help="use float16", action='store_true', default=False)
args = vars(parser.parse_args())
epochs = args['epochs']
@ -184,4 +198,4 @@ if __name__=='__main__':
reader_test = create_image_mb_source(os.path.join(data_path, 'test_map.txt'), os.path.join(data_path, 'CIFAR-10_mean.xml'), False, total_number_of_samples=C.io.FULL_DATA_SWEEP)
train_and_evaluate(reader_train, reader_test, network_name, epoch_size, epochs, args['profiler_dir'], model_dir,
args['logdir'], args['tensorboard_logdir'], gen_heartbeat=args['genheartbeat'])
args['logdir'], args['tensorboard_logdir'], gen_heartbeat=args['genheartbeat'], fp16=args['fp16'])

View File

@ -37,22 +37,35 @@ num_classes = 10
model_name = "ResNet_CIFAR10_DataAug.model"
# Create network
def create_resnet_network(network_name):
def create_resnet_network(network_name, fp16):
# Input variables denoting the features and label data
input_var = C.input_variable((num_channels, image_height, image_width))
label_var = C.input_variable((num_classes))
# create model, and configure learning parameters
if network_name == 'resnet20':
z = create_cifar10_model(input_var, 3, num_classes)
elif network_name == 'resnet110':
z = create_cifar10_model(input_var, 18, num_classes)
dtype = np.float16 if fp16 else np.float32
if fp16:
graph_input = C.cast(input_var, dtype=np.float16)
graph_label = C.cast(label_var, dtype=np.float16)
else:
return RuntimeError("Unknown model name!")
graph_input = input_var
graph_label = label_var
# loss and metric
ce = cross_entropy_with_softmax(z, label_var)
pe = classification_error(z, label_var)
with C.default_options(dtype=dtype):
# create model, and configure learning parameters
if network_name == 'resnet20':
z = create_cifar10_model(graph_input, 3, num_classes)
elif network_name == 'resnet110':
z = create_cifar10_model(graph_input, 18, num_classes)
else:
return RuntimeError("Unknown model name!")
# loss and metric
ce = cross_entropy_with_softmax(z, graph_label)
pe = classification_error(z, graph_label)
if fp16:
ce = C.cast(ce, dtype=np.float32)
pe = C.cast(pe, dtype=np.float32)
return {
'name' : network_name,
@ -120,7 +133,7 @@ def train_and_test(network, trainer, train_source, test_source, minibatch_size,
# Train and evaluate the network.
def resnet_cifar10(train_data, test_data, mean_data, network_name, epoch_size, num_quantization_bits=32, block_size=None, warm_up=0,
max_epochs=160, restore=True, log_to_file=None, num_mbs_per_log=None, gen_heartbeat=False, scale_up=False, profiling=False):
max_epochs=160, restore=True, log_to_file=None, num_mbs_per_log=None, gen_heartbeat=False, scale_up=False, profiling=False, fp16=False):
set_computation_network_trace_level(0)
@ -138,7 +151,7 @@ def resnet_cifar10(train_data, test_data, mean_data, network_name, epoch_size, n
gen_heartbeat=gen_heartbeat,
num_epochs=max_epochs)
network = create_resnet_network(network_name)
network = create_resnet_network(network_name, fp16)
trainer = create_trainer(network, minibatch_size, epoch_size, num_quantization_bits, block_size, warm_up, progress_printer)
train_source = create_image_mb_source(train_data, mean_data, train=True, total_number_of_samples=max_epochs * epoch_size)
test_source = create_image_mb_source(test_data, mean_data, train=False, total_number_of_samples=C.io.FULL_DATA_SWEEP)
@ -162,6 +175,7 @@ if __name__=='__main__':
parser.add_argument('-r', '--restart', help='Indicating whether to restart from scratch (instead of restart from checkpoint file by default)', action='store_true')
parser.add_argument('-device', '--device', type=int, help="Force to run the script on a specified device", required=False, default=None)
parser.add_argument('-profile', '--profile', help="Turn on profiling", action='store_true', default=False)
parser.add_argument('-fp16', '--fp16', help="use float16", action='store_true', default=False)
args = vars(parser.parse_args())
@ -201,7 +215,8 @@ if __name__=='__main__':
restore=not args['restart'],
scale_up=scale_up,
log_to_file=args['logdir'],
profiling=args['profile'])
profiling=args['profile'],
fp16=args['fp16'])
# Must call MPI finalize when process exit without exceptions
Communicator.finalize()

View File

@ -63,32 +63,46 @@ def create_image_mb_source(map_file, mean_file, train, total_number_of_samples):
multithreaded_deserializer=True)
# Create network
def create_resnet_network(network_name):
def create_resnet_network(network_name, fp16):
# Input variables denoting the features and label data
input_var = C.input_variable((num_channels, image_height, image_width))
label_var = C.input_variable((num_classes))
stride1x1 = (1, 1)
stride3x3 = (2, 2)
# create model, and configure learning parameters
if network_name == 'resnet18':
z = create_imagenet_model_basic(input_var, [2, 1, 1, 2], num_classes)
elif network_name == 'resnet34':
z = create_imagenet_model_basic(input_var, [3, 3, 5, 2], num_classes)
elif network_name == 'resnet50':
z = create_imagenet_model_bottleneck(input_var, [2, 3, 5, 2], num_classes, stride1x1, stride3x3)
elif network_name == 'resnet101':
z = create_imagenet_model_bottleneck(input_var, [2, 3, 22, 2], num_classes, stride1x1, stride3x3)
elif network_name == 'resnet152':
z = create_imagenet_model_bottleneck(input_var, [2, 7, 35, 2], num_classes, stride1x1, stride3x3)
dtype = np.float16 if fp16 else np.float32
if fp16:
graph_input = C.cast(input_var, dtype=np.float16)
graph_label = C.cast(label_var, dtype=np.float16)
else:
return RuntimeError("Unknown model name!")
graph_input = input_var
graph_label = label_var
# loss and metric
ce = cross_entropy_with_softmax(z, label_var)
errs = classification_error(z, label_var, topN=1)
top5Errs = classification_error(z, label_var, topN=5)
with C.default_options(dtype=dtype):
stride1x1 = (1, 1)
stride3x3 = (2, 2)
# create model, and configure learning parameters
if network_name == 'resnet18':
z = create_imagenet_model_basic(graph_input, [2, 1, 1, 2], num_classes)
elif network_name == 'resnet34':
z = create_imagenet_model_basic(graph_input, [3, 3, 5, 2], num_classes)
elif network_name == 'resnet50':
z = create_imagenet_model_bottleneck(graph_input, [2, 3, 5, 2], num_classes, stride1x1, stride3x3)
elif network_name == 'resnet101':
z = create_imagenet_model_bottleneck(graph_input, [2, 3, 22, 2], num_classes, stride1x1, stride3x3)
elif network_name == 'resnet152':
z = create_imagenet_model_bottleneck(graph_input, [2, 7, 35, 2], num_classes, stride1x1, stride3x3)
else:
return RuntimeError("Unknown model name!")
# loss and metric
ce = cross_entropy_with_softmax(z, graph_label)
errs = classification_error(z, graph_label, topN=1)
top5Errs = classification_error(z, graph_label, topN=5)
if fp16:
ce = C.cast(ce, dtype=np.float32)
errs = C.cast(errs, dtype=np.float32)
top5Errs = C.cast(top5Errs, dtype=np.float32)
return {
'name' : network_name,
@ -156,7 +170,7 @@ def train_and_test(network, trainer, train_source, test_source, minibatch_size,
# Train and evaluate the network.
def resnet_imagenet(train_data, test_data, mean_data, network_name, epoch_size, num_quantization_bits=32, block_size=None, warm_up=0,
max_epochs=90, restore=True, log_to_file=None, num_mbs_per_log=100, gen_heartbeat=False, scale_up=False, profiling=False):
max_epochs=90, restore=True, log_to_file=None, num_mbs_per_log=100, gen_heartbeat=False, scale_up=False, profiling=False, fp16=False):
set_computation_network_trace_level(0)
@ -174,7 +188,7 @@ def resnet_imagenet(train_data, test_data, mean_data, network_name, epoch_size,
gen_heartbeat=gen_heartbeat,
num_epochs=max_epochs)
network = create_resnet_network(network_name)
network = create_resnet_network(network_name, fp16)
trainer = create_trainer(network, minibatch_size, epoch_size, num_quantization_bits, block_size, warm_up, progress_printer)
train_source = create_image_mb_source(train_data, mean_data, train=True, total_number_of_samples=max_epochs * epoch_size)
test_source = create_image_mb_source(test_data, mean_data, train=False, total_number_of_samples=C.io.FULL_DATA_SWEEP)
@ -198,6 +212,7 @@ if __name__=='__main__':
parser.add_argument('-r', '--restart', help='Indicating whether to restart from scratch (instead of restart from checkpoint file by default)', action='store_true', default=False)
parser.add_argument('-device', '--device', type=int, help="Force to run the script on a specified device", required=False, default=None)
parser.add_argument('-profile', '--profile', help="Turn on profiling", action='store_true', default=False)
parser.add_argument('-fp16', '--fp16', help="use float16", action='store_true', default=False)
args = vars(parser.parse_args())
@ -237,7 +252,8 @@ if __name__=='__main__':
restore=not args['restart'],
scale_up=scale_up,
log_to_file=args['logdir'],
profiling=args['profile'])
profiling=args['profile'],
fp16=args['fp16'])
# Must call MPI finalize when process exit without exceptions
Communicator.finalize()

View File

@ -11,8 +11,6 @@
<OutputPath>.</OutputPath>
<ProjectTypeGuids>{888888a0-9f3d-457c-b088-3a5042f75d52}</ProjectTypeGuids>
<LaunchProvider>Standard Python launcher</LaunchProvider>
<InterpreterId>{4aa09c20-bd30-41b0-b691-0beca389326c}</InterpreterId>
<InterpreterVersion>3.5</InterpreterVersion>
<EnableNativeCodeDebugging>False</EnableNativeCodeDebugging>
<IsWindowsApplication>False</IsWindowsApplication>
</PropertyGroup>
@ -20,7 +18,6 @@
<PropertyGroup Condition="'$(Configuration)' == 'Release'" />
<PropertyGroup>
<VisualStudioVersion Condition=" '$(VisualStudioVersion)' == '' ">10.0</VisualStudioVersion>
<PtvsTargetsFile>$(MSBuildExtensionsPath32)\Microsoft\VisualStudio\v$(VisualStudioVersion)\Python Tools\Microsoft.PythonTools.targets</PtvsTargetsFile>
</PropertyGroup>
<ItemGroup>
<Compile Include="..\bindings\python\cntk\ops\tests\block_test.py">
@ -280,6 +277,9 @@
<ItemGroup>
<Folder Include="1stSteps\" />
<Folder Include="common\" />
<Folder Include="dToEndTests\" />
<Folder Include="dToEndTests\cntkv2Python\" />
<Folder Include="dToEndTests\cntkv2Python\Examples\" />
<Folder Include="Image\" />
<Folder Include="Image\Classification\" />
<Folder Include="Image\Classification\ConvNet\" />
@ -343,10 +343,6 @@
<Folder Include="ToEndTests\cntkv2Python\" />
<Folder Include="ToEndTests\cntkv2Python\Examples\" />
</ItemGroup>
<ItemGroup>
<InterpreterReference Include="{4aa09c20-bd30-41b0-b691-0beca389326c}\3.5" />
<InterpreterReference Include="{c2abc240-2da9-4409-8b09-0ee30a5b56e2}\3.4" />
</ItemGroup>
<ItemGroup>
<Content Include="Text\CharacterLM\data\tinyshakespeare.txt" />
<Content Include="Text\CharacterLM\README.md" />
@ -355,6 +351,5 @@
<Content Include="Text\WordLMWithSampledSoftmax\test\text.txt" />
<Content Include="Text\WordLMWithSampledSoftmax\test\token2id.txt" />
</ItemGroup>
<Import Project="$(PtvsTargetsFile)" Condition="Exists($(PtvsTargetsFile))" />
<Import Project="$(MSBuildToolsPath)\Microsoft.Common.targets" Condition="!Exists($(PtvsTargetsFile))" />
</Project>
<Import Project="$(MSBuildExtensionsPath32)\Microsoft\VisualStudio\v$(VisualStudioVersion)\Python Tools\Microsoft.PythonTools.targets" />
</Project>

View File

@ -45,7 +45,7 @@
# defaults to /usr/local/mpi
# These can be overridden on the command line, e.g. make BUILDTYPE=debug
# TODO: Build static libraries for common dependencies that are shared by multiple
# TODO: Build static libraries for common dependencies that are shared by multiple
# targets, e.g. eval and CNTK.
ARCH=$(shell uname)
@ -90,8 +90,8 @@ SOURCEDIR:= Source
INCLUDEPATH:= $(addprefix $(SOURCEDIR)/, Common/Include CNTKv2LibraryDll CNTKv2LibraryDll/API CNTKv2LibraryDll/proto ../Examples/Extensibility/CPP Math CNTK ActionsLib ComputationNetworkLib SGDLib SequenceTrainingLib CNTK/BrainScript Readers/ReaderLib PerformanceProfilerDll)
INCLUDEPATH+=$(PROTOBUF_PATH)/include
# COMMON_FLAGS include settings that are passed both to NVCC and C++ compilers.
COMMON_FLAGS:= -DHAS_MPI=$(HAS_MPI) -D_POSIX_SOURCE -D_XOPEN_SOURCE=600 -D__USE_XOPEN2K -std=c++11
CPPFLAGS:=
COMMON_FLAGS:= -DHAS_MPI=$(HAS_MPI) -D_POSIX_SOURCE -D_XOPEN_SOURCE=600 -D__USE_XOPEN2K -std=c++11 -DCUDA_NO_HALF -D__CUDA_NO_HALF_OPERATORS__
CPPFLAGS:=
CXXFLAGS:= $(SSE_FLAGS) -std=c++0x -fopenmp -fpermissive -fPIC -Werror -fcheck-new
LIBPATH:=
LIBS_LIST:=
@ -168,10 +168,10 @@ endif
ifeq ("$(MATHLIB)","mkl")
INCLUDEPATH += $(MKL_PATH)/include
LIBS_LIST += m iomp5 pthread mklml_intel
LIBS_LIST += m iomp5 pthread mklml_intel mkldnn
MKL_LIB_PATH := $(MKL_PATH)/lib
LIBPATH += $(MKL_LIB_PATH)
COMMON_FLAGS += -DUSE_MKL
COMMON_FLAGS += -DUSE_MKL -DUSE_MKLDNN
endif
ifeq ($(CUDA_GDR),1)
@ -212,6 +212,7 @@ GENCODE_SM50 := -gencode arch=compute_50,code=\"sm_50,compute_50\"
GENCODE_SM52 := -gencode arch=compute_52,code=\"sm_52,compute_52\"
GENCODE_SM60 := -gencode arch=compute_60,code=\"sm_60,compute_60\"
GENCODE_SM61 := -gencode arch=compute_61,code=\"sm_61,compute_61\"
GENCODE_SM70 := -gencode arch=compute_70,code=\"sm_70,compute_70\"
# Should we relocate *.gcno and *.gcda files using -fprofile-dir option?
# Use GCOV_PREFIX and GCOV_PREFIX_STRIP if relocating:
@ -241,7 +242,7 @@ ifeq ("$(BUILDTYPE)","release")
ifdef CNTK_CUDA_CODEGEN_RELEASE
GENCODE_FLAGS := $(CNTK_CUDA_CODEGEN_RELEASE)
else
GENCODE_FLAGS := $(GENCODE_SM30) $(GENCODE_SM35) $(GENCODE_SM50) $(GENCODE_SM60) $(GENCODE_SM61)
GENCODE_FLAGS := $(GENCODE_SM30) $(GENCODE_SM35) $(GENCODE_SM50) $(GENCODE_SM60) $(GENCODE_SM61) $(GENCODE_SM70)
endif
CXXFLAGS += -g -O4
@ -365,6 +366,10 @@ MATH_SRC =\
$(SOURCEDIR)/Math/CUDAPageLockedMemAllocator.cpp \
$(SOURCEDIR)/Math/CPUMatrixFloat.cpp \
$(SOURCEDIR)/Math/CPUMatrixDouble.cpp \
$(SOURCEDIR)/Math/CPUMatrixHalf.cpp \
$(SOURCEDIR)/Math/CPUMatrixTensorFloat.cpp \
$(SOURCEDIR)/Math/CPUMatrixTensorDouble.cpp \
$(SOURCEDIR)/Math/CPUMatrixTensorHalf.cpp \
$(SOURCEDIR)/Math/CPURNGHandle.cpp \
$(SOURCEDIR)/Math/CPUSparseMatrix.cpp \
$(SOURCEDIR)/Math/ConvolutionEngine.cpp \
@ -415,7 +420,7 @@ $(CNTKMATH_LIB): $(MATH_OBJ) | $(PERF_PROFILER_LIB)
$(CXX) $(LDFLAGS) -shared $(patsubst %,-L%, $(LIBPATH) $(LIBDIR) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -fopenmp -l$(PERF_PROFILER)
# Any executable using Common or ReaderLib needs to link these libraries.
# Any executable using Common or ReaderLib needs to link these libraries.
READER_LIBS := $(CNTKMATH_LIB) $(PERF_PROFILER_LIB)
L_READER_LIBS := -l$(CNTKMATH) -l$(PERF_PROFILER)
@ -711,7 +716,7 @@ $(CNTKLIBRARY_CPP_EVAL_EXAMPLES): $(CNTKLIBRARY_CPP_EVAL_EXAMPLES_OBJ) | $(CNTKL
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKLIBRARY) $(L_READER_LIBS)
########################################
# Eval V2 Sample test
# Eval V2 Sample test
########################################
CNTKLIBRARY_CPP_EVAL_TEST:=$(BINDIR)/CNTKLibraryCPPEvalExamplesTest
@ -1047,13 +1052,13 @@ ifeq (,$(wildcard Source/1BitSGD/*.h))
$(error Build with 1bit-SGD was requested but cannot find the code. Please check https://docs.microsoft.com/en-us/cognitive-toolkit/Enabling-1bit-SGD for instructions)
endif
INCLUDEPATH += $(SOURCEDIR)/1BitSGD
INCLUDEPATH += $(SOURCEDIR)/1BitSGD
COMMON_FLAGS += -DCNTK_PARALLEL_TRAINING_SUPPORT
# temporarily adding to 1bit, need to work with others to fix it
endif
########################################
# ASGD(multiverso) setup
########################################
@ -1081,7 +1086,7 @@ MULTIVERSO_CMAKE_BUILDTYPE=Debug
endif
# TODO need to align Multiverso OpenMP with the one we use (libiomp). For now, disabled.
$(MULTIVERSO_LIB):
$(MULTIVERSO_LIB):
@echo "Build Multiverso lib"
@mkdir -p $(LIBDIR)
@mkdir -p $(BINDIR)
@ -1322,6 +1327,7 @@ UNITTEST_MATH_SRC = \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/MatrixTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/MatrixLearnerTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/stdafx.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/HalfGPUTests.cpp \
UNITTEST_MATH_SRC += $(CNTK_COMMON_SRC)
UNITTEST_MATH_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(UNITTEST_MATH_SRC))
@ -1530,13 +1536,13 @@ $(OBJDIR)/%.o : %.cu $(BUILD_CONFIGURATION)
@mkdir -p $(dir $@)
$(NVCC) -c $< -o $@ $(COMMON_FLAGS) $(CUFLAGS) $(INCLUDEPATH:%=-I%) -Xcompiler "-fPIC -Werror"
$(OBJDIR)/%.pb.o : %.pb.cc $(BUILD_CONFIGURATION)
$(OBJDIR)/%.pb.o : %.pb.cc $(BUILD_CONFIGURATION)
@echo $(SEPARATOR)
@echo creating $@ for $(ARCH) with build type $(BUILDTYPE)
@mkdir -p $(dir $@)
$(CXX) -c $< -o $@ $(COMMON_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(INCLUDEPATH:%=-I%) -MD -MP -MF ${@:.o=.d}
$(OBJDIR)/%.o : %.cpp $(BUILD_CONFIGURATION)
$(OBJDIR)/%.o : %.cpp $(BUILD_CONFIGURATION)
@echo $(SEPARATOR)
@echo creating $@ for $(ARCH) with build type $(BUILDTYPE)
@mkdir -p $(dir $@)

View File

@ -2,6 +2,32 @@
## Latest news
***2018-01-22.*** CNTK support for CUDA 9
CNTK now supports CUDA 9/cuDNN 7. This requires an update of the build environment to Ubuntu 16/GCC 5 for Linux, and Visual Studio 2017/VCTools 14.11 for Windows. With CUDA 9, CNTK also adds a preview of 16-bit floating-point (a.k.a. FP16) computation.
Please check out the FP16 example for ResNet50 [here](./Examples/Image/Classification/ResNet/Python/TrainResNet_ImageNet_Distributed.py)
Notes on the FP16 preview:
* The FP16 implementation on CPU is not optimized and is not intended for direct CPU inference. Convert the model to 32-bit floating point before running it on the CPU.
* The loss/criterion for FP16 training needs to be 32-bit so that it can accumulate without overflow; use the cast function. Please check the example above.
* Readers do not produce FP16 output; unless you feed data through numpy, a cast from FP32 to FP16 is needed. Please check the example above.
* FP16 gradient aggregation is currently implemented only on GPUs using NCCL2. Distributed FP16 training with MPI is not supported.
* FP16 math is a subset of the current FP32 implementation. Some models may raise a Feature Not Implemented exception when using FP16.
* FP16 is currently not supported in BrainScript. Please use Python for FP16.
To set up the build and runtime environment on Windows:
* Install [Visual Studio 2017](https://www.visualstudio.com/downloads/) with the following workloads and components. From the command line (using the Community edition installer as an example):
vs_community.exe --add Microsoft.VisualStudio.Workload.NativeDesktop --add Microsoft.VisualStudio.Workload.ManagedDesktop --add Microsoft.VisualStudio.Workload.Universal --add Microsoft.Component.PythonTools --add Microsoft.VisualStudio.Component.VC.Tools.14.11
* Install [NVIDIA CUDA 9](https://developer.nvidia.com/cuda-90-download-archive?target_os=Windows&target_arch=x86_64)
* From PowerShell, run:
[DevInstall.ps1](./Tools/devInstall/Windows/DevInstall.ps1)
* Start the VCTools 14.11 command line by running:
cmd /k "%VS2017INSTALLDIR%\VC\Auxiliary\Build\vcvarsall.bat" x64 --vcvars_ver=14.11
* Open [CNTK.sln](./CNTK.sln) from the VCTools 14.11 command line. Note that opening CNTK.sln from anywhere other than the VCTools 14.11 command line causes a CUDA 9 [build error](https://developercommunity.visualstudio.com/content/problem/163758/vs-2017-155-doesnt-support-cuda-9.html).
To set up the build and runtime environment on Linux using Docker, please build an Ubuntu 16.04 Docker image using the Dockerfiles [here](./Tools/docker). For other Linux systems, please refer to the Dockerfiles to set up the dependent libraries for CNTK.
***2017-12-05.* CNTK 2.3.1**
Release of Cognitive Toolkit v.2.3.1.

@ -1 +1 @@
Subproject commit 70d0c096a11e3b370d1b529f098f9711a7966090
Subproject commit fb72ecb8c4d4e1d2e59b200ad1092331a372edea

View File

@ -1,5 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
@ -140,4 +140,4 @@
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets" />
</Project>
</Project>

View File

@ -291,7 +291,7 @@ public:
// map all to lowercase
std::wstring lvalue = std::wstring(value.begin(), value.end());
std::transform(lvalue.begin(), lvalue.end(), lvalue.begin(), ::tolower); // note: may crash for chars >127. Don't use those.
std::transform(lvalue.begin(), lvalue.end(), lvalue.begin(), [](wchar_t c) { return (wchar_t)::tolower(c); }); // note: may crash for chars >127. Don't use those.
// add to the respective node group
m_net->AddToNodeGroup(lvalue, compNode);

View File

@ -1,5 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
@ -33,13 +33,11 @@
<PropertyGroup Condition="$(DebugBuild)" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>v140</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="$(ReleaseBuild)" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>v140</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>

View File

@ -31,6 +31,7 @@
#endif
#include "CNTKLibraryInternals.h"
#include "HalfConverter.hpp"
// undef max in the rest of the file to avoid conflicts with the max macro defined in windows.h.
#pragma push_macro("max")
@ -38,6 +39,43 @@
namespace CNTK
{
class float16
{
protected:
unsigned short __x;
public:
float16() = default;
float16(const float16& other) { __x = other.__x; }
#ifndef SWIG
// construction from build-in types
float16(float f) { floatToFloat16(&f, &__x); }
float16(double d) : float16((float)d) {}
float16(int i) : float16((float)i) {}
float16(size_t u) : float16((float)u) {}
// cast to build-in types
operator float() const { float f; float16ToFloat(&__x, &f); return f; }
// compare functions
inline bool operator==(const float16& rhs) const { return (__x == rhs.__x); }
inline bool operator!=(const float16& rhs) const { return (__x != rhs.__x); }
#endif
static float16 create(float f)
{
float16 v;
floatToFloat16(&f, &v.__x);
return v;
}
static float16 create(double d)
{
return create((float)d);
}
};
///
/// Enumeration type denoting data type of symbolic data entities or actual data.
///
@ -47,6 +85,7 @@ namespace CNTK
Float = 1,
Double = 2,
UChar = 3, // So far only used internally in deserializers.
Float16 = 4,
/* TODO:
Bit,
@ -74,6 +113,8 @@ namespace CNTK
return DataType::Float;
else if (std::is_same<ElementType, double>())
return DataType::Double;
else if (std::is_same<ElementType, float16>())
return DataType::Float16;
else
NOT_IMPLEMENTED;
}
@ -84,6 +125,8 @@ namespace CNTK
return "Float";
else if (dataType == DataType::Double)
return "Double";
else if (dataType == DataType::Float16)
return "Float16";
else
LogicError("Unknown DataType.");
}
@ -94,6 +137,8 @@ namespace CNTK
return sizeof(float);
else if (dataType == DataType::Double)
return sizeof(double);
else if (dataType == DataType::Float16)
return sizeof(float16);
else
LogicError("Unknown DataType.");
}
@ -621,8 +666,7 @@ namespace CNTK
/// Construct a NDArrayView with newly allocated sparse storage in SparseCSC format on the specified 'device' and initialize its contents
/// with the specified Sparse CSC format data.
///
template <typename ElementType>
CNTK_API NDArrayView(const NDShape& viewShape, const SparseIndexType* colStarts, const SparseIndexType* rowIndices, const ElementType* nonZeroValues, size_t numNonZeroValues, const DeviceDescriptor& device, bool readOnly = false);
CNTK_API NDArrayView(::CNTK::DataType dataType, const NDShape& viewShape, const SparseIndexType* colStarts, const SparseIndexType* rowIndices, const void* nonZeroValues, size_t numNonZeroValues, const DeviceDescriptor& device, bool readOnly = false);
///
/// Construct a NDArrayView over newly allocated storage in the specified format on the specified 'device'.
@ -707,6 +751,9 @@ namespace CNTK
case DataType::Double:
SetValue(value);
break;
case DataType::Float16:
SetValue(float16::create(value));
break;
default:
LogicError("Unsupported DataType %s.", DataTypeName(m_dataType));
break;
@ -801,6 +848,11 @@ namespace CNTK
///
CNTK_API void SetValue(double value);
///
/// Fill 'this' NDArrayView with the specified value. The underlying DataType of 'this' view should be DataType::Double.
///
CNTK_API void SetValue(float16 value);
///
/// Creates a new NDArrayView with newly allocated storage on the specified device and copies 'this' view's contents into the newly allocated view.
///
@ -876,6 +928,25 @@ namespace CNTK
// Disallow copy and move construction and assignment
NDArrayView(const NDArrayView&) = delete; NDArrayView& operator=(const NDArrayView&) = delete; NDArrayView& operator=(NDArrayView&&) = delete; NDArrayView(NDArrayView&& other) = delete;
// template functions connecting V1ElemType and ElementType
template <typename ElementType, typename V1ElemType>
const ElementType* _DataBuffer() const;
template <typename ElementType, typename V1ElemType>
std::tuple<const ElementType *, const SparseIndexType*, const SparseIndexType*, size_t> _SparseCSCDataBuffers() const;
template <typename ElementType, typename V1ElemType>
std::tuple<const void*, const SparseIndexType*, const SparseIndexType*, size_t, size_t, size_t> _SparseBlockColumnDataBuffers() const;
template <typename ElementType, typename V1ElemType>
static NDArrayViewPtr _RandomNormal(const NDShape& shape, double mean, double stdDev, unsigned long seed, const DeviceDescriptor& device);
template <typename ElementType, typename V1ElemType>
static NDArrayViewPtr _RandomUniform(const NDShape& shape, double rangeStart, double rangeEnd, unsigned long seed, const DeviceDescriptor& device);
template<typename ElementType, typename V1ElemType>
ElementType _AsScalar() const;
private:
static const size_t AutoSelectRowColSplitPoint = SIZE_MAX;
@ -891,6 +962,10 @@ namespace CNTK
template <typename ElementType>
std::shared_ptr<Microsoft::MSR::CNTK::Matrix<ElementType>> GetWritableMatrix(size_t rowColSplitPoint = AutoSelectRowColSplitPoint);
std::shared_ptr<const Microsoft::MSR::CNTK::MatrixBase> GetMatrixBase(size_t rowColSplitPoint = AutoSelectRowColSplitPoint) const;
std::shared_ptr<Microsoft::MSR::CNTK::MatrixBase> GetWritableMatrixBase(size_t rowColSplitPoint = AutoSelectRowColSplitPoint);
template <typename ElementType>
const Microsoft::MSR::CNTK::TensorView<ElementType>* GetTensorView() const;
@ -2836,6 +2911,10 @@ namespace CNTK
{
CopyVariableValueToVector<double>(outputVariable, sequences);
}
else if (dataType == DataType::Float16)
{
CopyVariableValueToVector<float16>(outputVariable, sequences);
}
}
///
@ -6385,6 +6464,16 @@ namespace CNTK
///
CNTK_API FunctionPtr Crop(const Variable& nodeInput, const Variable& nodeReferent, const Variable& ancestorInput, const Variable& ancestorReferent, const std::wstring& name = L"");
///
/// Creates an instance of crop node with automatically computed crop offsets and specified ancestor nodes.
/// This is used in cases when input nodes do not have common ancestor in the network.
/// nodeInput: input node to be cropped.
/// nodeReferent: input node which determines the spatial size of output.
/// ancestorInput: ancestor of nodeInput.
/// ancestorReferent: ancestor of nodeReferent which is treated as equal to ancestorInput for the purpose of computing crop offsets.
///
CNTK_API FunctionPtr Cast(const Variable& nodeInput, DataType outputType, const std::wstring& name = L"");
#endif // !CNTK_HEADERONLY_DEFINITIONS
}

View File

@ -41,6 +41,8 @@
// Forward declarations
namespace Microsoft { namespace MSR { namespace CNTK {
struct MatrixBase;
template <typename ElemType>
class Matrix;

View File

@ -0,0 +1,94 @@
//This code is shared between V1 and V2
#pragma once
namespace CNTK {
// Host functions for converting between FP32 and FP16 formats
// TODO: use f16c instructions if available
inline void float16ToFloat(const unsigned short* src, float* res)
{
unsigned h = *src;
unsigned sign = ((h >> 15) & 1);
unsigned exponent = ((h >> 10) & 0x1f);
unsigned mantissa = ((h & 0x3ff) << 13);
if (exponent == 0x1f) { /* NaN or Inf */
mantissa = (mantissa ? (sign = 0, 0x7fffff) : 0);
exponent = 0xff;
}
else if (!exponent) { /* Denorm or Zero */
if (mantissa) {
unsigned int msb;
exponent = 0x71;
do {
msb = (mantissa & 0x400000);
mantissa <<= 1; /* normalize */
--exponent;
} while (!msb);
mantissa &= 0x7fffff; /* 1.mantissa is implicit */
}
}
else {
exponent += 0x70;
}
*(unsigned*)res = ((sign << 31) | (exponent << 23) | mantissa);
}
inline void floatToFloat16(float* src, unsigned short* dest)
{
unsigned x = *(unsigned*)src;
unsigned u = (x & 0x7fffffff), remainder, shift, lsb, lsb_s1, lsb_m1;
unsigned short sign;
unsigned exponent, mantissa;
// Get rid of +NaN/-NaN case first.
if (u > 0x7f800000) {
*dest = 0x7fffU;
return;
}
sign = (unsigned short)((x >> 16) & 0x8000);
// Get rid of +Inf/-Inf, +0/-0.
if (u > 0x477fefff) {
*dest = sign | 0x7c00U;
return;
}
if (u < 0x33000001) {
*dest = (sign | 0x0000);
return;
}
exponent = ((u >> 23) & 0xff);
mantissa = (u & 0x7fffff);
if (exponent > 0x70) {
shift = 13;
exponent -= 0x70;
}
else {
shift = 0x7e - exponent;
exponent = 0;
mantissa |= 0x800000;
}
lsb = (1 << shift);
lsb_s1 = (lsb >> 1);
lsb_m1 = (lsb - 1);
// Round to nearest even.
remainder = (mantissa & lsb_m1);
mantissa >>= shift;
if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) {
++mantissa;
if (!(mantissa & 0x3ff)) {
++exponent;
mantissa = 0;
}
}
*dest = (sign | (unsigned short)((exponent << 10) | mantissa));
}
}

View File

@ -1,5 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" InitialTargets="CheckDependencies" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<Project DefaultTargets="Build" InitialTargets="CheckDependencies" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
@ -40,7 +40,6 @@
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Label="Configuration">
<ConfigurationType>DynamicLibrary</ConfigurationType>
<PlatformToolset>v140</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="$(DebugBuild)" Label="Configuration">
@ -161,6 +160,7 @@
<ClInclude Include="API\CNTKLibraryC.h" />
<ClInclude Include="API\CNTKLibraryExperimental.h" />
<ClInclude Include="API\CNTKLibraryInternals.h" />
<ClInclude Include="API\HalfConverter.hpp" />
<ClInclude Include="BackCompat.h" />
<ClInclude Include="BlockFunction.h" />
<ClInclude Include="CompositeFunction.h" />

View File

@ -177,6 +177,9 @@
<ClInclude Include="proto\onnx\core\utils.h">
<Filter>proto\onnx\core</Filter>
</ClInclude>
<ClInclude Include="API\HalfConverter.hpp">
<Filter>API</Filter>
</ClInclude>
<ClInclude Include="proto\onnx\core\record.h">
<Filter>proto\onnx\core</Filter>
</ClInclude>
@ -249,4 +252,4 @@
<Filter>proto\onnx\protobuf</Filter>
</Proto>
</ItemGroup>
</Project>
</Project>

View File

@ -966,7 +966,7 @@ namespace CNTK
#endif
#ifdef _CUDA_PATH_
int cudaVersion = 0;
if (cudaDriverGetVersion(&cudaVersion) == cudaSuccess)
if (cudaRuntimeGetVersion(&cudaVersion) == cudaSuccess)
{
int major = 0, minor = 0, patchLevel = 0;
ExtractCUDAVersion(cudaVersion, major, minor, patchLevel);

View File

@ -407,6 +407,36 @@ namespace CNTK
}
}
template <typename ElementType>
/*static*/ Microsoft::MSR::CNTK::ComputationNodeBasePtr CompositeFunction::CreateLearnableParameterFromVariable(const Variable& variable, Microsoft::MSR::CNTK::ComputationNetworkBuilder<ElementType>& builder, const NDShape& shape, const std::wstring& name)
{
switch (variable.GetDataType())
{
case DataType::Float:
return builder.template TypedCreateLearnableParameter<float>(name, AsTensorShape(shape));
case DataType::Double:
return builder.template TypedCreateLearnableParameter<double>(name, AsTensorShape(shape));
case DataType::Float16:
return builder.template TypedCreateLearnableParameter<half>(name, AsTensorShape(shape));
default:
return builder.CreateLearnableParameter(name, AsTensorShape(shape));
}
}
/*static*/ void CompositeFunction::CastAssignNodeValue(ComputationNodeBasePtr node, DataType dataType, std::shared_ptr<const MatrixBase> matrix)
{
switch (dataType)
{
case DataType::Float:
return (dynamic_cast<ComputationNode<float>*>(&*node))->Value().CastAssignValuesOf(*matrix);
case DataType::Double:
return (dynamic_cast<ComputationNode<double>*>(&*node))->Value().CastAssignValuesOf(*matrix);
case DataType::Float16:
return (dynamic_cast<ComputationNode<half>*>(&*node))->Value().CastAssignValuesOf(*matrix);
default:
LogicError("Unsupported data type");
}
}
// Recursively create a sub-network of ComputationNode instances corresponding to the graph of Functions
// underlying the specified 'variable' and return the ComputationNode instance that corresponds to the
@ -441,33 +471,77 @@ namespace CNTK
// Lets add a null entry in the map for this variable, to break infinite recursion when processing recurrent graphs
variableToNodeMap[variable] = nullptr;
std::shared_ptr<ComputationNode<ElementType>> computationNodePtr;
std::shared_ptr<ComputationNodeBase> computationNodePtr;
auto internalNodeName = CNTKInternalNodeNameFromUidAndName(variable.Uid(), variable.Name(), useMangledNamesForComputationNodes);
if (variable.IsParameter() || variable.IsConstant())
{
if (variable.Shape().HasInferredDimension())
InvalidArgument("Parameter or Constant '%S' with unresolved shape %S found when compiling the Function graph.", variable.AsString().c_str(), variable.Shape().AsString().c_str());
computationNodePtr = builder.CreateLearnableParameter(internalNodeName, AsTensorShape(variable.Shape()));
computationNodePtr = CreateLearnableParameterFromVariable(variable, builder, variable.Shape(), internalNodeName);
network->InitLearnableParameters(computationNodePtr, L"fixedValue", 0); // must call this to follow protocol; can overwrite later
if (!variable.NeedsGradient() || (inputsToExcludeGradientsFor.find(variable) != inputsToExcludeGradientsFor.end()))
computationNodePtr->SetLearningRateMultiplier(0.0);
NDArrayViewPtr value = variable.IsConstant() ? Constant(variable).Value() : Parameter(variable).Value();
std::shared_ptr<const Matrix<ElementType>> valueMatrix = variable.IsConstant() ? value->GetMatrix<ElementType>() : value->GetWritableMatrix<ElementType>();
std::shared_ptr<const MatrixBase> valueMatrix = variable.IsConstant() ? value->GetMatrixBase() : value->GetWritableMatrixBase();
if (variable.IsParameter() || (valueMatrix->GetDeviceId() == network->GetDeviceId()))
computationNodePtr->Value() = valueMatrix->AsReference();
{
// shallow copy from parameter value to computation node value to link them together
switch (variable.GetDataType())
{
case DataType::Float:
std::dynamic_pointer_cast<ComputationNode<float>>(computationNodePtr)->Value() = std::dynamic_pointer_cast<const Matrix<float>>(valueMatrix)->AsReference();
break;
case DataType::Double:
std::dynamic_pointer_cast<ComputationNode<double>>(computationNodePtr)->Value() = std::dynamic_pointer_cast<const Matrix<double>>(valueMatrix)->AsReference();
break;
case DataType::Float16:
std::dynamic_pointer_cast<ComputationNode<half>>(computationNodePtr)->Value() = std::dynamic_pointer_cast<const Matrix<half>>(valueMatrix)->AsReference();
break;
default:
LogicError("Unsupported data type");
}
}
else // Constant: if initialized data lives on wrong device, make a copy to the right one (copy is OK since it's constant)
{
// TODO: the following two lines are a workaround for a bug in the Math library
// (AssignValuesOf throws when source and destination matrices reside on different GPU devices).
// Once this bug is fixed, change to
// Matrix<ElementType> clonedMatrix(valueMatrix->GetNumRows(), valueMatrix->GetNumCols(), network->GetDeviceId(), valueMatrix->GetMatrixType(), valueMatrix->GetFormat());
Matrix<ElementType> clonedMatrix(network->GetDeviceId());
clonedMatrix.SwitchToMatrixType(valueMatrix->GetMatrixType(), valueMatrix->GetFormat(), false);
clonedMatrix.AssignValuesOf(*valueMatrix);
computationNodePtr->Value() = std::move(clonedMatrix);
switch (variable.GetDataType())
{
case DataType::Float:
{
Matrix<float>& nodeValue = dynamic_cast<ComputationNode<float>*>(&*computationNodePtr)->Value();
Matrix<float> clonedMatrix(nodeValue.GetNumRows(), nodeValue.GetNumCols(), valueMatrix->GetDeviceId(), nodeValue.GetMatrixType(), nodeValue.GetFormat());
clonedMatrix.CastAssignValuesOf(*valueMatrix);
clonedMatrix.TransferToDeviceIfNotThere(network->GetDeviceId(), true);
nodeValue = std::move(clonedMatrix);
break;
}
case DataType::Double:
{
Matrix<double>& nodeValue = dynamic_cast<ComputationNode<double>*>(&*computationNodePtr)->Value();
Matrix<double> clonedMatrix(nodeValue.GetNumRows(), nodeValue.GetNumCols(), valueMatrix->GetDeviceId(), nodeValue.GetMatrixType(), nodeValue.GetFormat());
clonedMatrix.CastAssignValuesOf(*valueMatrix);
clonedMatrix.TransferToDeviceIfNotThere(network->GetDeviceId(), true);
nodeValue = std::move(clonedMatrix);
break;
}
case DataType::Float16:
{
Matrix<half>& nodeValue = dynamic_cast<ComputationNode<half>*>(&*computationNodePtr)->Value();
Matrix<half> clonedMatrix(nodeValue.GetNumRows(), nodeValue.GetNumCols(), valueMatrix->GetDeviceId(), nodeValue.GetMatrixType(), nodeValue.GetFormat());
clonedMatrix.CastAssignValuesOf(*valueMatrix);
clonedMatrix.TransferToDeviceIfNotThere(network->GetDeviceId(), true);
nodeValue = std::move(clonedMatrix);
break;
}
default:
LogicError("Unsupported data type");
}
}
}
else if (variable.IsInput())
@ -515,7 +589,7 @@ namespace CNTK
}
else
{
computationNodePtr = builder.CreateLearnableParameter(internalNodeName, AsTensorShape(fullyDefinedArgumentVar.Shape()));
computationNodePtr = CreateLearnableParameterFromVariable(variable, builder, fullyDefinedArgumentVar.Shape(), internalNodeName);
network->InitLearnableParameters(computationNodePtr, L"fixedValue", 0); // must call this to follow protocol; can overwrite later
if (!variable.NeedsGradient() || (inputsToExcludeGradientsFor.find(variable) != inputsToExcludeGradientsFor.end()))
computationNodePtr->SetLearningRateMultiplier(0.0);
@ -538,7 +612,7 @@ namespace CNTK
// Can be null in case of loops with f.output == f.input.
// Such loops cannot be handled, so we leave nullptr as computational node.
if (outputVariableNode)
computationNodePtr = outputVariableNode->template As<ComputationNode<ElementType>>()->shared_from_this();
computationNodePtr = outputVariableNode->template As<ComputationNodeBase>()->shared_from_this();
else
computationNodePtr = nullptr;
}
@ -589,10 +663,10 @@ namespace CNTK
return mappingVariable;
}
template <typename ElementType>
template<typename ElementType>
/*static*/ ComputationNodeBasePtr CompositeFunction::CreateComputationNode(const Variable& variable,
Function* function,
const std::vector<std::shared_ptr<ComputationNode<ElementType>>>& inputNodes,
const std::vector<std::shared_ptr<ComputationNodeBase>>& inputNodes,
Microsoft::MSR::CNTK::ComputationNetworkPtr& network,
std::unordered_map<Variable, ComputationNodeBasePtr>& variableToNodeMap,
bool useMangledNamesForComputationNodes)
@ -609,6 +683,40 @@ namespace CNTK
for (auto inputNode : inputNodes)
inputNodesBasePtrs.push_back(inputNode);
// take the dataType from the first input, if not specified (i.e. placeholder) then use default
// node like BatchNormalization may have inputs with different precision,
// and that validation is done in specific node constructor
DataType inputNodeType = AsDataType<ElementType>();
if (inputNodes.size() > 0)
{
if (std::dynamic_pointer_cast<ComputationNode<float>, ComputationNodeBase>(inputNodes[0]))
inputNodeType = DataType::Float;
else if (std::dynamic_pointer_cast<ComputationNode<double>, ComputationNodeBase>(inputNodes[0]))
inputNodeType = DataType::Double;
else if (std::dynamic_pointer_cast<ComputationNode<half>, ComputationNodeBase>(inputNodes[0]))
inputNodeType = DataType::Float16;
}
#define ASSIGN_NEW_NODE(nodeClass, ...) \
do { \
if (inputNodeType == DataType::Float) \
computationNodePtr = New<nodeClass<float>>(__VA_ARGS__); \
else if (inputNodeType == DataType::Double) \
computationNodePtr = New<nodeClass<double>>(__VA_ARGS__); \
else if (inputNodeType == DataType::Float16) \
computationNodePtr = New<nodeClass<half>>(__VA_ARGS__); \
} while(0)
#define ASSIGN_NEW_NODE2(nodeClass, dtype, ...) \
do { \
if (inputNodeType == DataType::Float) \
computationNodePtr = New<nodeClass<dtype, float>>(__VA_ARGS__); \
else if (inputNodeType == DataType::Double) \
computationNodePtr = New<nodeClass<dtype, double>>(__VA_ARGS__); \
else if (inputNodeType == DataType::Float16) \
computationNodePtr = New<nodeClass<dtype, half>>(__VA_ARGS__); \
} while(0)
auto outputs = function->RawOutputs();
if (variable == outputs[0])
{
@ -621,76 +729,76 @@ namespace CNTK
switch (op)
{
case PrimitiveOpType::Negate:
computationNodePtr = New<NegateNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(NegateNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Sigmoid:
computationNodePtr = New<SigmoidNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(SigmoidNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Atanh:
computationNodePtr = New<AtanhNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(AtanhNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Tanh:
computationNodePtr = New<TanhNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(TanhNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Acos:
computationNodePtr = New<AcosNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(AcosNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Cos:
computationNodePtr = New<CosineNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(CosineNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Asin:
computationNodePtr = New<AsinNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(AsinNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Sin:
computationNodePtr = New<SinNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(SinNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Cosh:
computationNodePtr = New<CoshNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(CoshNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Asinh:
computationNodePtr = New<AsinhNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(AsinhNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Sinh:
computationNodePtr = New<SinhNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(SinhNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::ReLU:
computationNodePtr = New<RectifiedLinearNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(RectifiedLinearNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Exp:
computationNodePtr = New<ExpNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(ExpNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Log:
computationNodePtr = New<LogNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(LogNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Sqrt:
computationNodePtr = New<SqrtNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(SqrtNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::ELU:
computationNodePtr = New<ExponentialLinearUnitNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(ExponentialLinearUnitNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Floor:
computationNodePtr = New<FloorNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(FloorNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Abs:
computationNodePtr = New<AbsNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(AbsNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Reciprocal:
computationNodePtr = New<ReciprocalNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(ReciprocalNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Softmax:
computationNodePtr = New<SoftmaxNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(SoftmaxNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Hardmax:
computationNodePtr = New<HardmaxNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(HardmaxNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::TopK:
{
auto k = functionConfig[PrimitiveFunction::AttributeNameNumItems].Value<size_t>();
computationNodePtr = New<TopKNode<ElementType>>(network->GetDeviceId(), internalNodeName, k);
ASSIGN_NEW_NODE(TopKNode, network->GetDeviceId(), internalNodeName, k);
break;
}
case PrimitiveOpType::StableSigmoid:
computationNodePtr = New<StableSigmoidNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(StableSigmoidNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::TransposeAxes:
{
@ -699,7 +807,7 @@ namespace CNTK
auto perm = AsVector<Axis>(functionConfig[PrimitiveFunction::AttributeNameAxisVec].Value<std::vector<DictionaryValue>>());
for (auto& p : perm)
p = NormalizeStaticAxis(p, perm.size());
computationNodePtr = New<TransposeDimensionsNode<ElementType>>(network->GetDeviceId(), internalNodeName, AsCNTKInternalAxisIdx(perm));
ASSIGN_NEW_NODE(TransposeDimensionsNode, network->GetDeviceId(), internalNodeName, AsCNTKInternalAxisIdx(perm));
}
else
{
@ -707,7 +815,7 @@ namespace CNTK
auto axis2 = functionConfig[PrimitiveFunction::AttributeNameAxis2].Value<Axis>();
// The axis ids passed to the internal CNTK TransposeDimensionsNode are 1 based instead of 0 based
computationNodePtr = New<TransposeDimensionsNode<ElementType>>(network->GetDeviceId(), internalNodeName, AsCNTKInternalAxisIdx(axis1), AsCNTKInternalAxisIdx(axis2));
ASSIGN_NEW_NODE(TransposeDimensionsNode, network->GetDeviceId(), internalNodeName, AsCNTKInternalAxisIdx(axis1), AsCNTKInternalAxisIdx(axis2));
}
break;
}
@ -715,24 +823,24 @@ namespace CNTK
{
auto dynamicAxes = variable.DynamicAxes();
auto internalCNTKWhereNodeDynamicAxisName = InternalDynamicAxisNameFromDynamicAxes(dynamicAxes);
computationNodePtr = New<WhereNode<ElementType>>(network->GetDeviceId(), internalNodeName, internalCNTKWhereNodeDynamicAxisName);
ASSIGN_NEW_NODE(WhereNode, network->GetDeviceId(), internalNodeName, internalCNTKWhereNodeDynamicAxisName);
break;
}
case PrimitiveOpType::ToSequence:
{
auto dynamicAxes = variable.DynamicAxes();
auto internalCNTKDynamicAxisName = InternalDynamicAxisNameFromDynamicAxes(dynamicAxes);
computationNodePtr = New<ToSequenceNode<ElementType>>(network->GetDeviceId(), internalNodeName, internalCNTKDynamicAxisName);
ASSIGN_NEW_NODE(ToSequenceNode, network->GetDeviceId(), internalNodeName, internalCNTKDynamicAxisName);
break;
}
case PrimitiveOpType::ToSequenceLike:
computationNodePtr = New<ToSequenceLikeNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(ToSequenceLikeNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::UnpackSequence:
{
auto paddingValue = functionConfig[PrimitiveFunction::AttributeNameSequenceUnpackPaddingValue].Value<double>();
auto suppressMaskOutput = functionConfig[PrimitiveFunction::AttributeNameSequenceUnpackSuppressMaskOutput].Value<bool>();
computationNodePtr = New<UnpackSequenceNode<ElementType>>(network->GetDeviceId(), internalNodeName, (ElementType)paddingValue, suppressMaskOutput);
ASSIGN_NEW_NODE(UnpackSequenceNode, network->GetDeviceId(), internalNodeName, paddingValue, suppressMaskOutput);
break;
}
case PrimitiveOpType::Slice:
@ -768,28 +876,28 @@ namespace CNTK
RuntimeError("Failed to create computation node: Slice operation with inconsistent attributes");
}
// Internal CNTK SliceNode takes 1 based axis indices instead of 0 based
computationNodePtr = New<SliceNode<ElementType>>(network->GetDeviceId(), internalNodeName, beginIndex, endIndex, AsCNTKInternalAxisIdx(axis), strides);
ASSIGN_NEW_NODE(SliceNode, network->GetDeviceId(), internalNodeName, beginIndex, endIndex, AsCNTKInternalAxisIdx(axis), strides);
break;
}
case PrimitiveOpType::RandomSample:
{
auto numSamples = functionConfig[PrimitiveFunction::AttributeNameNumSamples].Value<size_t>();
auto allowDuplicates = functionConfig[PrimitiveFunction::AttributeNameAllowDuplicates].Value<bool>();
computationNodePtr = New<RandomSampleNode<ElementType>>(network->GetDeviceId(), internalNodeName, numSamples, allowDuplicates);
ASSIGN_NEW_NODE(RandomSampleNode, network->GetDeviceId(), internalNodeName, numSamples, allowDuplicates);
break;
}
case PrimitiveOpType::RandomSampleInclusionFrequency:
{
auto numSamples = functionConfig[PrimitiveFunction::AttributeNameNumSamples].Value<size_t>();
auto allowDuplicates = functionConfig[PrimitiveFunction::AttributeNameAllowDuplicates].Value<bool>();
computationNodePtr = New<RandomSampleInclusionFrequencyNode<ElementType>>(network->GetDeviceId(), internalNodeName, numSamples, allowDuplicates);
ASSIGN_NEW_NODE(RandomSampleInclusionFrequencyNode, network->GetDeviceId(), internalNodeName, numSamples, allowDuplicates);
break;
}
case PrimitiveOpType::Dropout:
{
auto dropoutRate = functionConfig[PrimitiveFunction::AttributeNameDropoutRate].Value<double>();
computationNodePtr = New<DropoutNode<ElementType>>(network->GetDeviceId(), internalNodeName);
computationNodePtr->As<DropoutNode<ElementType>>()->SetDropoutRate(dropoutRate);
ASSIGN_NEW_NODE(DropoutNode, network->GetDeviceId(), internalNodeName);
SMART_NODE_INVOKE(DropoutNode, computationNodePtr, SetDropoutRate, dropoutRate);
break;
}
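SMART_NODE_INVOKE is not defined in this hunk; from its call sites it presumably downcasts the node to the matching element type and forwards the member call. A hedged sketch of such a helper, purely illustrative and not the macro this commit actually defines:

    // Illustrative only: dispatch a member call across the three element types.
    // The real SMART_NODE_INVOKE is defined elsewhere in this commit and may differ.
    #define SKETCH_NODE_INVOKE(nodeClass, nodePtr, func, ...)                         \
        do {                                                                          \
            if (auto fp = std::dynamic_pointer_cast<nodeClass<float>>(nodePtr))       \
                fp->func(__VA_ARGS__);                                                \
            else if (auto dp = std::dynamic_pointer_cast<nodeClass<double>>(nodePtr)) \
                dp->func(__VA_ARGS__);                                                \
            else if (auto hp = std::dynamic_pointer_cast<nodeClass<half>>(nodePtr))   \
                hp->func(__VA_ARGS__);                                                \
        } while (0)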
case PrimitiveOpType::RandomDistribution:
@ -805,11 +913,11 @@ namespace CNTK
if (functionConfig.Contains(PrimitiveFunction::AttributeNameNewShape))
{
auto shape = functionConfig[PrimitiveFunction::AttributeNameNewShape].Value<NDShape>();
computationNodePtr = New<RandomDistributionNode<ElementType>>(network->GetDeviceId(), internalNodeName, rvtype, randomDistributionArgs, AsTensorShape(shape));
ASSIGN_NEW_NODE(RandomDistributionNode, network->GetDeviceId(), internalNodeName, rvtype, randomDistributionArgs, AsTensorShape(shape));
}
else
computationNodePtr = New<RandomDistributionNode<ElementType>>(network->GetDeviceId(), internalNodeName, rvtype, randomDistributionArgs);
computationNodePtr->As<RandomDistributionNode<ElementType>>()->SetRngState(seed, offset);
ASSIGN_NEW_NODE(RandomDistributionNode, network->GetDeviceId(), internalNodeName, rvtype, randomDistributionArgs);
SMART_NODE_INVOKE(RandomDistributionNode, computationNodePtr, SetRngState, seed, offset);
break;
}
case PrimitiveOpType::Reshape:
@ -829,7 +937,7 @@ namespace CNTK
replacementShape[i] = 0;
}
computationNodePtr = New<ReshapeNode<ElementType>>(network->GetDeviceId(), internalNodeName, AsTensorShape(replacementShape), AsCNTKInternalAxisIdx(beginAxis), AsCNTKInternalAxisIdx(endAxis));
ASSIGN_NEW_NODE(ReshapeNode, network->GetDeviceId(), internalNodeName, AsTensorShape(replacementShape), AsCNTKInternalAxisIdx(beginAxis), AsCNTKInternalAxisIdx(endAxis));
break;
}
case PrimitiveOpType::Squeeze:
@ -853,7 +961,7 @@ namespace CNTK
PoolingType poolingType = (PoolingType)(functionConfig[PrimitiveFunction::AttributeNamePoolingType].Value<size_t>());
auto roiOutputShape = functionConfig[PrimitiveFunction::AttributeNameROIOutputShape].Value<NDShape>();
auto spatialScale = functionConfig[PrimitiveFunction::AttributeNameSpatialScale].Value<double>();
computationNodePtr = New<ROIPoolingNode<ElementType>>(network->GetDeviceId(), internalNodeName, AsCNTKPoolKind(poolingType), AsTensorShape(roiOutputShape), spatialScale);
ASSIGN_NEW_NODE(ROIPoolingNode, network->GetDeviceId(), internalNodeName, AsCNTKPoolKind(poolingType), AsTensorShape(roiOutputShape), spatialScale);
break;
}
case PrimitiveOpType::Pooling:
@ -874,7 +982,7 @@ namespace CNTK
{
includePad = functionConfig[PrimitiveFunction::AttributeNameIncludePad].Value<bool>();
}
computationNodePtr = New<PoolingNode<ElementType>>(network->GetDeviceId(), internalNodeName, AsCNTKPoolKind(poolingType), AsTensorShape(poolingWindowsShape), AsTensorShape(strides), autoPadding, AsTensorShape(lowerPad), AsTensorShape(upperPad), ceilOutDim, includePad, ImageLayoutKind::CHW);
ASSIGN_NEW_NODE(PoolingNode, network->GetDeviceId(), internalNodeName, AsCNTKPoolKind(poolingType), AsTensorShape(poolingWindowsShape), AsTensorShape(strides), autoPadding, AsTensorShape(lowerPad), AsTensorShape(upperPad), ceilOutDim, includePad, ImageLayoutKind::CHW);
break;
}
case PrimitiveOpType::Unpooling:
@ -885,77 +993,77 @@ namespace CNTK
auto upperPad = functionConfig[PrimitiveFunction::AttributeNameUpperPad].Value<NDShape>();
auto autoPadding = AsVector<bool>(functionConfig[PrimitiveFunction::AttributeNameAutoPadding].Value<std::vector<DictionaryValue>>());
//We only get here after validation so it is safe to assume unpooling is max
computationNodePtr = New<MaxUnpoolingNode<ElementType>>(network->GetDeviceId(), internalNodeName, AsTensorShape(unpoolingWindowShape), AsTensorShape(strides), autoPadding, AsTensorShape(lowerPad), AsTensorShape(upperPad), ImageLayoutKind::CHW);
ASSIGN_NEW_NODE(MaxUnpoolingNode, network->GetDeviceId(), internalNodeName, AsTensorShape(unpoolingWindowShape), AsTensorShape(strides), autoPadding, AsTensorShape(lowerPad), AsTensorShape(upperPad), ImageLayoutKind::CHW);
break;
}
case PrimitiveOpType::SumAll:
computationNodePtr = New<SumElementsNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(SumElementsNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::OneHot:
{
auto numClass = functionConfig[PrimitiveFunction::AttributeNameNumClass].Value<size_t>();
auto is_sparse = functionConfig[PrimitiveFunction::AttributeNameOneHotOutputSparse].Value<bool>();
auto axis = functionConfig[PrimitiveFunction::AttributeNameOneHotAxis].Value<Axis>();
computationNodePtr = New<OneHotNode<ElementType>>(network->GetDeviceId(), numClass, is_sparse, axis.StaticAxisIndex(), internalNodeName);
ASSIGN_NEW_NODE(OneHotNode, network->GetDeviceId(), numClass, is_sparse, axis.StaticAxisIndex(), internalNodeName);
break;
}
case PrimitiveOpType::Gather:
computationNodePtr = New<GatherNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(GatherNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::ToBatch:
{
computationNodePtr = New<ToBatchAxisNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(ToBatchAxisNode, network->GetDeviceId(), internalNodeName);
break;
}
case PrimitiveOpType::UnpackBatch:
{
computationNodePtr = New<UnpackBatchAxisNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(UnpackBatchAxisNode, network->GetDeviceId(), internalNodeName);
break;
}
case PrimitiveOpType::Plus:
computationNodePtr = New<PlusNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(PlusNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::LogPlus:
computationNodePtr = New<LogPlusNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(LogPlusNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Pow:
computationNodePtr = New<PowNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(PowNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Minus:
computationNodePtr = New<MinusNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(MinusNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::ElementTimes:
computationNodePtr = New<ElementTimesNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(ElementTimesNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Equal:
computationNodePtr = New<EqualNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(EqualNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::NotEqual:
computationNodePtr = New<NotEqualNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(NotEqualNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Less:
computationNodePtr = New<LessNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(LessNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::LessEqual:
computationNodePtr = New<LessEqualNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(LessEqualNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Greater:
computationNodePtr = New<GreaterNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(GreaterNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::GreaterEqual:
computationNodePtr = New<GreaterEqualNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(GreaterEqualNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Times:
{
size_t outputRank = functionConfig[PrimitiveFunction::AttributeNameOutputRank].Value<size_t>();
auto inferInputRankToMap = functionConfig[PrimitiveFunction::AttributeNameInferInputRankToMap].Value<int>();
computationNodePtr = New<TimesNode<ElementType>>(network->GetDeviceId(), internalNodeName, outputRank, inferInputRankToMap);
ASSIGN_NEW_NODE(TimesNode, network->GetDeviceId(), internalNodeName, outputRank, inferInputRankToMap);
break;
}
case PrimitiveOpType::TransposeTimes:
{
size_t outputRank = functionConfig[PrimitiveFunction::AttributeNameOutputRank].Value<size_t>();
computationNodePtr = New<TransposeTimesNode<ElementType>>(network->GetDeviceId(), internalNodeName, outputRank);
ASSIGN_NEW_NODE(TransposeTimesNode, network->GetDeviceId(), internalNodeName, outputRank);
break;
}
case PrimitiveOpType::Convolution:
@ -975,7 +1083,7 @@ namespace CNTK
if (functionConfig.Contains(PrimitiveFunction::AttributeNameOutputShape))
outputShape = functionConfig[PrimitiveFunction::AttributeNameOutputShape].Value<NDShape>();
auto maxTempMemSizeInSamples = functionConfig[PrimitiveFunction::AttributeNameMaxTempMemSizeInSamples].Value<size_t>();
computationNodePtr = New<ConvolutionNode<ElementType>>(network->GetDeviceId(), internalNodeName,
ASSIGN_NEW_NODE(ConvolutionNode, network->GetDeviceId(), internalNodeName,
AsTensorShape(kernelShape), AsTensorShape(outputMapCount), AsTensorShape(strides),
sharing, autoPadding, AsTensorShape(lowerPad), AsTensorShape(upperPad), transpose,
outputShape.IsUnknown() ? TensorShape(0) : AsTensorShape(outputShape),
@ -983,22 +1091,22 @@ namespace CNTK
break;
}
case PrimitiveOpType::CosDistance:
computationNodePtr = New<CosDistanceNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(CosDistanceNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::CosDistanceWithNegativeSamples:
computationNodePtr = New<CosDistanceWithNegativeSamplesNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(CosDistanceWithNegativeSamplesNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Logistic:
computationNodePtr = New<LogisticNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(LogisticNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::SquaredError:
computationNodePtr = New<SquareErrorNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(SquareErrorNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::CrossEntropyWithSoftmax:
computationNodePtr = New<CrossEntropyWithSoftmaxNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(CrossEntropyWithSoftmaxNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::ClassificationError:
computationNodePtr = New<ClassificationErrorNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(ClassificationErrorNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::EditDistanceError:
{
@ -1007,7 +1115,7 @@ namespace CNTK
auto insPen = functionConfig[PrimitiveFunction::AttributeNameInsertionPenalty].Value<float>();
auto squashInputs = functionConfig[PrimitiveFunction::AttributeNameSquashInputs].Value<bool>();
auto tokensToIgnore = AsVector<size_t>(functionConfig[PrimitiveFunction::AttributeNameTokensToIgnore].Value<std::vector<DictionaryValue>>());
computationNodePtr = New<EditDistanceErrorNode<ElementType>>(network->GetDeviceId(), internalNodeName, subPen, delPen, insPen, squashInputs, tokensToIgnore);
ASSIGN_NEW_NODE(EditDistanceErrorNode, network->GetDeviceId(), internalNodeName, subPen, delPen, insPen, squashInputs, tokensToIgnore);
break;
}
case PrimitiveOpType::LatticeSequenceWithSoftmax:
@ -1033,14 +1141,14 @@ namespace CNTK
{
auto delayContraint = functionConfig[PrimitiveFunction::AttributeNameDelayConstraint].Value<int>();
auto blankTokenId = functionConfig[PrimitiveFunction::AttributeNameBlankTokenId].Value<size_t>();
computationNodePtr = New<ForwardBackwardNode<ElementType>>(network->GetDeviceId(), internalNodeName, blankTokenId, delayContraint);
ASSIGN_NEW_NODE(ForwardBackwardNode, network->GetDeviceId(), internalNodeName, blankTokenId, delayContraint);
break;
}
case PrimitiveOpType::LambdaRank:
computationNodePtr = New<LambdaRankNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(LambdaRankNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::NDCG:
computationNodePtr = New<NDCG1EvalNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(NDCG1EvalNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::PastValue:
case PrimitiveOpType::FutureValue:
@ -1050,9 +1158,9 @@ namespace CNTK
size_t offset = primitiveFunction->Attributes()[PrimitiveFunction::AttributeNameOffset].Value<size_t>();
if (op == PrimitiveOpType::PastValue)
computationNodePtr = New<PastValueNode<ElementType>>(network->GetDeviceId(), internalNodeName, AsTensorShape(inputOperandVar.Shape()), offset);
ASSIGN_NEW_NODE(PastValueNode, network->GetDeviceId(), internalNodeName, AsTensorShape(inputOperandVar.Shape()), offset);
else
computationNodePtr = New<FutureValueNode<ElementType>>(network->GetDeviceId(), internalNodeName, AsTensorShape(inputOperandVar.Shape()), offset);
ASSIGN_NEW_NODE(FutureValueNode, network->GetDeviceId(), internalNodeName, AsTensorShape(inputOperandVar.Shape()), offset);
break;
}
@ -1080,7 +1188,7 @@ namespace CNTK
);
}
computationNodePtr = New<ReduceElementsNode<ElementType>>(network->GetDeviceId(), internalNodeName, reductionOpName, AsCNTKInternalAxisIdx(reductionAxis), keepDimensions);
ASSIGN_NEW_NODE(ReduceElementsNode, network->GetDeviceId(), internalNodeName, reductionOpName, AsCNTKInternalAxisIdx(reductionAxis), keepDimensions);
break;
}
case PrimitiveOpType::BatchNormalization:
@ -1097,7 +1205,7 @@ namespace CNTK
disableRegularization = functionConfig[PrimitiveFunction::AttributeNameDisableRegularization].Value<bool>();
}
computationNodePtr = New<BatchNormalizationNode<ElementType>>(network->GetDeviceId(), internalNodeName, spatial, normalizationTimeConstant, blendTimeConstant, epsilon, !useCuDNNEngine, disableRegularization, ImageLayoutKind::CHW);
ASSIGN_NEW_NODE(BatchNormalizationNode, network->GetDeviceId(), internalNodeName, spatial, normalizationTimeConstant, blendTimeConstant, epsilon, !useCuDNNEngine, disableRegularization, ImageLayoutKind::CHW);
break;
}
case PrimitiveOpType::Combine:
@ -1106,24 +1214,24 @@ namespace CNTK
computationNodePtr = variableToNodeMap[variable];
break;
case PrimitiveOpType::PackedIndex:
computationNodePtr = New<PackedIndexNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(PackedIndexNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::GatherPacked:
computationNodePtr = New<GatherPackedNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(GatherPackedNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::ScatterPacked:
computationNodePtr = New<ScatterPackedNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(ScatterPackedNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Clip:
computationNodePtr = New<ClipNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(ClipNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Select:
computationNodePtr = New<IfNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(IfNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Splice:
{
Axis spliceAxis = functionConfig[PrimitiveFunction::AttributeNameAxis].Value<Axis>();
computationNodePtr = New<RowStackNode<ElementType>>(network->GetDeviceId(), internalNodeName, AsCNTKInternalAxisIdx(spliceAxis));
ASSIGN_NEW_NODE(RowStackNode, network->GetDeviceId(), internalNodeName, AsCNTKInternalAxisIdx(spliceAxis));
break;
}
case PrimitiveOpType::Pad:
@ -1132,7 +1240,7 @@ namespace CNTK
auto foot = AsVector<size_t>(functionConfig[PrimitiveFunction::AttributeNamePaddingFoot].Value<std::vector<DictionaryValue>>());
auto mode = functionConfig[PrimitiveFunction::AttributeNamePaddingMode].Value<size_t>();
auto constantValue = functionConfig[PrimitiveFunction::AttributeNamePaddingConstantValue].Value<double>();
computationNodePtr = New<PaddingNode<ElementType>>(network->GetDeviceId(), internalNodeName, head, foot, (PaddingType)mode, (ElementType)constantValue);
ASSIGN_NEW_NODE(PaddingNode, network->GetDeviceId(), internalNodeName, head, foot, (PaddingType)mode, constantValue);
break;
}
case PrimitiveOpType::OptimizedRNNStack:
@ -1142,31 +1250,31 @@ namespace CNTK
auto hiddenSize = functionConfig[PrimitiveFunction::AttributeNameHiddenSize].Value<size_t>();
auto recurrentOp = functionConfig[PrimitiveFunction::AttributeNameRecurrentOp].Value<std::wstring>();
computationNodePtr = New<OptimizedRNNStackNode<ElementType>>(network->GetDeviceId(), internalNodeName, bidirectional, numLayers, hiddenSize, recurrentOp);
ASSIGN_NEW_NODE(OptimizedRNNStackNode, network->GetDeviceId(), internalNodeName, bidirectional, numLayers, hiddenSize, recurrentOp);
break;
}
case PrimitiveOpType::ReconcileDynamicAxis:
{
computationNodePtr = New<ReconcileDynamicAxisNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(ReconcileDynamicAxisNode, network->GetDeviceId(), internalNodeName);
break;
}
case PrimitiveOpType::LogSoftmax:
{
//This can be implemented as x => x - ReduceLogSum(x). How to do this here?
computationNodePtr = New<LogSoftmaxNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(LogSoftmaxNode, network->GetDeviceId(), internalNodeName);
break;
}
case PrimitiveOpType::Pass:
computationNodePtr = New<PassNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(PassNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::LabelsToGraph:
computationNodePtr = New<LabelsToGraphNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(LabelsToGraphNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::StopGradient:
computationNodePtr = New<StopGradientNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(StopGradientNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Assign:
computationNodePtr = New<AssignNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(AssignNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Crop:
if (functionInputs.size() == 2)
@ -1179,24 +1287,41 @@ namespace CNTK
{
CNTK::LogicError("Vector of crop offsets must have size 2.");
}
computationNodePtr = New<CropNode<ElementType>>(offsets[0], offsets[1], network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(CropNode, offsets[0], offsets[1], network->GetDeviceId(), internalNodeName);
}
else
{
// Crop with two inputs and automatic offset computation.
computationNodePtr = New<CropNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(CropNode, network->GetDeviceId(), internalNodeName);
}
}
else if (functionInputs.size() == 4)
{
// Crop with four inputs and automatic offset computation.
computationNodePtr = New<CropNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(CropNode, network->GetDeviceId(), internalNodeName);
}
else
{
CNTK::LogicError("Crop node must have 2 or 4 node inputs.");
}
break;
case PrimitiveOpType::Cast:
{
DataType outputType = (DataType)functionConfig[PrimitiveFunction::AttributeNameNewDataType].Value<int>();
switch (outputType)
{
case DataType::Float:
ASSIGN_NEW_NODE2(CastNode, float, network->GetDeviceId(), internalNodeName);
break;
case DataType::Double:
ASSIGN_NEW_NODE2(CastNode, double, network->GetDeviceId(), internalNodeName);
break;
case DataType::Float16:
ASSIGN_NEW_NODE2(CastNode, half, network->GetDeviceId(), internalNodeName);
break;
}
break;
}
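For the Cast case the two-parameter helper matters: the first template argument (the output element type) comes from the stored NewDataType attribute, while the input element type is still chosen from inputNodeType. For example, ASSIGN_NEW_NODE2(CastNode, half, ...) expands to roughly the following (a sketch of the macro expansion; the CastNode template-parameter order is inferred from the macro definition above):

    if (inputNodeType == DataType::Float)
        computationNodePtr = New<CastNode<half, float>>(network->GetDeviceId(), internalNodeName);
    else if (inputNodeType == DataType::Double)
        computationNodePtr = New<CastNode<half, double>>(network->GetDeviceId(), internalNodeName);
    else if (inputNodeType == DataType::Float16)
        computationNodePtr = New<CastNode<half, half>>(network->GetDeviceId(), internalNodeName);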
default:
CNTK::LogicError("Specified op %S not yet supported", PrimitiveOpTypeName(op).c_str());
break;
@ -1224,7 +1349,7 @@ namespace CNTK
}
else
{
computationNodePtr = New<UserDefinedV2FunctionNode<ElementType>>(network->GetDeviceId(), internalNodeName, function->shared_from_this());
ASSIGN_NEW_NODE(UserDefinedV2FunctionNode, network->GetDeviceId(), internalNodeName, function->shared_from_this());
// For user defined functions, we only attach unique inputs in the internal computation network since, the UDF
// backward implementations directly compute aggregate gradient values for unique inputs
@ -1244,7 +1369,7 @@ namespace CNTK
while (outputs[i] != variable) i++;
assert(i < outputs.size());
computationNodePtr = New<OutputMultiplexerNode<ElementType>>(network->GetDeviceId(), CNTKInternalNodeNameFromUidAndName(variable.Uid(), variable.Name(), useMangledNamesForComputationNodes), i);
ASSIGN_NEW_NODE(OutputMultiplexerNode, network->GetDeviceId(), CNTKInternalNodeNameFromUidAndName(variable.Uid(), variable.Name(), useMangledNamesForComputationNodes), i);
inputNodesBasePtrs = { variableToNodeMap[outputs[0]] };
}
@ -1279,15 +1404,16 @@ namespace CNTK
}
// Create the nodes corresponding to the inputs
std::vector<std::shared_ptr<ComputationNode<ElementType>>> inputNodes;
std::vector<std::shared_ptr<ComputationNodeBase>> inputNodes;
for (auto& inputVar : functionInputs)
{
// If the inputVar is a constant and not the right DataType let's coerce it to the right type
if (inputVar.IsConstant() && (nonConstInputDataType != DataType::Unknown) && (inputVar.GetDataType() != nonConstInputDataType))
// except for FP16, where the mismatch is intentional (e.g. BatchNorm running stats need to stay FP32 even when the inputs are FP16)
if (inputVar.IsConstant() && (nonConstInputDataType != DataType::Unknown) && (nonConstInputDataType != DataType::Float16) && (inputVar.GetDataType() != nonConstInputDataType))
inputVar = Constant(inputVar).CloneAs(nonConstInputDataType);
auto baseNodePtr = GetNode(inputVar, network, builder, fullyDefinedArgumentsMap, variableToNodeMap, isVariableRootMap, inputsToExcludeGradientsFor, useMangledNamesForComputationNodes);
inputNodes.push_back((baseNodePtr != nullptr) ? baseNodePtr->template As<ComputationNode<ElementType>>()->shared_from_this() : nullptr);
inputNodes.push_back((baseNodePtr != nullptr) ? baseNodePtr : nullptr);
}
BlockFunction* blockFunction = dynamic_cast<BlockFunction*>(function);
@ -1303,7 +1429,7 @@ namespace CNTK
return GetNode(variable.BlockFunctionVariableMapping(), network, builder, fullyDefinedArgumentsMap, variableToNodeMap, isVariableRootMap, inputsToExcludeGradientsFor, useMangledNamesForComputationNodes);
}
else
computationNodePtr = CreateComputationNode(variable, function, inputNodes, network, variableToNodeMap, useMangledNamesForComputationNodes);
computationNodePtr = CreateComputationNode<ElementType>(variable, function, inputNodes, network, variableToNodeMap, useMangledNamesForComputationNodes);
PrimitiveFunction* primitiveFunction = dynamic_cast<PrimitiveFunction*>(function);
if (!primitiveFunction || (primitiveFunction->OpType() != PrimitiveOpType::Combine))
@ -1732,6 +1858,9 @@ namespace CNTK
case DataType::Double:
PopulateComputationNodeValue<double>({ argument, argumentValue }, argumentComputationNode, layoutsPopulated);
break;
case DataType::Float16:
PopulateComputationNodeValue<half>({ argument, argumentValue }, argumentComputationNode, layoutsPopulated);
break;
default:
LogicError("Function '%S' Forward: Unsupported DataType %s.", AsString().c_str(), DataTypeName(argumentValue->GetDataType()));
break;
@ -1775,6 +1904,9 @@ namespace CNTK
case DataType::Double:
PopulateComputationNodeGradient<double>(gradientVarValuePair, outputComputationNode);
break;
case DataType::Float16:
PopulateComputationNodeGradient<half>(gradientVarValuePair, outputComputationNode);
break;
default:
LogicError("Function '%S' Backward: Unsupported DataType %s.", AsString().c_str(), DataTypeName(gradientValue->GetDataType()));
break;
@ -1816,6 +1948,15 @@ namespace CNTK
nodeValue = Utils::GetValueObjectFromCNTKImplMatrixAndMBLayout<double>(var, computationNode, matrix, layout);
break;
}
case DataType::Float16:
{
auto& matrix = getGradient ? computationNode->As<ComputationNode<half>>()->Gradient() : computationNode->As<ComputationNode<half>>()->Value();
if (varValue == nullptr)
nodeValue = MakeSharedObject<PackedValue>(varShape, var.DynamicAxes(), std::make_shared<Matrix<half>>(matrix.AsReference()), layout, /*readOnly =*/ false);
else
nodeValue = Utils::GetValueObjectFromCNTKImplMatrixAndMBLayout<half>(var, computationNode, matrix, layout);
break;
}
default:
CNTK::LogicError("CompositeFunction::Forward/Backward: Unsupported DataType %s", DataTypeName(var.GetDataType()));
break;
@ -1973,6 +2114,8 @@ namespace CNTK
GetComputationNetwork<float>(computeDevice, outputsToRetainBackwardStateFor, requestedOutputVariables, inputsToExcludeGradientsFor, true);
else if (dataType == DataType::Double)
GetComputationNetwork<double>(computeDevice, outputsToRetainBackwardStateFor, requestedOutputVariables, inputsToExcludeGradientsFor, true);
else if (dataType == DataType::Float16)
GetComputationNetwork<half>(computeDevice, outputsToRetainBackwardStateFor, requestedOutputVariables, inputsToExcludeGradientsFor, true);
else
InvalidArgument("Unsupported DataType %s", DataTypeName(dataType));


@ -286,7 +286,7 @@ namespace CNTK
template <typename ElementType>
static Microsoft::MSR::CNTK::ComputationNodeBasePtr CreateComputationNode(const Variable& variable,
Function* function,
const std::vector<std::shared_ptr<Microsoft::MSR::CNTK::ComputationNode<ElementType>>>& inputNodes,
const std::vector<std::shared_ptr<Microsoft::MSR::CNTK::ComputationNodeBase>>& inputNodes,
Microsoft::MSR::CNTK::ComputationNetworkPtr& network,
std::unordered_map<Variable, Microsoft::MSR::CNTK::ComputationNodeBasePtr>& variableToNodeMap,
bool useMangledNamesForComputationNodes);
@ -359,6 +359,11 @@ namespace CNTK
refVar.IsParameter() ? Parameter(refVar).RecordValueUpdate() : Constant(refVar).RecordValueUpdate();
}
template <typename ElementType>
static Microsoft::MSR::CNTK::ComputationNodeBasePtr CreateLearnableParameterFromVariable(const Variable& variable, Microsoft::MSR::CNTK::ComputationNetworkBuilder<ElementType>& builder, const NDShape& shape, const std::wstring& name);
static void CastAssignNodeValue(Microsoft::MSR::CNTK::ComputationNodeBasePtr node, DataType dataType, std::shared_ptr<const Microsoft::MSR::CNTK::MatrixBase> matrix);
private:
// Set of all primitive functions in the graph underlying 'this' Function. Also keeps the primitive Function objects alive


@ -133,7 +133,7 @@ namespace CNTK
#endif
if (info.IsEmpty())
PrepaireZeroGradients(gradientValues, info);
PrepaireZeroGradients(gradientValues);
// sorts gradient buffers according to parameter uid, and perform sparse to dense conversion
// if !UseSparseGradientAggregationInDataParallelSGD()


@ -64,7 +64,9 @@ namespace CNTK
return viewPtr->WritableDataBuffer<float>();
if (viewPtr->GetDataType() == DataType::Double)
return viewPtr->WritableDataBuffer<double>();
if (viewPtr->GetDataType() == DataType::Float16)
return viewPtr->WritableDataBuffer<float16>();
LogicError("Unknown DataType");
return nullptr; // Make compiler happy.
}
@ -398,6 +400,11 @@ namespace CNTK
AllReduceData(static_cast<double*>(inputData), static_cast<double*>(outputData), numElements,
&allReduceRequests, (inputValue->Device() == DeviceDescriptor::CPUDevice()));
}
else if (dataType == DataType::Float16)
{
AllReduceDataHalf(static_cast<half*>(inputData), static_cast<half*>(outputData), numElements,
&allReduceRequests, (inputValue->Device() == DeviceDescriptor::CPUDevice()));
}
else
LogicError("MPICommunicator: Unknown DataType.");
}
@ -581,8 +588,10 @@ namespace CNTK
if (sbc->GetDataType() == DataType::Float)
AllReduceData<float>((float*)nz, (float*)nz, requiredElements, nullptr, aggregateOnCPU, MPI_SUM, true);
else
else if (sbc->GetDataType() == DataType::Double)
AllReduceData<double>((double*)nz, (double*)nz, requiredElements, nullptr, aggregateOnCPU, MPI_SUM, true);
else if (sbc->GetDataType() == DataType::Float16)
AllReduceDataHalf((half*)nz, (half*)nz, requiredElements, nullptr, aggregateOnCPU, MPI_SUM, true);
if (aggregateOnCPU)
{
@ -719,4 +728,17 @@ namespace CNTK
else
m_mpi->AllReduceAsync(inputData, outputData, numElements, &(pAllReduceRequests->back()), op);
}
void MPICommunicatorImpl::AllReduceDataHalf(half* inputData, half* outputData, size_t numElements, std::vector<MPI_Request>* pAllReduceRequests, bool dataOnCPU, MPI_Op op, bool forceSync)
{
if (m_nccl->IsSupported() && !dataOnCPU)
{
m_nccl->AllReduce(inputData, outputData, numElements, op);
return;
}
// Half-precision aggregation is only supported through NCCL.
NOT_IMPLEMENTED;
}
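So FP16 gradients only aggregate when NCCL handles the transfer; every other path lands on NOT_IMPLEMENTED. If a CPU/MPI fallback were ever wanted, one possible approach, sketched here under the assumption that widening copies are acceptable and that the half type converts to and from float (this is not what the commit implements), would be to reuse the existing templated float path:

    // Hypothetical fallback, not part of this commit: widen half -> float, run the
    // existing templated all-reduce synchronously, then narrow back. Requires <vector>.
    std::vector<float> in(numElements), out(numElements);
    for (size_t i = 0; i < numElements; ++i)
        in[i] = (float)inputData[i];
    AllReduceData<float>(in.data(), out.data(), numElements, nullptr, dataOnCPU, op, /*forceSync=*/true);
    for (size_t i = 0; i < numElements; ++i)
        outputData[i] = (half)out[i];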
}


@ -143,5 +143,7 @@ namespace CNTK
template <typename ElemType>
void AllReduceData(ElemType* inputData, ElemType* outputData, size_t numElements, std::vector<MPI_Request>* pAllReduceRequests, bool dataOnCPU, MPI_Op op = MPI_SUM, bool forceSync = false);
void AllReduceDataHalf(half* inputData, half* outputData, size_t numElements, std::vector<MPI_Request>* pAllReduceRequests, bool dataOnCPU, MPI_Op op = MPI_SUM, bool forceSync = false);
};
}


@ -36,7 +36,7 @@ namespace CNTK
m_sampleCount = checkpoint[L"totalNumberOfSamplesSeen"].Value<size_t>();
}
void DistributedLearnerBase::PrepaireZeroGradients(std::unordered_map<Parameter, NDArrayViewPtr>& gradientValues, MinibatchInfo& info)
void DistributedLearnerBase::PrepaireZeroGradients(std::unordered_map<Parameter, NDArrayViewPtr>& gradientValues)
{
// Need to initialize gradients to 0 in case when it is an empty minibatch.
for (auto& g : gradientValues)
@ -44,10 +44,6 @@ namespace CNTK
auto weights = g.first.Value();
g.second = MakeSharedObject<NDArrayView>(0, weights->GetDataType(), weights->Shape(), weights->Device());
}
auto dataType = gradientValues.begin()->first.GetDataType();
info.evalCriterionValue = MakeSharedObject<NDArrayView>(0, dataType, NDShape{}, DeviceDescriptor::UseDefaultDevice());
info.trainingLossValue = MakeSharedObject<NDArrayView>(0, dataType, NDShape{}, DeviceDescriptor::UseDefaultDevice());
}
void DistributedLearnerBase::ConvertToOrdered(const std::unordered_map<Parameter, NDArrayViewPtr>& gradientValues, std::vector<std::pair<Parameter, NDArrayViewPtr>>& result, std::unordered_map<Parameter, NDArrayViewPtr>* convertedGradientValues)


@ -22,7 +22,7 @@ namespace CNTK
protected:
DistributedLearnerBase(DistributedCommunicatorPtr communicator, LearnerPtr learner, size_t distributeAfterSamples, bool convertSparseToDense=true);
static void PrepaireZeroGradients(std::unordered_map<Parameter, NDArrayViewPtr>& gradientValues, MinibatchInfo& info);
static void PrepaireZeroGradients(std::unordered_map<Parameter, NDArrayViewPtr>& gradientValues);
void ConvertToOrdered(const std::unordered_map<Parameter, NDArrayViewPtr>& gradientValues, std::vector<std::pair<Parameter, NDArrayViewPtr>>& result, std::unordered_map<Parameter, NDArrayViewPtr>* convertedGradientValues = nullptr);
std::vector<std::pair<Parameter, NDArrayViewPtr>> m_gradientBuffer;


@ -44,7 +44,7 @@ namespace CNTK
m_aggregatedEvaluationFunction = m_evaluationFunction;
m_testSampleCountVar = m_evaluationFunction->RootFunction()->Inputs()[0];
}
if(initializeCombined)
m_combinedEvalFunction = Combine(GetCombinedEvalFunctionArgs());
}


@ -2098,14 +2098,14 @@ namespace CNTK
FunctionPtr classificationErrorComposite;
if (axis == Axis(0))
classificationErrorComposite = Minus(Constant::Scalar(1.0f), TransposeTimes(labelPlaceholder, Hardmax(predictionPlaceholder)));
classificationErrorComposite = Minus(Constant::Scalar(prediction.GetDataType(), 1.0), TransposeTimes(labelPlaceholder, Hardmax(predictionPlaceholder)));
else
{
auto axMax = ReduceMax(predictionPlaceholder, axis);
auto pred = Equal(predictionPlaceholder, axMax);
auto wrongPred = NotEqual(labelPlaceholder, pred);
auto axErr = ReduceSum(wrongPred, axis);
auto capErr = GreaterEqual(axErr, Constant::Scalar(1.0f));
auto capErr = GreaterEqual(axErr, Constant::Scalar(prediction.GetDataType(), 1.0));
classificationErrorComposite = ReduceMean(capErr, Axis::AllStaticAxes());
}
@ -2938,6 +2938,17 @@ namespace CNTK
operands, Dictionary(), name), name);
}
FunctionPtr Cast(const Variable& nodeInput, DataType outputType, const std::wstring& name)
{
std::vector<Variable> operands = { nodeInput };
Dictionary additionalAttributes;
additionalAttributes.Add(
PrimitiveFunction::AttributeNameNewDataType, static_cast<int>(outputType));
return AsComposite(MakeSharedObject<PrimitiveFunction>(
PrimitiveOpType::Cast,
operands, std::move(additionalAttributes), name), name);
}
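A quick usage sketch of the new factory from the C++ API; the variables z and labels below are placeholders for an FP16 model output and its labels, and the typical pattern is to cast up to FP32 before a loss-style reduction:

    // Illustrative only: cast FP16 outputs up to FP32 before computing a criterion.
    auto z32      = CNTK::Cast(z, CNTK::DataType::Float, L"z32");
    auto labels32 = CNTK::Cast(labels, CNTK::DataType::Float, L"labels32");
    auto loss     = CNTK::CrossEntropyWithSoftmax(z32, labels32);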
namespace Internal
{
FunctionPtr IsWithin(const Variable& operand, int offset, const std::wstring& name)


@ -10,7 +10,7 @@
#include "Serialization.h"
#define DISPATCH_TO_TYPED_UPDATE_FUNCTION \
switch (smoothedGradientValue->GetDataType()) \
switch (gradientValue->GetDataType()) \
{ \
case DataType::Float: \
Update<float>(parameter, gradientValue, smoothedGradientValue, trainingSampleCount); \
@ -18,6 +18,9 @@
case DataType::Double: \
Update<double>(parameter, gradientValue, smoothedGradientValue, trainingSampleCount); \
break; \
case DataType::Float16: \
Update<half>(parameter, gradientValue, smoothedGradientValue, trainingSampleCount); \
break; \
default: \
NOT_IMPLEMENTED; \
}
@ -203,10 +206,10 @@ namespace CNTK
LearnerBase::LearnerBase(const vector<Parameter>& parameters,
const LearningRateSchedule& learningRateSchedule,
AdditionalLearningOptions additionalOptions,
bool allocateSmoothGradients /* = true */)
AdditionalLearningOptions additionalOptions)
: Learner(parameters, learningRateSchedule, additionalOptions),
m_noiseInjectionSeed(Internal::GenerateRandomSeed())
m_noiseInjectionSeed(Internal::GenerateRandomSeed()),
m_masterParameterUpdated(false)
{
if (parameters.empty())
InvalidArgument("The parameters list specified to a Learner must not be empty.");
@ -215,22 +218,40 @@ namespace CNTK
if (uniqueParameters.size() != parameters.size())
InvalidArgument("Learner's parameters list must not contain duplicates.");
}
if (allocateSmoothGradients)
void LearnerBase::AllocateSmoothedGradients(const std::vector<Parameter>& parameters, size_t factor, size_t fp16Factor)
{
for (const auto& parameter : parameters)
{
for (const auto& parameter : parameters)
{
NDArrayViewPtr view = AllocateNDArrayView(parameter, parameter.Shape());
m_smoothedGradientValues.emplace(parameter, view);
}
NDArrayViewPtr view = AllocateSmoothedGradientFor(parameter, factor, fp16Factor);
m_smoothedGradientValues.emplace(parameter, view);
}
}
/*static*/ NDArrayViewPtr LearnerBase::AllocateNDArrayView(const Parameter& parameter, const NDShape& shape)
/*static*/ NDArrayViewPtr LearnerBase::AllocateSmoothedGradientFor(const Parameter& parameter, size_t factor, size_t fp16Factor)
{
if (parameter.GetDataType() == DataType::Float)
// A float16 parameter needs an extra buffer slice for the FP32 master copy of its weights.
if (parameter.GetDataType() == DataType::Float16) factor += fp16Factor;
const auto paramShape = GetMatrixShape(parameter);
NDShape shape;
if (factor == 0)
{
return MakeSharedObject<NDArrayView>(float(0.0), shape, parameter.Value()->Device());
shape = NDShape({});
}
else
{
if (factor == 1)
shape = parameter.Shape();
else
shape = NDShape({ paramShape[0], factor * paramShape[1] });
}
if (parameter.GetDataType() != DataType::Double)
{
// Both float and half parameters keep their smoothed gradients in float.
return MakeSharedObject<NDArrayView>(0.0f, shape, parameter.Value()->Device());
}
else
{
@ -245,11 +266,16 @@ namespace CNTK
auto matrix = GetMatrix<float>(parameter.Value());
return{ matrix->GetNumRows(), matrix->GetNumCols() };
}
else
else if (parameter.GetDataType() == DataType::Double)
{
auto matrix = GetMatrix<double>(parameter.Value());
return{ matrix->GetNumRows(), matrix->GetNumCols() };
}
else
{
auto matrix = GetMatrix<half>(parameter.Value());
return{ matrix->GetNumRows(), matrix->GetNumCols() };
}
}
/*virtual*/ bool LearnerBase::Update(unordered_map<Parameter, NDArrayViewPtr>& gradientValues, size_t trainingSampleCount, bool sweepEnd) /*override*/
@ -267,10 +293,22 @@ namespace CNTK
UpdateOnMinibatch(trainingSampleCount);
bool needUpdateMasterParameter = !m_masterParameterUpdated;
for (const auto& parameter : Parameters())
{
const auto& smoothedGradientValue = m_smoothedGradientValues.at(parameter);
const auto& gradientValue = gradientValues.at(parameter);
if (needUpdateMasterParameter && parameter.GetDataType() == DataType::Float16)
{
// convert fp16 parameter to fp32
auto sg = smoothedGradientValue->GetWritableMatrix<float>();
auto pv16 = parameter.Value()->GetWritableMatrix<half>();
size_t factor = sg->GetNumCols() / pv16->GetNumCols();
auto pv = sg->ColumnSlice(pv16->GetNumCols() * (factor - 1), pv16->GetNumCols());
pv.CastAssignValuesOf(*pv16);
}
// TODO: make this a runtime parameter.
#if DUMPOUTPUT
LOGPRINTF(stderr, "Update_%ls\n", parameter.Uid().c_str());
@ -303,6 +341,11 @@ namespace CNTK
LogicError("%ls has NaNs in parameter values after parameter update.", parameter.Uid().c_str());
#endif
}
if (needUpdateMasterParameter)
{
m_masterParameterUpdated = true;
}
m_sampleCount += trainingSampleCount;
m_minibatchCount++;
if (sweepEnd)
@ -319,7 +362,19 @@ namespace CNTK
{
const auto& parameterValue = parameter.Value();
PreProcess<ElementType>(parameterValue, gradientValue, trainingSampleCount);
Update(parameter, gradientValue, smoothedGradientValue, trainingSampleCount);
if (parameter.GetDataType() == DataType::Float16)
{
// convert fp32 parameter to fp16 after update
auto sg = smoothedGradientValue->GetWritableMatrix<float>();
auto pv16 = parameterValue->GetWritableMatrix<half>();
size_t factor = sg->GetNumCols() / pv16->GetNumCols();
auto pv = sg->ColumnSlice(pv16->GetNumCols() * (factor - 1), pv16->GetNumCols());
pv16->CastAssignValuesOf(pv);
}
PostProcess<ElementType>(parameter, gradientValue, trainingSampleCount);
auto paramRef = parameter;
@ -344,6 +399,7 @@ namespace CNTK
checkpoint[sweepCountKey] = m_sweepCount;
checkpoint[learningRateScheduleKey] = m_learningRateSchedule.Serialize();
checkpoint[noiseInjectionSeedKey] = m_noiseInjectionSeed;
checkpoint[masterParameterUpdatedKey] = m_masterParameterUpdated;
// TODO: should we also save momentum schedule into the checkpoint?
// If that is the case, need to be able to override this method in subclasses.
@ -390,6 +446,11 @@ namespace CNTK
m_noiseInjectionSeed = checkpoint[noiseInjectionSeedKey].Value<size_t>();
}
if (checkpoint.Contains(masterParameterUpdatedKey))
{
m_masterParameterUpdated = checkpoint[masterParameterUpdatedKey].Value<bool>();
}
// TODO: which learning rate schedule should take precedence here?
// The one given at construction time or the one loaded from a checkpoint?
m_learningRateSchedule = TrainingParameterSchedule<double>::Deserialize(checkpoint[learningRateScheduleKey].Value<Dictionary>());
@ -464,23 +525,6 @@ namespace CNTK
}
}
LearnerSGD::LearnerSGD(const std::vector<Parameter>& parameters,
const LearningRateSchedule& learningRateSchedule,
AdditionalLearningOptions additionalOptions,
bool allocateSmoothGradients)
: LearnerBase(parameters, learningRateSchedule, additionalOptions, allocateSmoothGradients)
{
if (!allocateSmoothGradients)
{
// the vanilla sgd does not need the smooth gradients per se,
// insert dummy nd views instead.
for (const auto& parameter : parameters)
{
m_smoothedGradientValues.emplace(parameter, AllocateNDArrayView(parameter, {}));
}
}
}
/*virtual*/ void LearnerSGD::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue,
const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) /*override*/
{
@ -514,7 +558,20 @@ namespace CNTK
{
ReportTrainingParameterValue(m_momentumSchedule, L"Momentum");
DISPATCH_TO_TYPED_UPDATE_FUNCTION;
switch (gradientValue->GetDataType())
{
case DataType::Float:
Update<float>(parameter, gradientValue, smoothedGradientValue, trainingSampleCount);
break;
case DataType::Double:
Update<double>(parameter, gradientValue, smoothedGradientValue, trainingSampleCount);
break;
case DataType::Float16:
UpdateHalf(parameter, gradientValue, smoothedGradientValue, trainingSampleCount);
break;
default:
NOT_IMPLEMENTED;
}
}
template <typename ElementType>
@ -555,10 +612,42 @@ namespace CNTK
learningRate, momentum, unitGainFactor);
}
void LearnerMomentumSGD::UpdateHalf(const Parameter& parameter, const NDArrayViewPtr& gradientValue,
const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const
{
const auto& compoundMatrix = GetWritableMatrix<float>(smoothedGradientValue);
const auto& gradientMatrix = GetWritableMatrix<half>(gradientValue);
auto smoothedGradientMatrix = compoundMatrix->ColumnSlice(0, gradientMatrix->GetNumCols());
auto tempGradientMatrix = compoundMatrix->ColumnSlice(gradientMatrix->GetNumCols(), gradientMatrix->GetNumCols());
auto parameterMatrix = compoundMatrix->ColumnSlice(2 * gradientMatrix->GetNumCols(), gradientMatrix->GetNumCols());
tempGradientMatrix.CastAssignValuesOf(*gradientMatrix);
const auto learningRate = float(LearningRate(trainingSampleCount));
const auto momentum = float(MomentumValueForMB(trainingSampleCount));
const auto unitGainFactor = UnitGainFactor<float>(trainingSampleCount);
parameterMatrix.MomentumSGDUpdate(tempGradientMatrix, smoothedGradientMatrix,
learningRate, momentum, unitGainFactor);
}
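UpdateHalf spells out the memory layout behind the FP16 path: for a half parameter whose gradient has N columns, the float buffer returned by AllocateSmoothedGradientFor is carved into three N-column slices (the extra slices come from the fp16Factor added above). Roughly, as implied by the ColumnSlice calls in this function, not code taken from the commit:

    const auto N = gradientMatrix->GetNumCols();
    // columns [0,  N)  : smoothed (momentum) gradient, kept in float
    // columns [N, 2N)  : scratch copy of the incoming half gradient, widened to float
    // columns [2N,3N)  : FP32 master copy of the weights, cast back to half after the update
    auto smoothed = compoundMatrix->ColumnSlice(0,     N);
    auto scratch  = compoundMatrix->ColumnSlice(N,     N);
    auto master   = compoundMatrix->ColumnSlice(2 * N, N);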
/*virtual*/ void LearnerNesterov::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue,
const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) /*override*/
{
DISPATCH_TO_TYPED_UPDATE_FUNCTION;
switch (gradientValue->GetDataType())
{
case DataType::Float:
Update<float>(parameter, gradientValue, smoothedGradientValue, trainingSampleCount);
break;
case DataType::Double:
Update<double>(parameter, gradientValue, smoothedGradientValue, trainingSampleCount);
break;
case DataType::Float16:
UpdateHalf(parameter, gradientValue, smoothedGradientValue, trainingSampleCount);
break;
default:
NOT_IMPLEMENTED;
}
}
template <typename ElementType>
@ -575,11 +664,30 @@ namespace CNTK
learningRate, momentum, unitGainFactor);
}
void LearnerNesterov::UpdateHalf(const Parameter& parameter, const NDArrayViewPtr& gradientValue,
const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const
{
const auto& compoundMatrix = GetWritableMatrix<float>(smoothedGradientValue);
const auto& gradientMatrix = GetWritableMatrix<half>(gradientValue);
auto smoothedGradientMatrix = compoundMatrix->ColumnSlice(0, gradientMatrix->GetNumCols());
auto tempGradientMatrix = compoundMatrix->ColumnSlice(gradientMatrix->GetNumCols(), gradientMatrix->GetNumCols());
auto parameterMatrix = compoundMatrix->ColumnSlice(2 * gradientMatrix->GetNumCols(), gradientMatrix->GetNumCols());
tempGradientMatrix.CastAssignValuesOf(*gradientMatrix);
const auto learningRate = float(LearningRate(trainingSampleCount));
const auto momentum = float(MomentumValueForMB(trainingSampleCount));
const auto unitGainFactor = UnitGainFactor<float>(trainingSampleCount);
parameterMatrix.NesterovAcceleratedMomentumSGDUpdate(tempGradientMatrix, smoothedGradientMatrix,
learningRate, momentum, unitGainFactor);
}
LearnerAdaGrad::LearnerAdaGrad(const std::vector<Parameter>& parameters,
const LearningRateSchedule& learningRateSchedule,
bool needAveMultiplier,
AdditionalLearningOptions additionalOptions)
: LearnerBase(parameters, learningRateSchedule, additionalOptions, /*allocateSmoothGradients*/ false),
: LearnerBase(parameters, learningRateSchedule, additionalOptions),
m_needAveMultiplier(needAveMultiplier)
{
for (const auto& parameter : parameters)
@ -591,8 +699,7 @@ namespace CNTK
factor = 2;
}
const auto shape = GetMatrixShape(parameter);
NDArrayViewPtr view = AllocateNDArrayView(parameter, { shape[0], factor * shape[1] });
NDArrayViewPtr view = AllocateSmoothedGradientFor(parameter, factor);
m_smoothedGradientValues.emplace(parameter, view);
}
@ -621,21 +728,29 @@ namespace CNTK
const LearningRateSchedule& learningRateSchedule,
double rho, double epsilon,
AdditionalLearningOptions additionalOptions)
: LearnerBase(parameters, learningRateSchedule, additionalOptions, /*allocateSmoothGradients*/ false),
: LearnerBase(parameters, learningRateSchedule, additionalOptions),
m_rho(rho), m_epsilon(epsilon)
{
for (const auto& parameter : parameters)
{
const auto shape = GetMatrixShape(parameter);
NDArrayViewPtr view = AllocateNDArrayView(parameter, { shape[0], 2 * shape[1] });
m_smoothedGradientValues.emplace(parameter, view);
}
AllocateSmoothedGradients(parameters, 2);
}
/*virtual*/ void LearnerAdaDelta::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue,
const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) /*override*/
{
DISPATCH_TO_TYPED_UPDATE_FUNCTION;
switch (gradientValue->GetDataType())
{
case DataType::Float:
Update<float, float>(parameter, gradientValue, smoothedGradientValue, trainingSampleCount);
break;
case DataType::Double:
Update<double, double>(parameter, gradientValue, smoothedGradientValue, trainingSampleCount);
break;
case DataType::Float16:
Update<half, float>(parameter, gradientValue, smoothedGradientValue, trainingSampleCount);
break;
default:
NOT_IMPLEMENTED;
}
}
// When the gradients are sparse, we update the corresponding internal buffers of adadelta in a sparse way
@ -645,11 +760,16 @@ namespace CNTK
// TODO: consider exposing this somehow so that it is easy to test by setting it to small value.
/* static */ const int LearnerAdaDelta::s_SyncInterval = 1 << 20;
template <typename ElementType>
template <typename GradType, typename AccumType>
void LearnerAdaDelta::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue,
const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount)
{
GET_WRITABLE_MATRICES
const auto& gradientMatrix = GetWritableMatrix<GradType>(gradientValue);
const auto& smoothedGradientMatrix = GetWritableMatrix<AccumType>(smoothedGradientValue);
// For fp16 gradients the parameter is accumulated in fp32 in the master copy (allocated as the last slice of smoothedGradient).
auto parameterMatrix = (std::is_same<GradType, half>::value) ?
smoothedGradientMatrix->ColumnSlice(smoothedGradientMatrix->GetNumCols() - gradientMatrix->GetNumCols(), gradientMatrix->GetNumCols()) :
GetWritableMatrix<AccumType>(parameter.Value())->ColumnSlice(0, gradientMatrix->GetNumCols());
const auto learningRate = LearningRate(trainingSampleCount);
@ -685,14 +805,14 @@ namespace CNTK
if (currentTimestamp >= LearnerAdaDelta::s_SyncInterval)
{
// Once in a while sync the state and reset the timestamps and current time to 0
smoothedGradientMatrix->AdaDeltaFlushState(numCols, (ElementType)m_rho, timestamps, currentTimestamp);
smoothedGradientMatrix->AdaDeltaFlushState(numCols, (AccumType)m_rho, timestamps, currentTimestamp);
m_currentTime[parameter] = currentTimestamp = 0;
}
currentTimestamp += 1;
m_currentTime[parameter] = currentTimestamp;
}
smoothedGradientMatrix->AdaDeltaUpdate(*gradientMatrix, *parameterMatrix, (ElementType)learningRate, (ElementType)m_rho, (ElementType)m_epsilon, timestamps, currentTimestamp);
smoothedGradientMatrix->template AdaDeltaUpdate<GradType>(*gradientMatrix, parameterMatrix, (AccumType)learningRate, (AccumType)m_rho, (AccumType)m_epsilon, timestamps, currentTimestamp);
}
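The AdaDelta kernel is now parameterized on two element types: GradType is the precision the gradient arrives in, AccumType the precision of the accumulators (and, for half, of the master weights), so the half case is instantiated as Update<half, float>. A tiny self-contained sketch of the same mixed-precision accumulation idea (not the AdaDelta rule itself):

    #include <cstddef>

    // Keep running state in AccumType even when gradients arrive in a narrower GradType.
    template <typename GradType, typename AccumType>
    void ExpMovingAverage(AccumType* state, const GradType* grad, std::size_t n, AccumType rho)
    {
        for (std::size_t i = 0; i < n; ++i)
            state[i] = rho * state[i] + (AccumType(1) - rho) * (AccumType)grad[i];
    }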
/*virtual*/ Dictionary LearnerAdaDelta::CreateCheckpoint() /*override*/
@ -713,12 +833,15 @@ namespace CNTK
const auto& smoothedGradientMatrix = GetWritableMatrix<float>(smoothedGradientValue);
smoothedGradientMatrix->AdaDeltaFlushState(numCols, (float)m_rho, timestamps, currentTimestamp);
}
else
else if (parameter.GetDataType() == CNTK::DataType::Double)
{
const auto numCols = GetMatrix<double>(parameter.Value())->GetNumCols();
const auto& smoothedGradientMatrix = GetWritableMatrix<double>(smoothedGradientValue);
smoothedGradientMatrix->AdaDeltaFlushState(numCols, (double)m_rho, timestamps, currentTimestamp);
}
else
LogicError("Unexpected parameter data type");
m_currentTime[parameter] = 0;
}
return LearnerBase::CreateCheckpoint();
@ -748,16 +871,10 @@ namespace CNTK
const MomentumSchedule& varianceMomentumSchedule,
AdditionalLearningOptions additionalOptions)
: LearnerMomentumSGD(parameters, learningRateSchedule, momentumSchedule,
unitGain, additionalOptions, /*allocateSmoothGradients*/ false),
unitGain, additionalOptions, 2),
m_varianceMomentumSchedule(varianceMomentumSchedule),
m_smoothedCount(0.0)
{
for (const auto& parameter : parameters)
{
const auto shape = GetMatrixShape(parameter);
NDArrayViewPtr view = AllocateNDArrayView(parameter, { shape[0], 2 * shape[1] });
m_smoothedGradientValues.emplace(parameter, view);
}
}
/*virtual*/ Dictionary LearnerFSAdaGrad::CreateCheckpoint() /*override*/
@ -823,7 +940,7 @@ namespace CNTK
bool adamax,
AdditionalLearningOptions additionalOptions)
: LearnerMomentumSGD(parameters, learningRateSchedule, momentumSchedule,
unitGain, additionalOptions, /*allocateSmoothGradients*/ false),
unitGain, additionalOptions, 2),
m_varianceMomentumSchedule(varianceMomentumSchedule), m_epsilon(epsilon),
m_adamax(adamax)
{
@ -833,12 +950,8 @@ namespace CNTK
InvalidArgument("Epsilon should be non-negative. You are trying to set it to %g.", m_epsilon);
}
for (const auto& parameter : parameters)
{
const auto shape = GetMatrixShape(parameter);
NDArrayViewPtr view = AllocateNDArrayView(parameter, {shape[0], 2 * shape[1]});
m_smoothedGradientValues.emplace(parameter, view);
}
AllocateSmoothedGradients(parameters, 2);
m_smoothedCount = 0.0;
}
@ -893,7 +1006,7 @@ namespace CNTK
double gamma, double inc, double dec, double max, double min,
bool needAveMultiplier,
AdditionalLearningOptions additionalOptions)
: LearnerBase(parameters, learningRateSchedule, additionalOptions, /*allocateSmoothGradients*/ false),
: LearnerBase(parameters, learningRateSchedule, additionalOptions),
m_gamma(gamma), m_inc(inc), m_dec(dec), m_max(max), m_min(min), m_needAveMultiplier(needAveMultiplier)
{
// validation of learner settings
@ -922,7 +1035,7 @@ namespace CNTK
}
const auto shape = GetMatrixShape(parameter);
NDArrayViewPtr view = AllocateNDArrayView(parameter, { shape[0], factor * shape[1] });
NDArrayViewPtr view = AllocateSmoothedGradientFor(parameter, factor);
m_smoothedGradientValues.emplace(parameter, view);
}
@ -996,7 +1109,7 @@ namespace CNTK
bool unitGain,
AdditionalLearningOptions additionalOptions /*= AdditionalLearningOptions()*/)
{
return MakeSharedObject<LearnerMomentumSGD>(parameters, learningRateSchedule, momentumSchedule, unitGain, additionalOptions);
return MakeSharedObject<LearnerMomentumSGD>(parameters, learningRateSchedule, momentumSchedule, unitGain, additionalOptions, 1);
}
LearnerPtr NesterovLearner(const vector<Parameter>& parameters,
@ -1059,7 +1172,7 @@ namespace CNTK
LearnerUniversal::LearnerUniversal(const std::vector<Parameter>& parameters, const ParameterUpdateFunctor& func)
: LearnerBase(parameters, LearningRateSchedule(1.0), AdditionalLearningOptions(), /*allocateSmoothGradients*/ false)
: LearnerBase(parameters, LearningRateSchedule(1.0), AdditionalLearningOptions())
{
std::vector<Variable> gradients;
std::vector<FunctionPtr> functions;
@ -1083,7 +1196,7 @@ namespace CNTK
}
LearnerUniversal::LearnerUniversal(const std::vector<Parameter>& parameters, const std::vector<Variable>& gradients, FunctionPtr updateFunc)
: LearnerBase(parameters, LearningRateSchedule(1.0), AdditionalLearningOptions(), /*allocateSmoothGradients*/ false)
: LearnerBase(parameters, LearningRateSchedule(1.0), AdditionalLearningOptions())
{
ValidateInput(parameters, gradients, updateFunc);
}
@ -1107,7 +1220,7 @@ namespace CNTK
fprintf(stderr, "WARNING: Update function does not contain the gradient for parameter %ls in its computation\n", param.AsString().c_str());
m_parameter_gradient_map.insert({parameters[i], gradients[i]});
}
AllocateDummySmoothedGradients(parameters);
AllocateSmoothedGradients(parameters, 0);
m_update_func = updateFunc;
}


@ -30,14 +30,11 @@ namespace CNTK
virtual void ResetSmoothedGradients() override;
protected:
// allocateSmoothGradients flag specifies whether NDArrayViews for smoothed gradients can be allocated
// in the base class constructor (in which case they are allocated with the shapes identical to the shapes of
// the corresponding parameters) or if the allocation should be deferred to the subclass constructor (which
// performs allocation that is specific to the particular learner, see FSAdaGrad and RMSProp).
LearnerBase(const std::vector<Parameter>& parameters,
const LearningRateSchedule& learningRateSchedule,
AdditionalLearningOptions additionalOptions,
bool allocateSmoothGradients = true);
AdditionalLearningOptions additionalOptions);
void AllocateSmoothedGradients(const std::vector<Parameter>& parameters, size_t factor, size_t fp16Factor = 1);
virtual void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) = 0;
@ -82,6 +79,8 @@ namespace CNTK
std::unordered_map<Parameter, NDArrayViewPtr> m_smoothedGradientValues;
bool m_masterParameterUpdated; // whether the master copy of parameters are updated
mutable size_t m_noiseInjectionSeed;
// The following four static protected methods expose private methods of NDArrayView class
@ -113,7 +112,7 @@ namespace CNTK
// Returns an NDArrayView with the required shape, with the same data type as parameter value
// and allocated on the same device.
static NDArrayViewPtr AllocateNDArrayView(const Parameter& parameter, const NDShape& shape);
static NDArrayViewPtr AllocateSmoothedGradientFor(const Parameter& parameter, size_t factor, size_t fp16Factor = 1);
// Retrieves the shape of the matrix corresponding to the parameter value.
static NDShape GetMatrixShape(const Parameter& parameter);
@ -142,8 +141,11 @@ namespace CNTK
public:
LearnerSGD(const std::vector<Parameter>& parameters,
const LearningRateSchedule& learningRateSchedule,
AdditionalLearningOptions additionalOptions,
bool allocateSmoothGradients = false);
AdditionalLearningOptions additionalOptions)
: LearnerBase(parameters, learningRateSchedule, additionalOptions)
{
AllocateSmoothedGradients(parameters, 0);
}
protected:
@ -162,11 +164,13 @@ namespace CNTK
const MomentumSchedule& momentumSchedule,
bool unitGain,
AdditionalLearningOptions additionalOptions,
bool allocateSmoothGradients = true)
: LearnerBase(parameters, learningRateSchedule, additionalOptions, allocateSmoothGradients),
size_t smoothGradientFactor)
: LearnerBase(parameters, learningRateSchedule, additionalOptions),
m_momentumSchedule(momentumSchedule),
m_unitGain(unitGain)
{ }
{
AllocateSmoothedGradients(parameters, smoothGradientFactor, 2);
}
// returns current per-minibatch momentum value.
virtual double MomentumValueForMB(size_t minibatchSize) const
@ -177,9 +181,11 @@ namespace CNTK
protected:
virtual void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) override;
template <typename ElementType>
template <typename ElemType>
void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const;
void UpdateHalf(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const;
// returns current per-minibatch momentum value from the provided schedule.
double MomentumValueForMB(const MomentumSchedule& schedule, size_t minibatchSize) const;
@ -216,7 +222,7 @@ namespace CNTK
const MomentumSchedule& momentumSchedule,
bool unitGain,
AdditionalLearningOptions additionalOptions)
: LearnerMomentumSGD(parameters, learningRateSchedule, momentumSchedule, unitGain, additionalOptions, /*allocateSmoothGradients*/ true)
: LearnerMomentumSGD(parameters, learningRateSchedule, momentumSchedule, unitGain, additionalOptions, 1)
{}
protected:
@ -224,6 +230,7 @@ namespace CNTK
template <typename ElementType>
void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const;
void UpdateHalf(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const;
};
class LearnerAdaGrad : public LearnerBase
@ -268,7 +275,7 @@ namespace CNTK
virtual void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) override;
template <typename ElementType>
template <typename GradType, typename AccumType>
void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount);
virtual Dictionary CreateCheckpoint() override;
@ -404,14 +411,6 @@ namespace CNTK
virtual bool Update(std::unordered_map<Parameter, NDArrayViewPtr>& gradientValues, size_t trainingSampleCount, bool sweepEnd) override;
private:
void AllocateDummySmoothedGradients(const std::vector<Parameter>& parameters)
{
for (const auto& parameter : parameters)
{
m_smoothedGradientValues.emplace(parameter, AllocateNDArrayView(parameter, {}));
}
}
void ValidateInput(const std::vector<Parameter>& parameters, const std::vector<Variable>& gradients, FunctionPtr updateFunc);


@ -17,8 +17,20 @@ using namespace Microsoft::MSR::CNTK;
namespace CNTK
{
template <typename ElementType>
static TensorView<ElementType>* AllocateTensorView(const NDShape& viewShape,
template<typename ElemType>
inline ElemType quiet_NaN()
{
return std::numeric_limits<ElemType>::quiet_NaN();
}
template<>
inline float16 quiet_NaN<float16>()
{
return float16(std::numeric_limits<float>::quiet_NaN());
}
template <typename V1ElemType>
static TensorView<V1ElemType>* AllocateTensorView(const NDShape& viewShape,
const DeviceDescriptor& device,
void* dataBuffer,
size_t bufferSizeInBytes)
@ -26,13 +38,13 @@ namespace CNTK
if (dataBuffer == nullptr)
InvalidArgument("Cannot create a NDArrayView over a null data buffer.");
if (bufferSizeInBytes < (viewShape.TotalSize() * sizeof(ElementType)))
if (bufferSizeInBytes < (viewShape.TotalSize() * sizeof(V1ElemType)))
InvalidArgument("Size (%d) of the specified buffer for creating the NDArrayView is smaller than the specified view shape '%S'.",
(int)bufferSizeInBytes, viewShape.AsString().c_str());
auto matrixDims = GetMatrixDimensions(viewShape);
std::shared_ptr<Matrix<ElementType>> matrix = std::make_shared<Matrix<ElementType>>(matrixDims.first, matrixDims.second, (ElementType*)dataBuffer, AsCNTKImplDeviceId(device), matrixFlagDontOwnBuffer);
return new TensorView<ElementType>(matrix, AsTensorViewShape(viewShape));
std::shared_ptr<Matrix<V1ElemType>> matrix = std::make_shared<Matrix<V1ElemType>>(matrixDims.first, matrixDims.second, (V1ElemType*)dataBuffer, AsCNTKImplDeviceId(device), matrixFlagDontOwnBuffer);
return new TensorView<V1ElemType>(matrix, AsTensorViewShape(viewShape));
}
static void* AllocateTensorView(CNTK::DataType dataType,
@ -47,26 +59,28 @@ namespace CNTK
return AllocateTensorView<float>(viewShape, device, dataBuffer, bufferSizeInBytes);
case DataType::Double:
return AllocateTensorView<double>(viewShape, device, dataBuffer, bufferSizeInBytes);
case DataType::Float16:
return AllocateTensorView<half>(viewShape, device, dataBuffer, bufferSizeInBytes);
default:
LogicError("Unsupported DataType %s", DataTypeName(dataType));
break;
}
}
template <typename ElementType>
static TensorView<ElementType>* AllocateTensorView(const NDShape& viewShape,
CNTK::StorageFormat storageType,
const DeviceDescriptor& device,
size_t numNonZeroValues = 0)
template<typename V1ElemType>
static TensorView<V1ElemType>* AllocateTensorView(const NDShape& viewShape,
CNTK::StorageFormat storageType,
const DeviceDescriptor& device,
size_t numNonZeroValues = 0)
{
auto matrixDims = GetMatrixDimensions(viewShape);
std::shared_ptr<Matrix<ElementType>> matrix = std::make_shared<Matrix<ElementType>>(matrixDims.first,
matrixDims.second,
AsCNTKImplDeviceId(device),
IsSparseStorageFormat(storageType) ? MatrixType::SPARSE : MatrixType::DENSE,
AsCNTKImplMatrixFormat(storageType),
numNonZeroValues);
return new TensorView<ElementType>(matrix, AsTensorViewShape(viewShape));
std::shared_ptr<Matrix<V1ElemType>> matrix = std::make_shared<Matrix<V1ElemType>>(matrixDims.first,
matrixDims.second,
AsCNTKImplDeviceId(device),
IsSparseStorageFormat(storageType) ? MatrixType::SPARSE : MatrixType::DENSE,
AsCNTKImplMatrixFormat(storageType),
numNonZeroValues);
return new TensorView<V1ElemType>(matrix, AsTensorViewShape(viewShape));
}
static void* AllocateTensorView(CNTK::DataType dataType,
@ -81,6 +95,8 @@ namespace CNTK
return AllocateTensorView<float>(viewShape, storageType, device, numNonZeroValues);
case DataType::Double:
return AllocateTensorView<double>(viewShape, storageType, device, numNonZeroValues);
case DataType::Float16:
return AllocateTensorView<half>(viewShape, storageType, device, numNonZeroValues);
default:
LogicError("Unsupported DataType %s", DataTypeName(dataType));
break;
@ -92,17 +108,37 @@ namespace CNTK
{
}
template <typename ElementType>
NDArrayView::NDArrayView(const NDShape& viewShape, const SparseIndexType* colStarts, const SparseIndexType* rowIndices, const ElementType* nonZeroValues, size_t numNonZeroValues, const DeviceDescriptor& device, bool readOnly/* = false*/)
: NDArrayView(AsDataType<ElementType>(), device, StorageFormat::SparseCSC, viewShape, false, AllocateTensorView<ElementType>(viewShape, StorageFormat::SparseCSC, device, numNonZeroValues))
NDArrayView::NDArrayView(CNTK::DataType dataType, const NDShape& viewShape, const SparseIndexType* colStarts, const SparseIndexType* rowIndices, const void* nonZeroValues, size_t numNonZeroValues, const DeviceDescriptor& device, bool readOnly/* = false*/)
: NDArrayView(dataType, device, StorageFormat::SparseCSC, viewShape, false, AllocateTensorView(dataType, StorageFormat::SparseCSC, viewShape, device, numNonZeroValues * DataTypeSize(dataType)))
{
if ((colStarts == nullptr) || (rowIndices == nullptr) || (nonZeroValues == nullptr) || (numNonZeroValues == 0) || (numNonZeroValues > viewShape.TotalSize()))
InvalidArgument("Invalid sparse CSC format data specified for construction of NDArrayView with shape '%S'; "
"either one of the specified buffers is null or the count (%d) of non-zero values is invalid.",
viewShape.AsString().c_str(), (int)numNonZeroValues);
auto sparseMatrix = GetWritableMatrix<ElementType>(1);
sparseMatrix->SetMatrixFromCSCFormat(colStarts, rowIndices, nonZeroValues, numNonZeroValues, sparseMatrix->GetNumRows(), sparseMatrix->GetNumCols());
switch (dataType)
{
case DataType::Float:
{
auto sparseMatrix = GetWritableMatrix<float>(1);
sparseMatrix->SetMatrixFromCSCFormat(colStarts, rowIndices, (const float*)nonZeroValues, numNonZeroValues, sparseMatrix->GetNumRows(), sparseMatrix->GetNumCols());
break;
}
case DataType::Double:
{
auto sparseMatrix = GetWritableMatrix<double>(1);
sparseMatrix->SetMatrixFromCSCFormat(colStarts, rowIndices, (const double*)nonZeroValues, numNonZeroValues, sparseMatrix->GetNumRows(), sparseMatrix->GetNumCols());
break;
}
case DataType::Float16:
{
auto sparseMatrix = GetWritableMatrix<half>(1);
sparseMatrix->SetMatrixFromCSCFormat(colStarts, rowIndices, (const half*)nonZeroValues, numNonZeroValues, sparseMatrix->GetNumRows(), sparseMatrix->GetNumCols());
break;
}
default:
LogicError("Unsupported DataType %s", DataTypeName(dataType));
break;
}
m_isReadOnly = readOnly;
}
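The constructor now dispatches on a runtime DataType over an untyped value buffer. A hedged usage sketch building a small FP16 CSC view with the new signature (illustrative values; whether FP16 sparse storage is actually supported on a given device follows the FP16 preview caveats):

    #include <vector>
    #include "CNTKLibrary.h"

    void BuildSparseFp16View()
    {
        using namespace CNTK;
        // 2x2 matrix, one non-zero per column; colStarts has one entry per column
        // plus the terminating total count.
        std::vector<SparseIndexType> colStarts  = { 0, 1, 2 };
        std::vector<SparseIndexType> rowIndices = { 0, 1 };
        std::vector<float16>         nonZeros   = { float16(1.0f), float16(2.0f) };

        auto view = MakeSharedObject<NDArrayView>(
            DataType::Float16, NDShape({ 2, 2 }),
            colStarts.data(), rowIndices.data(),
            static_cast<const void*>(nonZeros.data()), nonZeros.size(),
            DeviceDescriptor::CPUDevice(), /*readOnly=*/ false);
        // Per the FP16 preview caveats, actual FP16 sparse support depends on the
        // device and build; the call mainly illustrates the new signature.
    }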
@ -118,6 +154,9 @@ namespace CNTK
case DataType::Double:
delete GetTensorView<double>();
break;
case DataType::Float16:
delete GetTensorView<half>();
break;
default:
LogicError("Unsupported DataType %s", DataTypeName(m_dataType));
break;
@ -136,6 +175,8 @@ namespace CNTK
{
if (GetDataType() == DataType::Double)
SetValue((double)value);
else if (GetDataType() == DataType::Float16)
SetValue((float16)value);
else
{
if (IsSparse())
@ -167,12 +208,25 @@ namespace CNTK
auto currentMatrix = GetMatrix<double>();
return currentMatrix->IsView();
}
case DataType::Float16:
{
auto currentMatrix = GetMatrix<half>();
return currentMatrix->IsView();
}
}
return false;
}
template <typename ElementType>
/*static*/ std::shared_ptr<Matrix<ElementType>> NDArrayView::GetMatrixImpl(const TensorView<ElementType>* tensorView, size_t rowColSplitPoint)
void NDArrayView::SetValue(float16 value)
{
if (IsSparse())
LogicError("NDArrayView::SetValue: Setting a NDArrayView contents to a scalar is only allowed for objects with dense storage format.");
GetWritableMatrix<half>()->SetValue(*reinterpret_cast<half*>(&value));
}
template <typename V1ElemType>
/*static*/ std::shared_ptr<Matrix<V1ElemType>> NDArrayView::GetMatrixImpl(const TensorView<V1ElemType>* tensorView, size_t rowColSplitPoint)
{
auto tensorShape = tensorView->GetShape();
@ -213,34 +267,66 @@ namespace CNTK
return tensorView->Reshaped(tensorShape).AsMatrix();
}
template <typename ElementType>
std::shared_ptr<const Matrix<ElementType>> NDArrayView::GetMatrix(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/) const
template <typename V1ElemType>
std::shared_ptr<const Matrix<V1ElemType>> NDArrayView::GetMatrix(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/) const
{
return GetMatrixImpl<ElementType>(GetTensorView<ElementType>(), rowColSplitPoint);
return GetMatrixImpl<V1ElemType>(GetTensorView<V1ElemType>(), rowColSplitPoint);
}
template <typename ElementType>
std::shared_ptr<Matrix<ElementType>> NDArrayView::GetWritableMatrix(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/)
template <typename V1ElemType>
std::shared_ptr<Matrix<V1ElemType>> NDArrayView::GetWritableMatrix(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/)
{
return GetMatrixImpl<ElementType>(GetWritableTensorView<ElementType>(), rowColSplitPoint);
return GetMatrixImpl<V1ElemType>(GetWritableTensorView<V1ElemType>(), rowColSplitPoint);
}
template <typename ElementType>
const TensorView<ElementType>* NDArrayView::GetTensorView() const
std::shared_ptr<const MatrixBase> NDArrayView::GetMatrixBase(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/) const
{
if (AsDataType<ElementType>() != m_dataType)
LogicError("NDArrayView::GetTensorView: The specified ElementType %s does not match the DataType %s", typeid(ElementType).name(), DataTypeName(m_dataType));
return (const TensorView<ElementType>*)(m_tensorView.get());
switch (m_dataType)
{
case DataType::Float:
return GetMatrixImpl<float>(GetTensorView<float>(), rowColSplitPoint);
case DataType::Double:
return GetMatrixImpl<double>(GetTensorView<double>(), rowColSplitPoint);
case DataType::Float16:
return GetMatrixImpl<half>(GetTensorView<half>(), rowColSplitPoint);
default:
LogicError("Unknown m_dataType %d", (int)m_dataType);
}
return nullptr;
}
template <typename ElementType>
TensorView<ElementType>* NDArrayView::GetWritableTensorView()
std::shared_ptr<MatrixBase> NDArrayView::GetWritableMatrixBase(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/)
{
switch (m_dataType)
{
case DataType::Float:
return GetMatrixImpl<float>(GetWritableTensorView<float>(), rowColSplitPoint);
case DataType::Double:
return GetMatrixImpl<double>(GetWritableTensorView<double>(), rowColSplitPoint);
case DataType::Float16:
return GetMatrixImpl<half>(GetWritableTensorView<half>(), rowColSplitPoint);
default:
LogicError("Unknown m_dataType %d", (int)m_dataType);
}
return nullptr;
}
template <typename V1ElemType>
const TensorView<V1ElemType>* NDArrayView::GetTensorView() const
{
if (AsDataType<V1ElemType>() != m_dataType)
LogicError("NDArrayView::GetTensorView: The specified ElementType %s does not match the DataType %s", typeid(V1ElemType).name(), DataTypeName(m_dataType));
return (const TensorView<V1ElemType>*)(m_tensorView.get());
}
template <typename V1ElemType>
TensorView<V1ElemType>* NDArrayView::GetWritableTensorView()
{
if (IsReadOnly())
InvalidArgument("NDArrayView::GetWritableTensorView: Cannot get a writable TensorView from a read-only NDArrayView.");
return const_cast<TensorView<ElementType>*>(GetTensorView<ElementType>());
return const_cast<TensorView<V1ElemType>*>(GetTensorView<V1ElemType>());
}
NDArrayViewPtr NDArrayView::DeepClone(const DeviceDescriptor& device, bool readOnly/* = false*/) const
@ -262,6 +348,13 @@ namespace CNTK
newMatrix->AssignValuesOf(*thisMatrix);
break;
}
case DataType::Float16:
{
auto newMatrix = newView->GetWritableMatrix<half>();
auto thisMatrix = GetMatrix<half>();
newMatrix->AssignValuesOf(*thisMatrix);
break;
}
default:
LogicError("NDArrayView::DeepClone: Unsupported DataType %s", DataTypeName(m_dataType));
break;
@ -296,6 +389,13 @@ namespace CNTK
destMatrix->AssignValuesOf(*sourceMatrix);
break;
}
case DataType::Float16:
{
auto sourceMatrix = source.GetMatrix<half>();
auto destMatrix = GetWritableMatrix<half>();
destMatrix->AssignValuesOf(*sourceMatrix);
break;
}
default:
LogicError("NDArrayView::CopyFrom: Unsupported DataType %s", DataTypeName(m_dataType));
break;
@ -313,6 +413,9 @@ namespace CNTK
case DataType::Double:
tensorView = new TensorView<double>(*(GetTensorView<double>()));
break;
case DataType::Float16:
tensorView = new TensorView<half>(*(GetTensorView<half>()));
break;
default:
LogicError("NDArrayView::Alias: Unsupported DataType %s", DataTypeName(m_dataType));
break;
@ -385,6 +488,19 @@ namespace CNTK
tensorView = new TensorView<double>(slicedMatrixView, AsTensorViewShape(sliceViewShape));
break;
}
case DataType::Float16:
{
auto currentMatrix = GetMatrix<half>();
std::pair<size_t, size_t> currentMatrixDims = { currentMatrix->GetNumRows(), currentMatrix->GetNumCols() };
std::shared_ptr<Matrix<half>> slicedMatrixView;
if (sliceViewMatrixDims.first != currentMatrixDims.first)
slicedMatrixView = make_shared<Matrix<half>>(currentMatrix->Reshaped(1, currentMatrix->GetNumElements()).ColumnSlice(flatBufferOffset, sliceViewShape.TotalSize()));
else
slicedMatrixView = make_shared<Matrix<half>>(currentMatrix->ColumnSlice(sliceMatrixColumnOffset, sliceViewMatrixDims.second));
tensorView = new TensorView<half>(slicedMatrixView, AsTensorViewShape(sliceViewShape));
break;
}
default:
LogicError("NDArrayView::SliceView: Unsupported DataType %s", DataTypeName(m_dataType));
break;
@ -412,6 +528,9 @@ namespace CNTK
case DataType::Double:
tensorView = new TensorView<double>(*(GetTensorView<double>()), newTensorShape);
break;
case DataType::Float16:
tensorView = new TensorView<half>(*(GetTensorView<half>()), newTensorShape);
break;
default:
LogicError("NDArrayView::AsShape: Unsupported DataType %s", DataTypeName(m_dataType));
break;
@ -430,9 +549,21 @@ namespace CNTK
return const_cast<ElementType*>(DataBuffer<ElementType>());
}
// TODO: This could actually be strided?
template <typename ElementType>
const ElementType* NDArrayView::DataBuffer() const
{
return const_cast<ElementType*>(_DataBuffer<ElementType, ElementType>());
}
template<>
const float16* NDArrayView::DataBuffer<float16>() const
{
return const_cast<float16*>(_DataBuffer<float16, half>());
}
// TODO: This could actually be strided?
template <typename ElementType, typename V1ElemType>
const ElementType* NDArrayView::_DataBuffer() const
{
if (AsDataType<ElementType>() != m_dataType)
InvalidArgument("NDArrayView::DataBuffer: The specified ElementType '%s' does not match this NDArrayView's DataType '%s'.", typeid(ElementType).name(), DataTypeName(m_dataType));
@ -441,13 +572,25 @@ namespace CNTK
InvalidArgument("The stroage format of 'this' NDArrayView is sparse. Please use SparseDataBuffers().");
// First make sure that the underlying matrix is on the right device
auto matrix = GetMatrix<ElementType>();
auto matrix = GetMatrix<V1ElemType>();
matrix->TransferToDeviceIfNotThere(AsCNTKImplDeviceId(m_device), true);
return matrix->Data();
return reinterpret_cast<ElementType*>(matrix->Data());
}
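The public DataBuffer&lt;float16&gt;() overload above returns CNTK::float16 pointers while the underlying V1 matrix stores half; the reinterpret_cast is sound only because the two types share size and layout. A standalone sketch of that bridging pattern (assumed type names, not the actual CNTK definitions):

    #include <cstdint>

    // API-facing and storage-facing 16-bit wrappers with identical layout.
    struct ApiHalf     { std::uint16_t bits; };   // plays the role of CNTK::float16
    struct StorageHalf { std::uint16_t bits; };   // plays the role of the V1 'half' type

    static_assert(sizeof(ApiHalf) == sizeof(StorageHalf),
                  "pointer re-typing is only valid when size and layout match");

    // Storage hands out StorageHalf*; the public accessor re-types it for callers.
    const ApiHalf* AsApiBuffer(const StorageHalf* storageData)
    {
        return reinterpret_cast<const ApiHalf*>(storageData);
    }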
template <typename ElementType>
std::tuple<const ElementType *, const SparseIndexType*, const SparseIndexType*, size_t> NDArrayView::SparseCSCDataBuffers() const
{
return _SparseCSCDataBuffers<ElementType, ElementType>();
}
template <>
std::tuple<const float16 *, const SparseIndexType*, const SparseIndexType*, size_t> NDArrayView::SparseCSCDataBuffers<float16>() const
{
return _SparseCSCDataBuffers<float16, half>();
}
template <typename ElementType, typename V1ElemType>
std::tuple<const ElementType *, const SparseIndexType*, const SparseIndexType*, size_t> NDArrayView::_SparseCSCDataBuffers() const
{
if (AsDataType<ElementType>() != m_dataType)
InvalidArgument("NDArrayView::SparseDataBuffers: The specified ElementType '%s' does not match this NDArrayView's DataType '%s'.", typeid(ElementType).name(), DataTypeName(m_dataType));
@ -458,7 +601,7 @@ namespace CNTK
if(GetStorageFormat() != StorageFormat::SparseCSC)
RuntimeError("The SparseCSCDataBuffers() method only supports CSC sparse format.");
std::shared_ptr<const Matrix<ElementType>> matrix = GetMatrix<ElementType>();
std::shared_ptr<const Matrix<V1ElemType>> matrix = GetMatrix<V1ElemType>();
auto matrixDims = GetMatrixDimensions(Shape());
if (matrix->GetNumRows() != matrixDims.first)
LogicError("The number of rows of the underlying matrix does not match the shape.");
@ -470,16 +613,16 @@ namespace CNTK
RuntimeError("NDArrayView::SparseDataBuffers: The underlying matrix of 'this' NDArrayView is not in the CSC sparse format.");
size_t numNonZeroValues;
ElementType* nonZeroValues;
V1ElemType* nonZeroValues;
SparseIndexType* colStarts;
SparseIndexType* rowIndices;
if (m_device.Type() == DeviceKind::CPU)
{
if (sizeof(CPUSPARSE_INDEX_TYPE) != sizeof(SparseIndexType))
LogicError("Inconsistent data type for sparse index in 'this' Value and the underlying matrix on CPU.");
std::shared_ptr<Microsoft::MSR::CNTK::CPUSparseMatrix<ElementType>> sparseMatrix = matrix->m_CPUSparseMatrix;
std::shared_ptr<Microsoft::MSR::CNTK::CPUSparseMatrix<V1ElemType>> sparseMatrix = matrix->m_CPUSparseMatrix;
numNonZeroValues = sparseMatrix->NzCount();
nonZeroValues = static_cast<ElementType *>(sparseMatrix->NzValues());
nonZeroValues = static_cast<V1ElemType *>(sparseMatrix->NzValues());
colStarts = static_cast<SparseIndexType *>(sparseMatrix->ColLocation());
rowIndices = static_cast<SparseIndexType *>(sparseMatrix->RowLocation());
}
@ -487,9 +630,9 @@ namespace CNTK
{
if (sizeof(GPUSPARSE_INDEX_TYPE) != sizeof(SparseIndexType))
LogicError("Inconsistent data type for sparse index in 'this' Value and the underlying matrix on GPU.");
std::shared_ptr<Microsoft::MSR::CNTK::GPUSparseMatrix<ElementType>> sparseMatrix = matrix->m_GPUSparseMatrix;
std::shared_ptr<Microsoft::MSR::CNTK::GPUSparseMatrix<V1ElemType>> sparseMatrix = matrix->m_GPUSparseMatrix;
numNonZeroValues = sparseMatrix->NzCount();
nonZeroValues = static_cast<ElementType *>(sparseMatrix->NzValues());
nonZeroValues = static_cast<V1ElemType *>(sparseMatrix->NzValues());
colStarts = static_cast<SparseIndexType *>(sparseMatrix->ColLocation());
rowIndices = static_cast<SparseIndexType *>(sparseMatrix->RowLocation());
}
@ -498,11 +641,23 @@ namespace CNTK
RuntimeError("NDArrayView::SparseDataBuffers: The device %S is currently not supported.",DeviceKindName(m_device.Type()));
}
return std::tuple<ElementType *, SparseIndexType *, SparseIndexType *, size_t>(nonZeroValues, colStarts, rowIndices, numNonZeroValues);
return std::tuple<ElementType *, SparseIndexType *, SparseIndexType *, size_t>(reinterpret_cast<ElementType*>(nonZeroValues), colStarts, rowIndices, numNonZeroValues);
}
template <typename ElementType>
std::tuple<const void *, const SparseIndexType*, const SparseIndexType*, size_t, size_t, size_t> NDArrayView::SparseBlockColumnDataBuffers() const
{
return _SparseBlockColumnDataBuffers<ElementType, ElementType>();
}
template <>
std::tuple<const void *, const SparseIndexType*, const SparseIndexType*, size_t, size_t, size_t> NDArrayView::SparseBlockColumnDataBuffers<float16>() const
{
return _SparseBlockColumnDataBuffers<float16, half>();
}
template <typename ElementType, typename V1ElemType>
std::tuple<const void *, const SparseIndexType*, const SparseIndexType*, size_t, size_t, size_t> NDArrayView::_SparseBlockColumnDataBuffers() const
{
if (AsDataType<ElementType>() != m_dataType)
InvalidArgument("NDArrayView::SparseBlockColumnDataBuffers: The specified ElementType '%s' does not match this NDArrayView's DataType '%s'.", typeid(ElementType).name(), DataTypeName(m_dataType));
@ -513,23 +668,23 @@ namespace CNTK
if (GetStorageFormat() != StorageFormat::SparseBlockCol)
RuntimeError("The SparseBlockColumnDataBuffers() method only supports sparse block column format.");
std::shared_ptr<const Matrix<ElementType>> matrix = GetMatrix<ElementType>();
std::shared_ptr<const Matrix<V1ElemType>> matrix = GetMatrix<V1ElemType>();
size_t numBlocks;
size_t numRows;
size_t numCols;
ElementType* blockValues;
V1ElemType* blockValues;
SparseIndexType* blockId2Col;
SparseIndexType* col2BlockId;
if (m_device.Type() == DeviceKind::GPU)
{
if (sizeof(GPUSPARSE_INDEX_TYPE) != sizeof(SparseIndexType))
LogicError("Inconsistent data type for sparse index in 'this' Value and the underlying matrix on GPU.");
std::shared_ptr<Microsoft::MSR::CNTK::GPUSparseMatrix<ElementType>> sparseMatrix = matrix->m_GPUSparseMatrix;
std::shared_ptr<Microsoft::MSR::CNTK::GPUSparseMatrix<V1ElemType>> sparseMatrix = matrix->m_GPUSparseMatrix;
numBlocks = sparseMatrix->GetBlockSize();
numRows = sparseMatrix->GetNumRows();
numCols = sparseMatrix->GetNumCols();
blockValues = static_cast<ElementType *>(sparseMatrix->NzValues());
blockValues = static_cast<V1ElemType *>(sparseMatrix->NzValues());
blockId2Col = static_cast<SparseIndexType *>(sparseMatrix->BlockId2ColOrRow());
col2BlockId = static_cast<SparseIndexType *>(sparseMatrix->ColOrRow2BlockId());
}
@ -539,7 +694,7 @@ namespace CNTK
RuntimeError("NDArrayView::SparseBlockColumnDataBuffers: The device %S is currently not supported.", DeviceKindName(m_device.Type()));
}
return std::tuple<ElementType *, SparseIndexType *, SparseIndexType *, size_t, size_t, size_t>(blockValues, blockId2Col, col2BlockId, numBlocks, numRows, numCols);
return std::tuple<ElementType *, SparseIndexType *, SparseIndexType *, size_t, size_t, size_t>(reinterpret_cast<ElementType*>(blockValues), blockId2Col, col2BlockId, numBlocks, numRows, numCols);
}
void NDArrayView::AdjustSparseBlockColumn(const SparseIndexType* cpuCol2BlockId, size_t numBlocks, bool useBlockId2Col)
@ -585,6 +740,13 @@ namespace CNTK
matrix->CollapseDataLocation();
break;
}
case DataType::Float16:
{
auto matrix = GetMatrix<half>();
matrix->TransferFromDeviceToDevice(matrix->GetDeviceId(), AsCNTKImplDeviceId(device), /*isBeingMoved = */ true, /*emptyTransfer =*/ false, /*updatePreferredDevice =*/ true);
matrix->CollapseDataLocation();
break;
}
default:
LogicError("NDArrayView::ChangeDevice: Unsupported DataType %s", DataTypeName(m_dataType));
break;
@ -594,33 +756,69 @@ namespace CNTK
}
template <typename ElementType>
/*static*/ NDArrayViewPtr NDArrayView::RandomNormal(const NDShape& shape, double mean, double stdDev, unsigned long seed, const DeviceDescriptor& device /*= DeviceDescriptor::UseDefaultDevice()*/)
/*static*/ NDArrayViewPtr NDArrayView::RandomNormal(const NDShape& shape, double mean, double stdDev, unsigned long seed, const DeviceDescriptor& device)
{
return NDArrayView::_RandomNormal<ElementType, ElementType>(shape, mean, stdDev, seed, device);
}
template <>
/*static*/ NDArrayViewPtr NDArrayView::RandomNormal<float16>(const NDShape& shape, double mean, double stdDev, unsigned long seed, const DeviceDescriptor& device)
{
return NDArrayView::_RandomNormal<float16, half>(shape, mean, stdDev, seed, device);
}
template <typename ElementType, typename V1ElemType>
/*static*/ NDArrayViewPtr NDArrayView::_RandomNormal(const NDShape& shape, double mean, double stdDev, unsigned long seed, const DeviceDescriptor& device /*= DeviceDescriptor::UseDefaultDevice()*/)
{
auto matrixDims = GetMatrixDimensions(shape);
auto randomNormalMatrix = std::make_shared<Matrix<ElementType>>(Matrix<ElementType>::RandomGaussian(matrixDims.first, matrixDims.second, AsCNTKImplDeviceId(device), (ElementType)mean, (ElementType)stdDev, seed));
auto tensorView = new TensorView<ElementType>(randomNormalMatrix, AsTensorViewShape(shape));
auto randomNormalMatrix = std::make_shared<Matrix<V1ElemType>>(Matrix<V1ElemType>::RandomGaussian(matrixDims.first, matrixDims.second, AsCNTKImplDeviceId(device), (V1ElemType)mean, (V1ElemType)stdDev, seed));
auto tensorView = new TensorView<V1ElemType>(randomNormalMatrix, AsTensorViewShape(shape));
return MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), device, StorageFormat::Dense, shape, false, tensorView);
}
template <typename ElementType>
/*static*/ NDArrayViewPtr NDArrayView::RandomUniform(const NDShape& shape, double rangeBegin, double rangeEnd, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::UseDefaultDevice()*/)
/*static*/ NDArrayViewPtr NDArrayView::RandomUniform(const NDShape& shape, double rangeBegin, double rangeEnd, unsigned long seed, const DeviceDescriptor& device)
{
return NDArrayView::_RandomUniform<ElementType, ElementType>(shape, rangeBegin, rangeEnd, seed, device);
}
template <>
/*static*/ NDArrayViewPtr NDArrayView::RandomUniform<float16>(const NDShape& shape, double rangeBegin, double rangeEnd, unsigned long seed, const DeviceDescriptor& device)
{
return NDArrayView::_RandomUniform<float16, half>(shape, rangeBegin, rangeEnd, seed, device);
}
template <typename ElementType, typename V1ElemType>
/*static*/ NDArrayViewPtr NDArrayView::_RandomUniform(const NDShape& shape, double rangeBegin, double rangeEnd, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::UseDefaultDevice()*/)
{
auto matrixDims = GetMatrixDimensions(shape);
auto randomUniformMatrix = std::make_shared<Matrix<ElementType>>(Matrix<ElementType>::RandomUniform(matrixDims.first, matrixDims.second, AsCNTKImplDeviceId(device), (ElementType)rangeBegin, (ElementType)rangeEnd, seed));
auto tensorView = new TensorView<ElementType>(randomUniformMatrix, AsTensorViewShape(shape));
auto randomUniformMatrix = std::make_shared<Matrix<V1ElemType>>(Matrix<V1ElemType>::RandomUniform(matrixDims.first, matrixDims.second, AsCNTKImplDeviceId(device), (V1ElemType)rangeBegin, (V1ElemType)rangeEnd, seed));
auto tensorView = new TensorView<V1ElemType>(randomUniformMatrix, AsTensorViewShape(shape));
return MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), device, StorageFormat::Dense, shape, false, tensorView);
}
template <typename ElementType>
ElementType NDArrayView::AsScalar() const
{
return _AsScalar<ElementType, ElementType>();
}
template <>
float16 NDArrayView::AsScalar<float16>() const
{
return _AsScalar<float16, half>();
}
template <typename ElementType, typename V1ElemType>
ElementType NDArrayView::_AsScalar() const
{
auto scalarData = this->shared_from_this();
if (scalarData->Shape().TotalSize() != 1)
LogicError("NDArrayView::AsScalar: The NDArrayView shaped '%S' is not a scalar.", scalarData->Shape().AsString().c_str());
ElementType scalar = std::numeric_limits<ElementType>::quiet_NaN();
ElementType scalar = quiet_NaN<ElementType>();
std::shared_ptr<const NDArrayView> cpuData;
if (scalarData->Device() == DeviceDescriptor::CPUDevice())
cpuData = scalarData;
@ -635,6 +833,8 @@ namespace CNTK
scalar = *(cpuData->DataBuffer<float>());
else if (scalarData->GetDataType() == DataType::Double)
scalar = static_cast<ElementType>(*(cpuData->DataBuffer<double>()));
else if (scalarData->GetDataType() == DataType::Float16)
scalar = static_cast<ElementType>(*(cpuData->DataBuffer<float16>()));
else
LogicError("NDArrayView::AsScalar: Unsupported DataType");
@ -652,35 +852,44 @@ namespace CNTK
// Explicit template instantiations
template CNTK_API NDArrayViewPtr NDArrayView::RandomUniform<float>(const NDShape& shape, double rangeBegin, double rangeEnd, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::UseDefaultDevice()*/);
template CNTK_API NDArrayViewPtr NDArrayView::RandomUniform<double>(const NDShape& shape, double rangeBegin, double rangeEnd, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::UseDefaultDevice()*/);
template CNTK_API NDArrayViewPtr NDArrayView::RandomUniform<float16>(const NDShape& shape, double rangeBegin, double rangeEnd, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::UseDefaultDevice()*/);
template CNTK_API NDArrayViewPtr NDArrayView::RandomNormal<float>(const NDShape& shape, double mean, double stdDev, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::UseDefaultDevice()*/);
template CNTK_API NDArrayViewPtr NDArrayView::RandomNormal<double>(const NDShape& shape, double mean, double stdDev, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::UseDefaultDevice()*/);
template CNTK_API NDArrayViewPtr NDArrayView::RandomNormal<float16>(const NDShape& shape, double mean, double stdDev, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::UseDefaultDevice()*/);
template CNTK_API const float* NDArrayView::DataBuffer<float>() const;
template CNTK_API const double* NDArrayView::DataBuffer<double>() const;
template CNTK_API const float16* NDArrayView::DataBuffer<float16>() const;
template CNTK_API const TensorView<float>* NDArrayView::GetTensorView<float>() const;
template CNTK_API const TensorView<double>* NDArrayView::GetTensorView<double>() const;
template CNTK_API const TensorView<half>* NDArrayView::GetTensorView<half>() const;
template CNTK_API std::tuple<const float*, const SparseIndexType*, const SparseIndexType*, size_t> NDArrayView::SparseCSCDataBuffers<float>() const;
template CNTK_API std::tuple<const double*, const SparseIndexType*, const SparseIndexType*, size_t> NDArrayView::SparseCSCDataBuffers<double>() const;
template CNTK_API std::tuple<const float16*, const SparseIndexType*, const SparseIndexType*, size_t> NDArrayView::SparseCSCDataBuffers<float16>() const;
template CNTK_API std::tuple<const void*, const SparseIndexType*, const SparseIndexType*, size_t, size_t, size_t> NDArrayView::SparseBlockColumnDataBuffers<float>() const;
template CNTK_API std::tuple<const void*, const SparseIndexType*, const SparseIndexType*, size_t, size_t, size_t> NDArrayView::SparseBlockColumnDataBuffers<double>() const;
template CNTK_API std::tuple<const void*, const SparseIndexType*, const SparseIndexType*, size_t, size_t, size_t> NDArrayView::SparseBlockColumnDataBuffers<float16>() const;
template CNTK_API float* NDArrayView::WritableDataBuffer<float>();
template CNTK_API double* NDArrayView::WritableDataBuffer<double>();
template CNTK_API float16* NDArrayView::WritableDataBuffer<float16>();
template std::shared_ptr<const Matrix<float>> NDArrayView::GetMatrix(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/) const;
template std::shared_ptr<const Matrix<double>> NDArrayView::GetMatrix(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/) const;
template std::shared_ptr<const Matrix<half>> NDArrayView::GetMatrix(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/) const;
template std::shared_ptr<Matrix<float>> NDArrayView::GetWritableMatrix<float>(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/);
template std::shared_ptr<Matrix<double>> NDArrayView::GetWritableMatrix<double>(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/);
template std::shared_ptr<Matrix<half>> NDArrayView::GetWritableMatrix<half>(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/);
template TensorView<float>* NDArrayView::GetWritableTensorView<float>();
template TensorView<double>* NDArrayView::GetWritableTensorView<double>();
template TensorView<half>* NDArrayView::GetWritableTensorView<half>();
template CNTK_API NDArrayView::NDArrayView(const NDShape& viewShape, const SparseIndexType* colStarts, const SparseIndexType* rowIndices, const float* nonZeroValues, size_t numNonZeroValues, const DeviceDescriptor& device, bool readOnly/* = false*/);
template CNTK_API NDArrayView::NDArrayView(const NDShape& viewShape, const SparseIndexType* colStarts, const SparseIndexType* rowIndices, const double* nonZeroValues, size_t numNonZeroValues, const DeviceDescriptor& device, bool readOnly/* = false*/);
template float NDArrayView::AsScalar<float>() const;
template double NDArrayView::AsScalar<double>() const;
template float16 NDArrayView::AsScalar<float16>() const;
}
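These explicit instantiations make the float16 overloads available to callers. A hedged usage sketch of reading an FP16 scalar back through AsScalar (the value/dtype/shape/device constructor form appears later in this diff, in the Trainer changes):

    #include "CNTKLibrary.h"

    void ReadFp16Scalar()
    {
        using namespace CNTK;
        // Scalar FP16 view initialized from a double.
        auto scalarView = MakeSharedObject<NDArrayView>(
            2.5, DataType::Float16, NDShape{}, DeviceDescriptor::CPUDevice());

        float16 asHalf  = scalarView->AsScalar<float16>();  // FP16 read-back
        float   asFloat = scalarView->AsScalar<float>();    // widened from the FP16 buffer
        (void)asHalf; (void)asFloat;
    }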


@ -53,8 +53,14 @@ namespace CNTK
}
else
{
// batch normalization on FP16 requires 32-bit scale/bias/mean/variance, so specialize that case
bool batchNormSpecialCase =
(op == PrimitiveOpType::BatchNormalization) &&
(outputDataType == DataType::Float16) &&
(inputDataType == DataType::Float);
// The DataType of all operands should match except for Constants where we allow coercion
if ((inputDataType != DataType::Unknown) && (inputDataType != outputDataType) && !input.IsConstant())
if ((inputDataType != DataType::Unknown) && (inputDataType != outputDataType) && !input.IsConstant() && !batchNormSpecialCase)
InvalidArgument("Primitive op '%S' passed operands '%S' with different DataTypes '%s' and '%s'.",
PrimitiveOpTypeName(op).c_str(), NamedListString(inputs).c_str(), DataTypeName(outputDataType), DataTypeName(inputDataType));
}
@ -70,7 +76,18 @@ namespace CNTK
for (auto& input : inputs)
{
if ((input.GetDataType() == DataType::Unknown) && (input.IsConstant() || input.IsParameter()))
input.m_dataFields->m_dataType = outputDataType;
{
// batch normalization on FP16 requires 32-bit scale/bias/mean/variance, so specialize that case
if ((op == PrimitiveOpType::BatchNormalization) &&
(outputDataType == DataType::Float16))
{
input.m_dataFields->m_dataType = DataType::Float;
}
else
{
input.m_dataFields->m_dataType = outputDataType;
}
}
}
}
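Both changes above relax the operand type rules for BatchNormalization when the node computes in FP16, since its scale/bias/mean/variance operands stay in FP32. A standalone sketch of the relaxed check (assumed enum and function names, not the CNTK internals):

    enum class DType { Unknown, Float, Double, Float16 };
    enum class Op    { BatchNormalization, Times, Plus /* ... */ };

    // Returns true when an operand's type is acceptable for an op producing outputType.
    bool OperandTypeAllowed(Op op, DType outputType, DType inputType, bool inputIsConstant)
    {
        // FP16 batch normalization keeps its statistics and affine parameters in FP32,
        // so an FP32 operand feeding an FP16 BatchNormalization output is not a mismatch.
        bool batchNormSpecialCase = (op == Op::BatchNormalization)
                                 && (outputType == DType::Float16)
                                 && (inputType == DType::Float);
        return (inputType == DType::Unknown)
            || (inputType == outputType)
            || inputIsConstant
            || batchNormSpecialCase;
    }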
@ -235,6 +252,10 @@ namespace CNTK
else
{
DataType outputDataType = GetOutputDataType(m_op, m_inputs, true);
if (m_op == PrimitiveOpType::Cast)
outputDataType = static_cast<DataType>(m_attributes[PrimitiveFunction::AttributeNameNewDataType].Value<int>());
std::vector<Axis> outputDynamicAxes = GetOutputDynamicAxes(m_op, m_inputs, this, m_attributes);
bool needsGradient = std::any_of(m_inputs.begin(), m_inputs.end(), [](const Variable& input) { return input.NeedsGradient(); });
@ -333,6 +354,7 @@ namespace CNTK
case PrimitiveOpType::ELU:
case PrimitiveOpType::StableSigmoid:
case PrimitiveOpType::ConstantOp:
case PrimitiveOpType::Cast:
assert(m_inputs.size() == 1);
outputShape = UnaryElementwiseOpOutputShape(m_inputs[0].Shape());
break;


@ -116,6 +116,7 @@ namespace CNTK
{PrimitiveOpType::TopK, L"TopK"},
{PrimitiveOpType::ConstantOp, L"ConstantOp"},
{PrimitiveOpType::Squeeze, L"Squeeze"},
{PrimitiveOpType::Cast, L"Cast" },
};
inline const std::wstring& PrimitiveOpTypeName(PrimitiveOpType opType)


@ -100,6 +100,7 @@ namespace CNTK
Squeeze = 88,
ConstantOp = 89,
LatticeSequenceWithSoftmax = 90,
Cast = 91,
// New op types should only be appended to the end of this list
UnknownOP
// and UnknownOP should always be last.


@ -231,13 +231,17 @@ namespace CNTK
return DictionaryValue::Type(type);
}
template <typename T>
static void CopyData(const NDArrayView& src, RepeatedField<T>* dst)
template <typename SrcT, typename DstT=SrcT>
static void CopyData(const NDArrayView& src, RepeatedField<DstT>* dst)
{
auto size = src.Shape().TotalSize();
dst->Resize((int)size, T());
const T* buffer = src.DataBuffer<T>();
memcpy(dst->mutable_data(), buffer, (int)size * sizeof(T));
dst->Resize((int)size, DstT());
const SrcT* buffer = src.DataBuffer<SrcT>();
if (std::is_same<SrcT, DstT>::value)
memcpy(dst->mutable_data(), buffer, (int)size * sizeof(DstT));
else
for (size_t i = 0; i < size; i++)
dst->mutable_data()[i] = (DstT)buffer[i];
}
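The dual-type copy helper above keeps the memcpy fast path for same-type copies and converts element-wise otherwise, which is what lets FP16 tensors be serialized as float (CopyData&lt;float16, float&gt; on save and ReadData&lt;float, float16&gt; on load, as the branches further down show). A minimal standalone sketch of the same idea:

    #include <cstddef>
    #include <cstring>
    #include <type_traits>
    #include <vector>

    template <typename SrcT, typename DstT = SrcT>
    void CopyWithConversion(const SrcT* src, std::size_t count, std::vector<DstT>& dst)
    {
        dst.resize(count);
        if (std::is_same<SrcT, DstT>::value)
            std::memcpy(dst.data(), src, count * sizeof(DstT));  // identical types: raw copy
        else
            for (std::size_t i = 0; i < count; ++i)
                dst[i] = static_cast<DstT>(src[i]);              // e.g. half -> float when saving
    }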
template <typename T>
@ -250,32 +254,44 @@ namespace CNTK
{
auto value = buffer[i];
if (tSize <= sizeof(uint32))
output.WriteLittleEndian32(Encode<T, uint32>(value));
else
{
output.WriteLittleEndian32(Encode<T, uint32>((float)value));
}
else
{
output.WriteLittleEndian64(Encode<T, uint64>(value));
}
}
}
template <typename T>
template <typename SrcT, typename DstT = SrcT>
static bool ReadData(RenewableCodedStream& input, NDArrayView& dst)
{
auto size = dst.Shape().TotalSize();
T* buffer = dst.WritableDataBuffer<T>();
DstT* buffer = dst.WritableDataBuffer<DstT>();
for (auto i = 0; i < size; i++)
{
if (!input.Read<T>(buffer+i))
SrcT value;
if (!input.Read<SrcT>(&value))
return false;
buffer[i] = (DstT)value;
}
return true;
}
template <typename T>
static void CopyData(const RepeatedField<T>& src, NDArrayView* dst)
template <typename SrcT, typename DstT = SrcT>
static void CopyData(const RepeatedField<SrcT>& src, NDArrayView* dst)
{
auto size = src.size();
assert(size == dst->Shape().TotalSize());;
T* buffer = dst->WritableDataBuffer<T>();
memcpy(buffer, src.data(), size * sizeof(T));
DstT* buffer = dst->WritableDataBuffer<DstT>();
if (std::is_same<SrcT, DstT>::value)
memcpy(buffer, src.data(), size * sizeof(SrcT));
else
{
for (size_t i = 0; i < size; i++)
buffer[i] = (DstT)src.data()[i];
}
}
@ -312,6 +328,10 @@ namespace CNTK
{
CopyData<double>(src, dst->mutable_double_values()->mutable_value());
}
else if (src.GetDataType() == DataType::Float16)
{
CopyData<float16, float>(src, dst->mutable_float_values()->mutable_value());
}
}
}
@ -328,6 +348,10 @@ namespace CNTK
{
WriteData<double>(src, output);
}
else if (src.GetDataType() == DataType::Float16)
{
WriteData<float16>(src, output);
}
}
}
@ -350,6 +374,11 @@ namespace CNTK
if (!ReadData<double>(wrapper, dst))
return false;
}
else if (dst.GetDataType() == DataType::Float16)
{
if (!ReadData<float, float16>(wrapper, dst))
return false;
}
}
return true;
}
@ -446,6 +475,13 @@ namespace CNTK
else
m_arrayViews.push_back({ dst, nullptr });
}
else if(dataType == DataType::Float16)
{
if (src.float_values().value().size() == shape->TotalSize())
CopyData<float, float16>(src.float_values().value(), dst);
else
m_arrayViews.push_back({ dst, nullptr });
}
return dst;
}


@ -37,6 +37,7 @@ namespace CNTK
const std::wstring learningRateScheduleKey = L"learnig_rate_schedule";
const std::wstring smoothedGradientsKey = L"smoothed_gradients";
const std::wstring noiseInjectionSeedKey = L"noise_injection_seed";
const std::wstring masterParameterUpdatedKey = L"master_parameter_updated";
const std::wstring smoothedCountKey = L"smoothed_count";
const std::wstring stateKey = L"state";
const std::wstring rngSeedKey = L"rng_seed";


@ -51,6 +51,10 @@ namespace CNTK
combinedFunctionArgs = m_model->Outputs();
combinedFunctionArgs.push_back(m_lossFunction);
if (m_lossFunction->Output().GetDataType() == DataType::Float16)
fprintf(stderr, "WARNING: using Float16 for loss function may cause overflow, please cast to float");
if (!m_lossFunction->Output().DynamicAxes().empty())
{
m_aggregatedLossFunction = ReduceSum(lossFunction, Axis::AllAxes(), L"aggregateLoss");
@ -226,6 +230,9 @@ namespace CNTK
// Gradients are not existing.
for (const auto& parameter : m_learnerParameters)
gradients[parameter] = nullptr;
trainingLoss = MakeSharedObject<NDArrayView>(0, (m_aggregatedLossFunction ? m_aggregatedLossFunction->Output().GetDataType() : DataType::Float), NDShape{}, computeDevice);
evalCriterion = MakeSharedObject<NDArrayView>(0, (m_aggregatedEvaluationFunction ? m_aggregatedEvaluationFunction->Output().GetDataType() : DataType::Float), NDShape{}, computeDevice);
}
else
{
@ -363,8 +370,10 @@ namespace CNTK
if (m_aggregatedLossFunction->Output().GetDataType() == DataType::Float)
m_rootGradientValue->Data()->SetValue(1.0f);
else
else if (m_aggregatedLossFunction->Output().GetDataType() == DataType::Double)
m_rootGradientValue->Data()->SetValue(1.0);
else
m_rootGradientValue->Data()->SetValue(half(1.0));
for (const auto& parameter : m_learnerParameters)
parameterGradients[parameter] = nullptr;
@ -541,7 +550,7 @@ namespace CNTK
return m_parameterLearners->ParameterLearners().front()->TotalNumberOfSamplesSeen();
default:
//should not be here; whenever a new data unit is defined, there should be a new case in this function.
LogicError("Unsupported data unit: %d", unit);
LogicError("Unsupported data unit: %d", (int)unit);
}
}


@ -754,7 +754,7 @@ namespace CNTK
if (mask != nullptr)
Value::GetSequenceStartsAndLengths(mask, sequenceBeginIndices, sequenceLengths, numDynamicAxes);
bool hasTruncatedSequences = std::find_if(sequenceBeginIndices.begin(), sequenceBeginIndices.end(), [](const int& val) { return (val < 0); }) != sequenceBeginIndices.end();
bool hasTruncatedSequences = std::find_if(sequenceBeginIndices.begin(), sequenceBeginIndices.end(), [](const ptrdiff_t& val) { return (val < 0); }) != sequenceBeginIndices.end();
auto layout = std::make_shared<MBLayout>();
std::vector<std::pair<size_t, size_t>> placement;
@ -1178,12 +1178,15 @@ namespace CNTK
template std::pair<std::shared_ptr<const Matrix<float>>, MBLayoutPtr> Utils::GetCNTKImplMatrixAndMBLayoutFromValueObject<float>(const Variable& var, const ValuePtr& value, NDShape* inferredVarShape);
template std::pair<std::shared_ptr<const Matrix<double>>, MBLayoutPtr> Utils::GetCNTKImplMatrixAndMBLayoutFromValueObject<double>(const Variable& var, const ValuePtr& value, NDShape* inferredVarShape);
template std::pair<std::shared_ptr<const Matrix<half>>, MBLayoutPtr> Utils::GetCNTKImplMatrixAndMBLayoutFromValueObject<half>(const Variable& var, const ValuePtr& value, NDShape* inferredVarShape);
template ValuePtr Utils::GetValueObjectFromCNTKImplMatrixAndMBLayout<float>(const NDShape& sampleShape, const std::vector<Axis>& sampleDynamicAxes, const Matrix<float>& matrix, const MBLayoutPtr& layout, bool readOnly /*= true*/);
template ValuePtr Utils::GetValueObjectFromCNTKImplMatrixAndMBLayout<double>(const NDShape& sampleShape, const std::vector<Axis>& sampleDynamicAxes, const Matrix<double>& matrix, const MBLayoutPtr& layout, bool readOnly /*= true*/);
template ValuePtr Utils::GetValueObjectFromCNTKImplMatrixAndMBLayout<half>(const NDShape& sampleShape, const std::vector<Axis>& sampleDynamicAxes, const Matrix<half>& matrix, const MBLayoutPtr& layout, bool readOnly /*= true*/);
template ValuePtr Utils::GetValueObjectFromCNTKImplMatrixAndMBLayout<float>(const Variable& var, const ComputationNodeBasePtr& computationNode, const Matrix<float>& matrix, const MBLayoutPtr& layout, bool readOnly /*= true*/);
template ValuePtr Utils::GetValueObjectFromCNTKImplMatrixAndMBLayout<double>(const Variable& var, const ComputationNodeBasePtr& computationNode, const Matrix<double>& matrix, const MBLayoutPtr& layout, bool readOnly /*= true*/);
template ValuePtr Utils::GetValueObjectFromCNTKImplMatrixAndMBLayout<half>(const Variable& var, const ComputationNodeBasePtr& computationNode, const Matrix<half>& matrix, const MBLayoutPtr& layout, bool readOnly /*= true*/);
void Accumulator::Update(const ValuePtr& delta, const DeviceDescriptor& device)
{
@ -1206,8 +1209,10 @@ namespace CNTK
if (delta->GetDataType() == DataType::Float)
Data()->GetWritableTensorView<float>()->AddCopyOf(*delta->Data()->GetTensorView<float>());
else
else if(delta->GetDataType() == DataType::Double)
Data()->GetWritableTensorView<double>()->AddCopyOf(*delta->Data()->GetTensorView<double>());
else
RuntimeError("Unexpected data type in accumulator");
if (copied && m_numUpdates != 0)
RuntimeError("Accumulation values are created when accumulated num updates not zero");
@ -1228,8 +1233,10 @@ namespace CNTK
if (GetDataType() == DataType::Float)
Data()->SetValue(0.0f);
else
else if (GetDataType() == DataType::Double)
Data()->SetValue(0.0);
else
RuntimeError("Unsupported data type in Accumulator");
}
std::wstring DynamicAxesAsString(const std::vector<Axis>& axes, bool rowMajor)


@ -742,9 +742,18 @@ namespace CNTK
template <typename T> //T can be Variable or StreamInfo
static bool IsAtSweepEnd(const std::unordered_map<T, MinibatchData>& arguments)
{
if (arguments.empty()) return true;
return std::any_of(arguments.begin(), arguments.end(), [](const std::pair<const T, MinibatchData>& kv)
{
return kv.second.sweepEnd;
});
}
// half is V1 ElemType, so specialize here instead of in CNTKLibrary.h
template<>
inline DataType AsDataType<half>()
{
return DataType::Float16;
}
}


@ -108,7 +108,7 @@ namespace CNTK
}
colStarts[numCSCCols - 1] = (SparseIndexType)(nonZeroValues.size());
NDArrayViewPtr deviceValueData = MakeSharedObject<NDArrayView>(valueDataShape, colStarts.data(), rowIndices.data(), nonZeroValues.data(), nonZeroValues.size(), device, readOnly);
NDArrayViewPtr deviceValueData = MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), valueDataShape, colStarts.data(), rowIndices.data(), nonZeroValues.data(), nonZeroValues.size(), device, readOnly);
return MakeSharedObject<Value>(deviceValueData, deviceValueMask);
}
@ -229,18 +229,7 @@ namespace CNTK
auto totalNumNonZeroValues = nonZeroValues.size() / DataTypeSize(dataType);
colStarts.push_back(totalNumNonZeroValues);
switch (dataType)
{
case DataType::Float:
// TODO: In case of sparse we can directly create on target device
valueData = MakeSharedObject<NDArrayView>(valueDataShape, colStarts.data(), rowIndices.data(), (float*)nonZeroValues.data(), totalNumNonZeroValues, device, readOnly);
break;
case DataType::Double:
valueData = MakeSharedObject<NDArrayView>(valueDataShape, colStarts.data(), rowIndices.data(), (double*)nonZeroValues.data(), totalNumNonZeroValues, device, readOnly);
break;
default:
NOT_IMPLEMENTED;
}
valueData = MakeSharedObject<NDArrayView>(dataType, valueDataShape, colStarts.data(), rowIndices.data(), (void*)nonZeroValues.data(), totalNumNonZeroValues, device, readOnly);
}
else
{
@ -372,7 +361,7 @@ namespace CNTK
/*static*/ ValuePtr Value::CreateSequence(const NDShape& sampleShape, size_t sequenceLength, const SparseIndexType* colStarts, const SparseIndexType* rowIndices, const ElementType* nonZeroValues, size_t numNonZeroValues, bool sequenceStartFlag, const DeviceDescriptor& device, bool readOnly/* = false*/)
{
auto sequenceShape = sampleShape.AppendShape({sequenceLength});
auto sequenceData = MakeSharedObject<NDArrayView>(sequenceShape, colStarts, rowIndices, nonZeroValues, numNonZeroValues, device, readOnly);
auto sequenceData = MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), sequenceShape, colStarts, rowIndices, nonZeroValues, numNonZeroValues, device, readOnly);
return Create(sampleShape, {sequenceData}, {sequenceStartFlag}, device, readOnly, false);
}
@ -683,6 +672,9 @@ namespace CNTK
case DataType::Double:
valueObject = Utils::GetValueObjectFromCNTKImplMatrixAndMBLayout(m_sampleShape, m_sampleDynamicAxes, *(m_packedData->GetMatrix<double>()), m_packedDataLayout, m_isReadOnly);
break;
case DataType::Float16:
valueObject = Utils::GetValueObjectFromCNTKImplMatrixAndMBLayout(m_sampleShape, m_sampleDynamicAxes, *(m_packedData->GetMatrix<half>()), m_packedDataLayout, m_isReadOnly);
break;
default:
LogicError("Unsupported DataType %s", DataTypeName(dataType));
}
@ -728,7 +720,7 @@ namespace CNTK
bool found = false;
for (size_t i = 0; i < sampleSize; i++)
{
if (*currentp == 1)
if (*currentp == (ElementType)1)
{
if (found)
RuntimeError("CopyDenseToOneHot: Cannot convert to onehot vector; more than one non-zero value in the sample.");
@ -736,7 +728,7 @@ namespace CNTK
index = i;
found = true;
}
else if (*currentp != 0)
else if (*currentp != (ElementType)0)
RuntimeError("CopyDenseToOneHot: Cannot convert to onehot vector; contains value other than 0/1.");
currentp++;
@ -753,26 +745,38 @@ namespace CNTK
// Explicit template instantiations
template /*static*/ CNTK_API ValuePtr Value::Create<float>(const NDShape& sampleShape, const std::vector<std::vector<float>>& sequences, const std::vector<bool>& sequenceStartFlags, const DeviceDescriptor& device, bool readOnly/* = false*/);
template /*static*/ CNTK_API ValuePtr Value::Create<double>(const NDShape& sampleShape, const std::vector<std::vector<double>>& sequences, const std::vector<bool>& sequenceStartFlags, const DeviceDescriptor& device, bool readOnly/* = false*/);
template /*static*/ CNTK_API ValuePtr Value::Create<float16>(const NDShape& sampleShape, const std::vector<std::vector<float16>>& sequences, const std::vector<bool>& sequenceStartFlags, const DeviceDescriptor& device, bool readOnly/* = false*/);
template /*static*/ CNTK_API ValuePtr Value::Create<float>(const NDShape& sampleShape, const std::vector<std::vector<size_t>>& oneHotSequences, const std::vector<bool>& sequenceStartFlags, const DeviceDescriptor& device, bool readOnly/* = false*/);
template /*static*/ CNTK_API ValuePtr Value::Create<double>(const NDShape& sampleShape, const std::vector<std::vector<size_t>>& oneHotSequences, const std::vector<bool>& sequenceStartFlags, const DeviceDescriptor& device, bool readOnly/* = false*/);
template /*static*/ CNTK_API ValuePtr Value::Create<float16>(const NDShape& sampleShape, const std::vector<std::vector<size_t>>& oneHotSequences, const std::vector<bool>& sequenceStartFlags, const DeviceDescriptor& device, bool readOnly/* = false*/);
template /*static*/ CNTK_API ValuePtr Value::CreateBatch<float>(const NDShape& sampleShape, const std::vector<float>& batchData, const DeviceDescriptor& device, bool readOnly /*= false */);
template /*static*/ CNTK_API ValuePtr Value::CreateBatch<double>(const NDShape& sampleShape, const std::vector<double>& batchData, const DeviceDescriptor& device, bool readOnly /*= false */);
template /*static*/ CNTK_API ValuePtr Value::CreateBatch<float16>(const NDShape& sampleShape, const std::vector<float16>& batchData, const DeviceDescriptor& device, bool readOnly /*= false */);
template /*static*/ CNTK_API ValuePtr Value::CreateSequence<float>(const NDShape& sampleShape, const std::vector<float>& sequenceData, bool sequenceStartFlag, const DeviceDescriptor& device, bool readOnly /*= false */);
template /*static*/ CNTK_API ValuePtr Value::CreateSequence<double>(const NDShape& sampleShape, const std::vector<double>& sequenceData, bool sequenceStartFlag, const DeviceDescriptor& device, bool readOnly /*= false */);
template /*static*/ CNTK_API ValuePtr Value::CreateSequence<float16> (const NDShape& sampleShape, const std::vector<float16>& sequenceData, bool sequenceStartFlag, const DeviceDescriptor& device, bool readOnly /*= false */);
template /*static*/ CNTK_API ValuePtr Value::CreateBatch<float>(size_t dimension, const std::vector<size_t>& batchData, const DeviceDescriptor& device, bool readOnly/* = false*/);
template /*static*/ CNTK_API ValuePtr Value::CreateBatch<double>(size_t dimension, const std::vector<size_t>& batchData, const DeviceDescriptor& device, bool readOnly/* = false*/);
template /*static*/ CNTK_API ValuePtr Value::CreateBatch<float16> (size_t dimension, const std::vector<size_t>& batchData, const DeviceDescriptor& device, bool readOnly/* = false*/);
template /*static*/ CNTK_API ValuePtr Value::CreateSequence<float>(size_t dimension, const std::vector<size_t>& sequenceData, bool sequenceStartFlag, const DeviceDescriptor& device, bool readOnly/* = false*/);
template /*static*/ CNTK_API ValuePtr Value::CreateSequence<double>(size_t dimension, const std::vector<size_t>& sequenceData, bool sequenceStartFlag, const DeviceDescriptor& device, bool readOnly/* = false*/);
template /*static*/ CNTK_API ValuePtr Value::CreateSequence<float16>(size_t dimension, const std::vector<size_t>& sequenceData, bool sequenceStartFlag, const DeviceDescriptor& device, bool readOnly/* = false*/);
template /*static*/ CNTK_API ValuePtr Value::CreateSequence<float>(const NDShape& sampleShape, size_t sequenceLength, const SparseIndexType* colStarts, const SparseIndexType* rowIndices, const float* nonZeroValues, size_t numNonZeroValues, bool sequenceStartFlag, const DeviceDescriptor& device, bool readOnly/* = false*/);
template /*static*/ CNTK_API ValuePtr Value::CreateSequence<double>(const NDShape& sampleShape, size_t sequenceLength, const SparseIndexType* colStarts, const SparseIndexType* rowIndices, const double* nonZeroValues, size_t numNonZeroValues, bool sequenceStartFlag, const DeviceDescriptor& device, bool readOnly/* = false*/);
template /*static*/ CNTK_API ValuePtr Value::CreateSequence<float16>(const NDShape& sampleShape, size_t sequenceLength, const SparseIndexType* colStarts, const SparseIndexType* rowIndices, const float16* nonZeroValues, size_t numNonZeroValues, bool sequenceStartFlag, const DeviceDescriptor& device, bool readOnly/* = false*/);
template CNTK_API void Value::CopyVariableValueToVector<float>(const Variable& outputVariable, std::vector<std::vector<float>>& sequences);
template CNTK_API void Value::CopyVariableValueToVector<double>(const Variable& outputVariable, std::vector<std::vector<double>>& sequences);
template CNTK_API void Value::CopyVariableValueToVector<float16>(const Variable& outputVariable, std::vector<std::vector<float16>>& sequences);
template CNTK_API void Value::CopyVariableValueToVector<float>(const Variable& outputVariable, std::vector<std::vector<size_t>>& sequences);
template CNTK_API void Value::CopyVariableValueToVector<double>(const Variable& outputVariable, std::vector<std::vector<size_t>>& sequences);
template CNTK_API void Value::CopyVariableValueToVector<float16>(const Variable& outputVariable, std::vector<std::vector<size_t>>& sequences);
template CNTK_API std::tuple<size_t, size_t, size_t> Value::ValidateSparseCSCAndGetIndexBufferSizes<float>(const Variable& outputVariable);
template CNTK_API std::tuple<size_t, size_t, size_t> Value::ValidateSparseCSCAndGetIndexBufferSizes<double>(const Variable& outputVariable);
template CNTK_API std::tuple<size_t, size_t, size_t> Value::ValidateSparseCSCAndGetIndexBufferSizes<float16>(const Variable& outputVariable);
template CNTK_API void Value::CopyVariableValueToCSCSparse<float>(size_t sequenceLength, std::vector<SparseIndexType>& colStarts, std::vector<SparseIndexType>& rowIndices, std::vector<float>& nonZeroValues, size_t& numNonZeroValues);
template CNTK_API void Value::CopyVariableValueToCSCSparse<double>(size_t sequenceLength, std::vector<SparseIndexType>& colStarts, std::vector<SparseIndexType>& rowIndices, std::vector<double>& nonZeroValues, size_t& numNonZeroValues);
template CNTK_API void Value::CopyVariableValueToCSCSparse<float16>(size_t sequenceLength, std::vector<SparseIndexType>& colStarts, std::vector<SparseIndexType>& rowIndices, std::vector<float16>& nonZeroValues, size_t& numNonZeroValues);
template float Value::AsScalar<float>() const;
template double Value::AsScalar<double>() const;
template float16 Value::AsScalar<float16>() const;
}


@ -135,6 +135,11 @@ namespace CNTK
m_dataFields->m_value = CreateValueFromParameterInitializer<double>(Shape(), *m_dataFields->m_valueInitializer, *m_dataFields->m_valueInitializationDevice);
break;
}
case DataType::Float16:
{
m_dataFields->m_value = CreateValueFromParameterInitializer<half>(Shape(), *m_dataFields->m_valueInitializer, *m_dataFields->m_valueInitializationDevice);
break;
}
default:
LogicError("Variable '%S' Value(): Unsupported DataType %s", AsString().c_str(), DataTypeName(GetDataType()));
break;
@ -496,7 +501,8 @@ namespace CNTK
DataType dataType = DataType(dict[dataTypeKey].Value<std::size_t>());
if (dataType != DataType::Unknown &&
dataType != DataType::Float &&
dataType != DataType::Double)
dataType != DataType::Double &&
dataType != DataType::Float16)
{
LogicError("Unexpected variable datatype '%ls':'%u' (%s).",
dataTypeKey.c_str(),


@ -23,6 +23,7 @@ message NDArrayView {
Unknown = 0;
Float = 1;
Double = 2;
Float16 = 4;
}
enum StorageFormat {


@ -1497,7 +1497,7 @@ FunctionPtr ONNXToCNTKHelper::CreateFunction(const Node *node, const std::vector
CNTK::PaddingMode cntkPaddingMode;
double cntkConstantValue = 0.0;
auto mode = GetNamedAttributeAsString(node, "mode", "constant");
std::transform(mode.begin(), mode.end(), mode.begin(), ::tolower);
std::transform(mode.begin(), mode.end(), mode.begin(), [](char v) { return (char)::tolower(v); });
if (mode == "constant")
cntkPaddingMode = CNTK::PaddingMode::CONSTANTPAD;
else if (mode == "reflect")
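
Wrapping ::tolower in a lambda that casts the int result back to char keeps the conversion explicit (avoiding narrowing-conversion warnings on some compilers); behaviour is unchanged for the ASCII attribute strings involved. A minimal standalone version of the lowercasing step, assuming an ASCII mode string:

    #include <algorithm>
    #include <cctype>
    #include <iostream>
    #include <string>

    int main()
    {
        std::string mode = "Constant";
        std::transform(mode.begin(), mode.end(), mode.begin(),
                       [](char v) { return (char)::tolower(v); }); // tolower returns int; cast it back to char
        std::cout << mode << "\n"; // prints "constant"
        return 0;
    }
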


@ -1,5 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>


@ -34,6 +34,7 @@
#define WRITE_BUFFER_SIZE (1024 * 1024)
#include <boost/algorithm/string.hpp>
#include "half.hpp"
namespace Microsoft { namespace MSR { namespace CNTK {
@ -978,9 +979,11 @@ template <class ElemType>
template vector<float> File::LoadMatrixFromTextFile<float> (const std::wstring& filePath, size_t& /*out*/ numRows, size_t& /*out*/ numCols);
template vector<double> File::LoadMatrixFromTextFile<double>(const std::wstring& filePath, size_t& /*out*/ numRows, size_t& /*out*/ numCols);
template vector<half> File::LoadMatrixFromTextFile<half>(const std::wstring& filePath, size_t& /*out*/ numRows, size_t& /*out*/ numCols);
template vector<float> File::LoadMatrixFromStringLiteral<float> (const std::string& literal, size_t& /*out*/ numRows, size_t& /*out*/ numCols);
template vector<double> File::LoadMatrixFromStringLiteral<double>(const std::string& literal, size_t& /*out*/ numRows, size_t& /*out*/ numCols);
template vector<half> File::LoadMatrixFromStringLiteral<half>(const std::string& literal, size_t& /*out*/ numRows, size_t& /*out*/ numCols);
#ifndef CNTK_COMPONENT_VERSION
#error CNTK_COMPONENT_VERSION must be set


@ -296,7 +296,7 @@ struct utf8 : std::string
} // empty string
std::vector<char> buf(3 * len + 1); // max: 1 wchar => up to 3 mb chars
// ... TODO: this fill() should be unnecessary (a 0 is appended)--but verify
std::fill(buf.begin(), buf.end(), 0);
std::fill(buf.begin(), buf.end(), (char)0);
int rc = WideCharToMultiByte(CP_UTF8, 0, p.c_str(), (int) len,
&buf[0], (int) buf.size(), NULL, NULL);
if (rc == 0)
@ -335,7 +335,7 @@ static inline std::string wcstombs(const std::wstring& p) // output: MBCS
{
size_t len = p.length();
std::vector<char> buf(2 * len + 1); // max: 1 wchar => 2 mb chars
std::fill(buf.begin(), buf.end(), 0);
std::fill(buf.begin(), buf.end(), (char)0);
::wcstombs(&buf[0], p.c_str(), 2 * len + 1);
return std::string(&buf[0]);
}


@ -159,12 +159,10 @@ public:
template <typename T>
File& operator<<(T val)
{
{
if (IsTextBased())
fputText(m_file, val);
else
fput(m_file, val);
}
if (IsTextBased())
fputText(m_file, val);
else
fput(m_file, val);
return *this;
}
File& operator<<(const std::wstring& val);


@ -1158,8 +1158,10 @@ public:
void release(bool cpumode);
void setloglls(const Microsoft::MSR::CNTK::Matrix<float>& loglls);
void setloglls(const Microsoft::MSR::CNTK::Matrix<double>& loglls);
void setloglls(const Microsoft::MSR::CNTK::Matrix<half>& loglls);
void getgamma(Microsoft::MSR::CNTK::Matrix<float>& loglls);
void getgamma(Microsoft::MSR::CNTK::Matrix<double>& loglls);
void getgamma(Microsoft::MSR::CNTK::Matrix<half>& loglls);
};
// forward-backward function


@ -2099,7 +2099,7 @@ static inline std::string wcstombs(const std::wstring& p) // output: MBCS
{
size_t len = p.length();
vector<char> buf(2 * len + 1); // max: 1 wchar => 2 mb chars
fill(buf.begin(), buf.end(), 0);
fill(buf.begin(), buf.end(), (char)0);
::wcstombs(&buf[0], p.c_str(), 2 * len + 1);
return std::string(&buf[0]);
}


@ -47,6 +47,12 @@ vector<MemRequestInfo<double>>& MatrixPool::GetMemRequestInfoVec<double>()
return m_memRequestInfoDoubleVec;
}
template <>
vector<MemRequestInfo<half>>& MatrixPool::GetMemRequestInfoVec<half>()
{
return m_memRequestInfoHalfVec;
}
// -----------------------------------------------------------------------
// construction
// -----------------------------------------------------------------------
@ -129,6 +135,8 @@ void ComputationNetwork::SaveToFileImpl(const wstring& fileName, const FileOptio
precision = ElemTypeName<float>();
else if (nodePtr->Is<ComputationNode<double>>())
precision = ElemTypeName<double>();
else if (nodePtr->Is<ComputationNode<half>>())
precision = ElemTypeName<half>();
else LogicError("Unexpected node type.");
fstream << precision;
#endif
@ -229,7 +237,7 @@ void ComputationNetwork::ReadPersistableParameters(size_t modelVersion, File& fs
{
wstring precision;
if (modelVersion >= CNTK_MODEL_VERSION_7)
fstream >> precision; // "float" or "double"; default is "" meaning <ElemType> as passed in from outside
fstream >> precision; // "float" or "double" or "half"; default is "" meaning <ElemType> as passed in from outside
wstring opName, nodeName;
fstream >> opName >> nodeName;
@ -241,6 +249,8 @@ void ComputationNetwork::ReadPersistableParameters(size_t modelVersion, File& fs
node = ComputationNetworkBuilder<float>::NewNode(opName, m_deviceId, nodeName);
else if (precision == L"double")
node = ComputationNetworkBuilder<double>::NewNode(opName, m_deviceId, nodeName);
else if (precision == L"half")
node = ComputationNetworkBuilder<half>::NewNode(opName, m_deviceId, nodeName);
else if (precision == L"") // old file format: default to <ElemType>
node = ComputationNetworkBuilder<ElemType>::NewNode(opName, m_deviceId, nodeName);
else
@ -434,9 +444,10 @@ void ComputationNetwork::InitLearnableParameters(const ComputationNodeBasePtr& n
{
randomSeed += GetRandomSeedOffset();
if (TryPostInitParameters<float> (node, initString, initValue, randomSeed, initOnCPUOnly) ||
TryPostInitParameters<double>(node, initString, initValue, randomSeed, initOnCPUOnly))
TryPostInitParameters<double>(node, initString, initValue, randomSeed, initOnCPUOnly) ||
TryPostInitParameters<half> (node, initString, initValue, randomSeed, initOnCPUOnly))
return;
LogicError("InitLearnableParameters: Input node is not a LearnableParameter<float or double>");
LogicError("InitLearnableParameters: Input node is not a LearnableParameter<float or double or half>");
}
// non-static version needed because it accesses m_randomSeedOffset
@ -666,6 +677,9 @@ void ComputationNetwork::SetSeqParam(ComputationNetworkPtr net,
auto noded = dynamic_pointer_cast<ConvolutionNode<double>>(*nodeIter);
if (noded)
noded->SetmMaxTempMemSizeInSamples(maxTempMemSizeInSamples);
auto nodeh = dynamic_pointer_cast<ConvolutionNode<half>>(*nodeIter);
if (nodeh)
nodeh->SetmMaxTempMemSizeInSamples(maxTempMemSizeInSamples);
}
}
}
@ -1233,7 +1247,7 @@ void ComputationNetwork::SaveToDbnFile(ComputationNetworkPtr net, const std::wst
auto GetAllPriorNodes = [](ComputationNodeBasePtr node)->bool
{
std::wstring lowerName = node->GetName();
std::transform(lowerName.begin(), lowerName.end(), lowerName.begin(), ::tolower);
std::transform(lowerName.begin(), lowerName.end(), lowerName.begin(), [](wchar_t v) { return (wchar_t)::tolower(v); });
return node->OperationName() == OperationNameOf(LearnableParameter) && (lowerName.find(L"prior") != wstring::npos);
};
@ -1536,6 +1550,15 @@ template void ComputationNetwork::SetSeqParam<double>(ComputationNetworkPtr net,
const double& amf, const double& lmf, const double& wp, const double& bMMIfactor, const bool& sMBR);
template void ComputationNetwork::SaveToDbnFile<double>(ComputationNetworkPtr net, const std::wstring& fileName) const;
template void ComputationNetwork::InitLearnableParametersWithBilinearFill<half>(const ComputationNodeBasePtr& node, size_t kernelWidth, size_t kernelHeight);
template void ComputationNetwork::Read<half>(const wstring& fileName);
template void ComputationNetwork::ReadPersistableParameters<half>(size_t modelVersion, File& fstream, bool create);
template void ComputationNetwork::PerformSVDecomposition<half>(const map<wstring, float>& SVDConfig, size_t alignedsize);
template /*static*/ void ComputationNetwork::SetBatchNormalizationTimeConstants<half>(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double normalizationTimeConstant, double& prevNormalizationTimeConstant, double blendTimeConstant, double& prevBlendTimeConstant);
template void ComputationNetwork::SetSeqParam<half>(ComputationNetworkPtr net, const ComputationNodeBasePtr criterionNode, const double& hsmoothingWeight, const double& frameDropThresh, const bool& doreferencealign,
const double& amf, const double& lmf, const double& wp, const double& bMMIfactor, const bool& sMBR);
template void ComputationNetwork::SaveToDbnFile<half>(ComputationNetworkPtr net, const std::wstring& fileName) const;
// register ComputationNetwork with the ScriptableObject system
ScriptableObjects::ConfigurableRuntimeTypeRegister::Add<ComputationNetwork> registerComputationNetwork(L"ComputationNetwork");
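
ReadPersistableParameters above turns the serialized precision tag ("float", "double", "half", or empty for the old format) into the matching ComputationNetworkBuilder instantiation. A minimal sketch of that kind of string-keyed factory dispatch (toy types only, not the CNTK classes):

    #include <functional>
    #include <map>
    #include <memory>
    #include <stdexcept>
    #include <string>

    struct NodeBase { virtual ~NodeBase() = default; };
    template <typename ElemType> struct Node : NodeBase { };
    struct half16 { unsigned short bits; }; // stand-in for a 16-bit float element type

    // Map a precision tag read from the model file to a factory for the matching node type.
    inline std::shared_ptr<NodeBase> NewNodeForPrecision(const std::wstring& precision)
    {
        static const std::map<std::wstring, std::function<std::shared_ptr<NodeBase>()>> factories =
        {
            { L"float",  [] { return std::make_shared<Node<float>>();  } },
            { L"double", [] { return std::make_shared<Node<double>>(); } },
            { L"half",   [] { return std::make_shared<Node<half16>>(); } },
        };
        auto it = factories.find(precision);
        if (it == factories.end())
            throw std::invalid_argument("unknown precision tag");
        return it->second();
    }
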


@ -1309,10 +1309,12 @@ typedef ComputationNetwork::ComputationNetworkPtr ComputationNetworkPtr;
template <typename ElemType> static inline const wchar_t* ElemTypeName();
template <> /*static*/ inline const wchar_t* ElemTypeName<float>() { return L"float"; }
template <> /*static*/ inline const wchar_t* ElemTypeName<double>() { return L"double"; }
template <> /*static*/ inline const wchar_t* ElemTypeName<half>() { return L"half"; }
// The following emits the class and enables the BaseMatrix<double> to be available (used by EvalDll)
// The corresponding Matrix<float> is emitted in the SetDeviceId function above.
template class Matrix<double>;
template class Matrix<half>;
// TODOs:
// - automatic inference of time window w.r.t. delay nodes (and related nodes such as a temporal pooling)


@ -226,9 +226,10 @@ shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Creat
// this is used in V2
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateLearnableParameter(const std::wstring& paramName, const TensorShape& tensorShape)
template <class ValueType>
shared_ptr<ComputationNode<ValueType>> ComputationNetworkBuilder<ElemType>::TypedCreateLearnableParameter(const std::wstring& paramName, const TensorShape& tensorShape)
{
return net.AddNodeToNetWithElemType(New<LearnableParameter<ElemType>>(net.GetDeviceId(), paramName, tensorShape));
return net.AddNodeToNetWithElemType(New<LearnableParameter<ValueType>>(net.GetDeviceId(), paramName, tensorShape));
}
// TODO: change these to take an actual object instead of a name for dynamicAxis
@ -964,5 +965,17 @@ shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Batch
template class ComputationNetworkBuilder<float>;
template class ComputationNetworkBuilder<double>;
template class ComputationNetworkBuilder<half>;
// V2 allows mixed precision
template shared_ptr<ComputationNode<float>> ComputationNetworkBuilder<float>::TypedCreateLearnableParameter<float>(const std::wstring& paramName, const TensorShape& tensorShape);
template shared_ptr<ComputationNode<double>> ComputationNetworkBuilder<float>::TypedCreateLearnableParameter<double>(const std::wstring& paramName, const TensorShape& tensorShape);
template shared_ptr<ComputationNode<half>> ComputationNetworkBuilder<float>::TypedCreateLearnableParameter<half>(const std::wstring& paramName, const TensorShape& tensorShape);
template shared_ptr<ComputationNode<float>> ComputationNetworkBuilder<double>::TypedCreateLearnableParameter<float>(const std::wstring& paramName, const TensorShape& tensorShape);
template shared_ptr<ComputationNode<double>> ComputationNetworkBuilder<double>::TypedCreateLearnableParameter<double>(const std::wstring& paramName, const TensorShape& tensorShape);
template shared_ptr<ComputationNode<half>> ComputationNetworkBuilder<double>::TypedCreateLearnableParameter<half>(const std::wstring& paramName, const TensorShape& tensorShape);
template shared_ptr<ComputationNode<float>> ComputationNetworkBuilder<half>::TypedCreateLearnableParameter<float>(const std::wstring& paramName, const TensorShape& tensorShape);
template shared_ptr<ComputationNode<double>> ComputationNetworkBuilder<half>::TypedCreateLearnableParameter<double>(const std::wstring& paramName, const TensorShape& tensorShape);
template shared_ptr<ComputationNode<half>> ComputationNetworkBuilder<half>::TypedCreateLearnableParameter<half>(const std::wstring& paramName, const TensorShape& tensorShape);
}}}
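
The explicit instantiations above let a builder of one element type create LearnableParameter nodes of another, which is what allows a 16-bit graph to keep selected parameters in 32-bit. A toy sketch of the same shape, a class template with a member factory templated on a second type (hypothetical classes, not the CNTK API):

    #include <iostream>
    #include <memory>
    #include <typeinfo>

    template <typename ValueType>
    struct Parameter
    {
        void Describe() const { std::cout << "Parameter of " << typeid(ValueType).name() << "\n"; }
    };

    template <typename ElemType> // the precision the builder defaults to
    struct Builder
    {
        // Default creation uses the builder's own element type...
        std::shared_ptr<Parameter<ElemType>> CreateParameter() const
        {
            return this->template TypedCreateParameter<ElemType>();
        }
        // ...while the typed overload may produce any precision, enabling mixed-precision graphs.
        template <typename ValueType>
        std::shared_ptr<Parameter<ValueType>> TypedCreateParameter() const
        {
            return std::make_shared<Parameter<ValueType>>();
        }
    };

    int main()
    {
        Builder<float> builder;                             // imagine ElemType == half here
        builder.CreateParameter()->Describe();              // same precision as the builder
        builder.TypedCreateParameter<double>()->Describe(); // different precision on request
        return 0;
    }
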


@ -45,7 +45,14 @@ public:
// TODO: separate into nodes that have inputs and those that duplicate functions with input adding except just not adding inputs. Clear?
ComputationNodePtr CreateLearnableParameter(const std::wstring& paramName, const size_t rows, const size_t cols);
ComputationNodePtr CreateLearnableParameter(const std::wstring& paramName, const TensorShape& tensorShape); // V2
shared_ptr<ComputationNode<ElemType>> CreateLearnableParameter(const std::wstring& paramName, const TensorShape& tensorShape)
{
return this->template TypedCreateLearnableParameter<ElemType>(paramName, tensorShape);
}
template<class ValueType>
shared_ptr<ComputationNode<ValueType>> TypedCreateLearnableParameter(const std::wstring& paramName, const TensorShape& tensorShape); // V2
// sparse matrix size is optionally specified
// ComputationNodePtr CreateSparseLearnableParameter(const std::wstring & paramName, const size_t rows, const size_t cols, const size_t size = 0);
ComputationNodePtr CreateInputNode(const std::wstring& inputName, const size_t rows, const wstring& dynamicAxisName = L"");


@ -111,7 +111,7 @@ ComputationNodeBasePtr ComputationNetwork::GetNestedNetwork(const ComputationNod
// concurrent computation in bulk CUDA launches.
// -----------------------------------------------------------------------
template<class ElemType> static bool DumpNode(ComputationNodeBasePtr nodep, bool dumpGradient);
static bool DumpNode(ComputationNodeBasePtr nodep, bool dumpGradient);
ComputationNetwork::PARTraversalFlowControlNode::PARTraversalFlowControlNode(const std::vector<shared_ptr<SEQTraversalFlowControlNode>>& recurrentInfo, const std::list<ComputationNodeBasePtr>& allNodes /*must be in eval order*/)
{
@ -152,7 +152,7 @@ ComputationNetwork::PARTraversalFlowControlNode::PARTraversalFlowControlNode(con
// Extreme Tracing, part 1/4
if (node->HasEnvironmentPtr() && node->Environment().ShouldDumpNode())
DumpNode<float>(node, /*dumpGradient=*/false) || DumpNode<double>(node, false);
DumpNode(node, /*dumpGradient=*/false);
}
}
@ -188,7 +188,7 @@ ComputationNetwork::PARTraversalFlowControlNode::PARTraversalFlowControlNode(con
// Extreme Tracing, part 2/4
if (node->HasEnvironmentPtr() && node->Environment().ShouldDumpNode() && node->NeedsGradient())
DumpNode<float>(node, /*dumpGradient=*/true) || DumpNode<double>(node, true);
DumpNode(node, /*dumpGradient=*/true);
}
}
/*virtual*/ void ComputationNetwork::PARTraversalFlowControlNode::RequestMatricesBeforeForwardProp(MatrixPool& matrixPool) /*override*/
@ -207,26 +207,36 @@ ComputationNetwork::PARTraversalFlowControlNode::PARTraversalFlowControlNode(con
{
}
// helper for logging. Returns false if it was not able to dynamic-cast nodep to ComputationNode<ElemType>
template<class ElemType>
static bool DumpNode(ComputationNodeBasePtr nodep, bool dumpGradient)
template<typename ElemType>
bool TypedDumpNode(shared_ptr<ComputationNode<ElemType>> node, bool dumpGradient)
{
let node = dynamic_pointer_cast<ComputationNode<ElemType>>(nodep);
if (!node)
return false;
let dataPtr = dumpGradient ? node->GradientPtr() : node->ValuePtr();
if (!dataPtr)
return true; // e.g. SEQ sentinel node
bool concise = !(nodep->Environment().IsLogLevelNodeTrace());
bool concise = !(node->Environment().IsLogLevelNodeTrace());
fprintf(stderr, "Dump --> %s%s\n", node->FormatOperationPrototype("").c_str(), dumpGradient ? " Grad" : "");
node->WriteMinibatchWithFormatting(stderr, FrameRange(), SIZE_MAX, SIZE_MAX, false/*transpose*/, /*isCategoryLabel=*/false, /*isSparse=*/false, std::vector<std::string>(),
""/*sequenceSeparator*/, " "/*sequencePrologue*/, "\n"/*sequenceEpilogue*/, " "/*elementSeparator*/, "\n "/*sampleSeparator*/,
"%13.10f"/*valueFormatString*/, dumpGradient, concise);
""/*sequenceSeparator*/, " "/*sequencePrologue*/, "\n"/*sequenceEpilogue*/, " "/*elementSeparator*/, "\n "/*sampleSeparator*/,
"%13.10f"/*valueFormatString*/, dumpGradient, concise);
return true;
}
// helper for logging. Returns false if it was not able to dump
static bool DumpNode(ComputationNodeBasePtr nodep, bool dumpGradient)
{
let nodef = dynamic_pointer_cast<ComputationNode<float>>(nodep);
if (nodef) return TypedDumpNode<float>(nodef, dumpGradient);
let noded = dynamic_pointer_cast<ComputationNode<double>>(nodep);
if (noded) return TypedDumpNode<double>(noded, dumpGradient);
let nodeh = dynamic_pointer_cast<ComputationNode<half>>(nodep);
if (nodeh) return TypedDumpNode<half>(nodeh, dumpGradient);
return false;
}
// -----------------------------------------------------------------------
// SEQTraversalFlowControlNode methods -- implements SEQ traversal (loop unrolling)
//
@ -280,7 +290,7 @@ static bool DumpNode(ComputationNodeBasePtr nodep, bool dumpGradient)
{
if (node->HasEnvironmentPtr() && node->Environment().ShouldDumpNode())
{
DumpNode<float>(node, /*dumpGradient=*/false) || DumpNode<double>(node, false);
DumpNode(node, /*dumpGradient=*/false);
}
}
}
@ -321,7 +331,7 @@ static bool DumpNode(ComputationNodeBasePtr nodep, bool dumpGradient)
{
if (node->HasEnvironmentPtr() && node->Environment().ShouldDumpNode() && node->NeedsGradient())
{
DumpNode<float>(node, /*dumpGradient=*/true) || DumpNode<double>(node, true);
DumpNode(node, /*dumpGradient=*/true);
}
}
}


@ -1,5 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" InitialTargets="CheckDependencies" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<Project DefaultTargets="Build" InitialTargets="CheckDependencies" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>


@ -16,7 +16,7 @@
namespace Microsoft { namespace MSR { namespace CNTK {
using namespace std;
using namespace std;
// -----------------------------------------------------------------------
// subroutines for evaluation
@ -85,18 +85,18 @@ void ComputationNode<ElemType>::Backprop(const FrameRange& fr, bool childrenInTh
for (size_t i = 0; i < m_inputs.size(); i++)
{
ComputationNodePtr child = Input(i);
if (child->m_needsGradient &&
ComputationNodeBasePtr child = m_inputs[i];
if (child->NeedsGradient() &&
((childrenInThisLoop && child->IsPartOfLoop() == IsPartOfLoop()) ||
(childrenInOuterLoop && child->IsPartOfLoop() != IsPartOfLoop()) ))
{
// fprintf(stderr, "Backprop: %ls %ls operation -> child %d %ls %ls\n", NodeName().c_str(), OperationName().c_str(), (int)i, child->NodeName().c_str(), child->OperationName().c_str());
if (!m_needsGradient)
if (!NeedsGradient())
LogicError("%ls %ls operation has m_needsGradient set to false but children require it.", NodeName().c_str(), OperationName().c_str());
#if DUMPOUTPUT
fprintf(stderr, "Backprop%d_%ls\n", i, NodeName().c_str());
#endif
child->LazyZeroGradient(this); // set gradient to 0 if this is the first time
SMART_NODE_INVOKE(ComputationNode, child, LazyZeroGradient, this); // set gradient to 0 if this is the first time
// If we propagate from a loop to a node that is outside the loop, we are not efficient.
// This case is handled by SEQTraversalFlowControlNode::Backprop().
@ -108,7 +108,7 @@ void ComputationNode<ElemType>::Backprop(const FrameRange& fr, bool childrenInTh
}
// before backprop, verify gradient optimization info
Input(i)->VerifyGradientOptimization(this);
SMART_NODE_INVOKE(ComputationNode, child, VerifyGradientOptimization, this);
// fprintf(stderr, "BackpropTo %d %d %ls %ls\n", (int)fr.timeIdxInSeq, (int)i, NodeName().c_str(), OperationName().c_str());
BackpropTo(i, fr); // this computes partial wrt to the child and sums the gradient value in the child
@ -753,11 +753,14 @@ template <class ElemType>
{
for (size_t i = 0; i < m_inputs.size(); i++)
{
ComputationNodePtr child = Input(i);
if (child->m_needsGradient)
ComputationNodeBasePtr child = m_inputs[i];
if (child->NeedsGradient())
{
child->MaskMissingGradientColumnsToZero(FrameRange(child->GetMBLayout())); // HasNaN() operates on a whole matrix, so first flatten all gaps to 0
if (child->Gradient().HasNan("EndBackprop"))
SMART_NODE_INVOKE(ComputationNode, child, MaskMissingGradientColumnsToZero, FrameRange(child->GetMBLayout())); // HasNaN() operates on a whole matrix, so first flatten all gaps to 0
bool hasNan = false;
SMART_NODE_INVOKE_WITH_RET(ComputationNode, child, Gradient().HasNan, hasNan, "EndBackprop");
if (hasNan)
{
LogicError("%ls %ls operation unexpectedly produced NaN gradients on its input %ls.", NodeName().c_str(), OperationName().c_str(), child->NodeName().c_str());
}
@ -1004,7 +1007,7 @@ void ComputationNode<ElemType>::WriteMinibatchWithFormatting(FILE* f,
double absSumLocal = 0;
for (size_t j = 0; j < jend; j++) // loop over elements
{
absSumLocal += abs(seqData[i * istride + j * jstride]);
absSumLocal += (double)abs(seqData[i * istride + j * jstride]);
}
absSum += absSumLocal;
}
@ -1136,6 +1139,7 @@ atomic_ullong TimeStamp::s_timeStampCounter = ATOMIC_VAR_INIT(0);
template <> map<size_t, map<size_t, shared_ptr<SingleMatrix>>> ComputationNode<float>::s_constOnes{};
template <> map<size_t, map<size_t, shared_ptr<DoubleMatrix>>> ComputationNode<double>::s_constOnes{};
template <> map<size_t, map<size_t, shared_ptr<HalfMatrix>>> ComputationNode<half>::s_constOnes{};
// -----------------------------------------------------------------------
// instantiate the core class templates
@ -1143,6 +1147,7 @@ template <> map<size_t, map<size_t, shared_ptr<DoubleMatrix>>> ComputationNode<d
template class ComputationNode<float>;
template class ComputationNode<double>;
template class ComputationNode<half>;
}}}


@ -1375,7 +1375,7 @@ public:
m_inputs.resize(inputs.size());
for (size_t i = 0; i < m_inputs.size(); i++)
if (inputs[i])
m_inputs[i] = DownCast(inputs[i]); // (DownCast() checks the type; the assignment then downcasts it again)
m_inputs[i] = inputs[i]; // remove DownCast check here to allow CastNode to pass
else
m_inputs[i] = nullptr; // during network creation, nullptrs are possible
@ -1430,6 +1430,19 @@ protected:
return DownCast(m_inputs[inputIndex]);
}
template<typename InputType>
inline shared_ptr<ComputationNode<InputType>> TypedInput(const size_t inputIndex) const
{
if (inputIndex >= m_inputs.size())
LogicError("Inputs: inputIndex %d is out of range for %ls %ls operation.", (int)inputIndex, NodeName().c_str(), OperationName().c_str());
shared_ptr<ComputationNode<InputType>> node = dynamic_pointer_cast<ComputationNode<InputType>>(m_inputs[inputIndex]);
if (!node)
InvalidArgument("an TypedInput of mismatching precision was passed");
return node;
}
// Fast downcast without runtime type check of dynamic_pointer_cast.
// Meant to be used in Forward and BackPropTo, assuming that Validate() has already used Input() which validated the correct types.
inline ComputationNode<ElemType>& InputRef(const size_t inputIndex) const
@ -1929,24 +1942,36 @@ protected:
// if the matrix's size will scale with minibatch size, set mbScale = true
// if workspace flag is true, the memory request will be treated specially. We assume workspace memory will share their own pointers
// this is currently a workaround for workspace memory for convolutions
void RequestMatrixFromPool(shared_ptr<Matrix<ElemType>>& matrixPtr, MatrixPool& matrixPool, size_t matrixSize=0, bool mbScale=false, bool isWorkSpace=false, bool aliasing=false)
template<typename ValueType>
void TypedRequestMatrixFromPool(shared_ptr<Matrix<ValueType>>& matrixPtr, MatrixPool& matrixPool, size_t matrixSize=0, bool mbScale=false, bool isWorkSpace=false, bool aliasing=false)
{
if (matrixPtr == nullptr)
{
if (aliasing)
matrixPool.RequestAliasedAllocate<ElemType>(m_deviceId, this, &matrixPtr, matrixSize, mbScale);
matrixPool.RequestAliasedAllocate<ValueType>(m_deviceId, this, &matrixPtr, matrixSize, mbScale);
else
matrixPool.RequestAllocate<ElemType>(m_deviceId, &matrixPtr, matrixSize, mbScale, isWorkSpace);
matrixPool.RequestAllocate<ValueType>(m_deviceId, &matrixPtr, matrixSize, mbScale, isWorkSpace);
}
}
void ReleaseMatrixToPool(shared_ptr<Matrix<ElemType>>& matrixPtr, MatrixPool& matrixPool, bool aliasing=false)
template<typename ValueType>
void TypedReleaseMatrixToPool(shared_ptr<Matrix<ValueType>>& matrixPtr, MatrixPool& matrixPool, bool aliasing=false)
{
assert(matrixPtr != nullptr);
if (aliasing)
matrixPool.RequestAliasedRelease<ElemType>(this);
matrixPool.RequestAliasedRelease<ValueType>(this);
else
matrixPool.RequestRelease<ElemType>(&matrixPtr);
matrixPool.RequestRelease<ValueType>(&matrixPtr);
}
void RequestMatrixFromPool(shared_ptr<Matrix<ElemType>>& matrixPtr, MatrixPool& matrixPool, size_t matrixSize = 0, bool mbScale = false, bool isWorkSpace = false, bool aliasing = false)
{
TypedRequestMatrixFromPool<ElemType>(matrixPtr, matrixPool, matrixSize, mbScale, isWorkSpace, aliasing);
}
void ReleaseMatrixToPool(shared_ptr<Matrix<ElemType>>& matrixPtr, MatrixPool& matrixPool, bool aliasing = false)
{
TypedReleaseMatrixToPool<ElemType>(matrixPtr, matrixPool, aliasing);
}
public:
@ -2114,7 +2139,7 @@ public:
s_constOnes[rows].find(cols) == s_constOnes[rows].end()) // not found
{
shared_ptr<Matrix<ElemType>> matrix = make_shared<Matrix<ElemType>>(rows, cols, (DEVICEID_TYPE) deviceId);
matrix->SetValue(1);
matrix->SetValue((ElemType)1);
s_constOnes[rows][cols] = matrix;
}
@ -2516,4 +2541,26 @@ public:
#pragma endregion base computation class
#define SMART_NODE_INVOKE(nodeClass, node, func, ...) \
do { \
if (dynamic_pointer_cast<nodeClass<float>>(node)) \
dynamic_pointer_cast<nodeClass<float>>(node)->func(__VA_ARGS__); \
else if (dynamic_pointer_cast<nodeClass<double>>(node)) \
dynamic_pointer_cast<nodeClass<double>>(node)->func(__VA_ARGS__); \
else if (dynamic_pointer_cast<nodeClass<half>>(node)) \
dynamic_pointer_cast<nodeClass<half>>(node)->func(__VA_ARGS__); \
else \
LogicError("Unknown nodeClass type"); \
} while(0)
#define SMART_NODE_INVOKE_WITH_RET(nodeClass, node, func, ret, ...) \
do { \
if (dynamic_pointer_cast<nodeClass<float>>(node)) \
ret = dynamic_pointer_cast<nodeClass<float>>(node)->func(__VA_ARGS__); \
else if (dynamic_pointer_cast<nodeClass<double>>(node)) \
ret = dynamic_pointer_cast<nodeClass<double>>(node)->func(__VA_ARGS__); \
else if (dynamic_pointer_cast<nodeClass<half>>(node)) \
ret = dynamic_pointer_cast<nodeClass<half>>(node)->func(__VA_ARGS__); \
else LogicError("Unknown ComputationNode type"); \
} while(0)
}}}
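
SMART_NODE_INVOKE lets code that only holds a ComputationNodeBasePtr call a member function on whichever of the three precision instantiations the node actually is, failing loudly otherwise. A self-contained sketch of the same dispatch shape (toy class and macro names; the real macro targets ComputationNode<float/double/half>):

    #include <iostream>
    #include <memory>
    #include <stdexcept>

    struct NodeBase { virtual ~NodeBase() = default; };

    template <typename ElemType>
    struct TypedNode : NodeBase
    {
        void Scale(double factor) { std::cout << "scaling a node with " << sizeof(ElemType) << "-byte elements by " << factor << "\n"; }
    };

    // Try each precision in turn and call the member on the first one that matches.
    #define TOY_NODE_INVOKE(node, func, ...)                                                            \
        do {                                                                                            \
            if (auto f = std::dynamic_pointer_cast<TypedNode<float>>(node)) f->func(__VA_ARGS__);       \
            else if (auto d = std::dynamic_pointer_cast<TypedNode<double>>(node)) d->func(__VA_ARGS__); \
            else throw std::logic_error("unknown node precision");                                      \
        } while (0)

    int main()
    {
        std::shared_ptr<NodeBase> node = std::make_shared<TypedNode<double>>();
        TOY_NODE_INVOKE(node, Scale, 0.5); // resolves to TypedNode<double>::Scale at run time
        return 0;
    }
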


@ -873,6 +873,7 @@ protected:
template class OneHotNode<float>;
template class OneHotNode<double>;
template class OneHotNode<half>;
#ifdef COMING_SOON


@ -104,7 +104,7 @@ LearnableParameter<ElemType>::LearnableParameter(const ScriptableObjects::IConfi
static unsigned long randomSeed = 1;
int forcedRandomSeed = configp->Get(L"randomSeed"); // forcing a specific random seed is useful for testing to get repeatable initialization independent of evaluation order
m_randomSeed = forcedRandomSeed < 0 ? randomSeed++ : (unsigned long)forcedRandomSeed;
m_initValueScale = configp->Get(L"initValueScale");
m_initValueScale = (ElemType)(float)configp->Get(L"initValueScale");
m_initFilterRank = configp->Get(L"initFilterRank");
m_initOutputRank = configp->Get(L"initOutputRank");
m_initOnCPUOnly = configp->Get(L"initOnCPUOnly");
@ -112,12 +112,12 @@ LearnableParameter<ElemType>::LearnableParameter(const ScriptableObjects::IConfi
else if (initString == L"zero")
{
m_initString = L"fromValue";
m_initValue = 0;
m_initValue = (ElemType)0;
}
else if (initString == L"fromValue") // from 'initValue'
{
m_initString = initString;
m_initValue = initValue;
m_initValue = (ElemType)(float)initValue;
}
else if (initString == L"bilinear")
{
@ -138,7 +138,7 @@ LearnableParameter<ElemType>::LearnableParameter(const ScriptableObjects::IConfi
else if (initString == L"fixedValue") // deprecated. Use initValue=... instead
{
m_initString = L"fromValue";
m_initValue = (ElemType)configp->Get(L"value");
m_initValue = (ElemType)(float)configp->Get(L"value");
}
else if (initString == L"fromLiteral") // deprecated. Use initValue=array instead
{
@ -549,7 +549,7 @@ void LearnableParameter<ElemType>::LazyInitParameters()
if (m_initString == L"fromValue")
{
if (GetEnvironmentPtr() && Environment().traceLevel > 0) // note: this will not log before node is part of network
fprintf(stderr, "%ls: Initializing Parameter[%s] <- %f.\n", NodeDescription().c_str(), string(GetSampleLayout()).c_str(), m_initValue);
fprintf(stderr, "%ls: Initializing Parameter[%s] <- %f.\n", NodeDescription().c_str(), string(GetSampleLayout()).c_str(), (float)m_initValue);
Value().SetValue(m_initValue);
}
else if (ParseRandomizationType(m_initString).second != 0)
@ -651,5 +651,6 @@ template <class ElemType>
template class LearnableParameter<float>;
template class LearnableParameter<double>;
template class LearnableParameter<half>;
}}}
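
The configuration values above are narrowed through an intermediate float, (ElemType)(float)..., presumably so the conversion goes through a path the 16-bit element type actually provides and stays unambiguous for the config value type. A stand-in illustration of that pattern (half16 below is a mock with a toy encoding, not CNTK's half):

    #include <cstdint>

    // Mock 16-bit float that, like some half implementations, converts from float only.
    struct half16
    {
        std::uint16_t bits;
        explicit half16(float v) : bits((std::uint16_t)(v * 1024.0f)) {} // toy encoding, not IEEE half
    };

    int main()
    {
        double initValue = 0.5;     // configuration values arrive as double
        half16 h((float)initValue); // double -> float -> half16; there is no half16(double)
        (void)h;
        return 0;
    }
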


@ -108,7 +108,7 @@ private:
{
fprintf(stderr, "%ls: Initializing Parameter[%s] <- %ls(seed=%d, init dims=[%d x %d], range=%f(%f*%f), onCPU=%s.\n)",
NodeDescription().c_str(), string(GetSampleLayout()).c_str(), m_initString.c_str(),
(int)randomSeed, (int)fanOut, (int)fanIn, range, range/initValueScale, initValueScale, initOnCPUOnly ? "true" : "false");
(int)randomSeed, (int)fanOut, (int)fanIn, (float)range, (float)(range/initValueScale), (float)(initValueScale), initOnCPUOnly ? "true" : "false");
}
}


@ -41,6 +41,9 @@ template void Microsoft::MSR::CNTK::UpdateRunningAverage<float>(ComputationNode<
template void Microsoft::MSR::CNTK::UpdateRunningAverage<double>(ComputationNode<double>& newInput,
TensorView<double>& runningAverage,
size_t& runningCount);
template void Microsoft::MSR::CNTK::UpdateRunningAverage<half>(ComputationNode<half>& newInput,
TensorView<half>& runningAverage,
size_t& runningCount);
template <class ElemType>
EpochAccumulatorNode<ElemType>::EpochAccumulatorNode(DEVICEID_TYPE deviceId, const wstring& name)
@ -127,4 +130,5 @@ void EpochAccumulatorNode<ElemType>::Reset()
}
template class EpochAccumulatorNode<float>;
template class EpochAccumulatorNode<double>;
template class EpochAccumulatorNode<double>;
template class EpochAccumulatorNode<half>;
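
The new half instantiations above cover UpdateRunningAverage, which folds each minibatch into a running mean kept alongside a sample count. The sketch below shows a generic incremental element-wise mean with the same ingredients; it is illustrative only and not necessarily the exact formula used by the node, which accumulates tensors rather than vectors:

    #include <cstddef>
    #include <vector>

    // Fold one new sample into a running element-wise average.
    void UpdateRunningAverageSketch(const std::vector<float>& newInput,
                                    std::vector<float>& runningAverage,
                                    std::size_t& runningCount)
    {
        if (runningAverage.empty())
            runningAverage.assign(newInput.size(), 0.0f);
        ++runningCount;
        for (std::size_t i = 0; i < newInput.size(); ++i)
            runningAverage[i] += (newInput[i] - runningAverage[i]) / (float)runningCount;
    }
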


@ -91,6 +91,7 @@ public:
template class PlusNode<float>;
template class PlusNode<double>;
template class PlusNode<half>;
// -----------------------------------------------------------------------
// LogPlusNode (summand1, summand2)
@ -165,6 +166,7 @@ public:
template class LogPlusNode<float>;
template class LogPlusNode<double>;
template class LogPlusNode<half>;
// -----------------------------------------------------------------------
@ -225,6 +227,7 @@ public:
template class PowNode<float>;
template class PowNode<double>;
template class PowNode<half>;
// -----------------------------------------------------------------------
@ -287,6 +290,7 @@ public:
template class MinusNode<float>;
template class MinusNode<double>;
template class MinusNode<half>;
// -----------------------------------------------------------------------
// ElementTimesNode (factor1, factor2)
@ -360,6 +364,7 @@ public:
template class ElementTimesNode<float>;
template class ElementTimesNode<double>;
template class ElementTimesNode<half>;
// -----------------------------------------------------------------------
// TimesNodeBase (A, B, outputRank=1)
@ -606,7 +611,7 @@ private:
Matrix<ElemType> inputValueSlice = unpackedInputValue.ColumnSlice(s * maxNumTimeSteps, maxNumTimeSteps); // k x s*
inputValueSlice.Reshape(k * maxNumTimeSteps, 1); // (k * s*) x 1
Matrix<ElemType> gradientSlice = Gradient().ColumnSlice(s, 1); // m x 1
Matrix<ElemType>::MultiplyAndWeightedAdd(1, gradientSlice, false, inputValueSlice, true, unpacked[inputIndex] ? 0 : beta, inputGradientSlice);
Matrix<ElemType>::MultiplyAndWeightedAdd(1, gradientSlice, false, inputValueSlice, true, unpacked[inputIndex] ? (ElemType)0 : beta, inputGradientSlice);
}
if (unpacked[inputIndex])
@ -624,7 +629,7 @@ private:
Matrix<ElemType> inputValueSlice = unpackedInputValue.ColumnSlice(s * maxNumTimeSteps, maxNumTimeSteps); // (m * k) x s*
inputValueSlice.Reshape(m, k * maxNumTimeSteps); // m x (k * s*)
Matrix<ElemType> gradientSlice = Gradient().ColumnSlice(s, 1); // m x 1
Matrix<ElemType>::MultiplyAndWeightedAdd(1, inputValueSlice, true, gradientSlice, false, unpacked[inputIndex] ? 0 : beta, inputGradientSlice);
Matrix<ElemType>::MultiplyAndWeightedAdd(1, inputValueSlice, true, gradientSlice, false, unpacked[inputIndex] ? (ElemType)0 : beta, inputGradientSlice);
}
if (unpacked[inputIndex])
@ -1113,6 +1118,7 @@ public:
template class TimesNode<float>;
template class TimesNode<double>;
template class TimesNode<half>;
// -----------------------------------------------------------------------
// TransposeTimesNode (A', B)
@ -1142,6 +1148,7 @@ public:
template class TransposeTimesNode<float>;
template class TransposeTimesNode<double>;
template class TransposeTimesNode<half>;
// Fixed-point matrix product. This scales inputs to 16bit signed integers by Symmetric quantizers, performs
// integer multiplication using SSE/AVX2, and transforms the results back.
@ -1232,6 +1239,7 @@ public:
template class QuantizedTimesNode<float>;
template class QuantizedTimesNode<double>;
template class QuantizedTimesNode<half>;
// -----------------------------------------------------------------------
// SumElementsNode (input)
@ -1467,6 +1475,7 @@ private:
template class TransposeDimensionsNode<float>;
template class TransposeDimensionsNode<double>;
template class TransposeDimensionsNode<half>;
// -----------------------------------------------------------------------
// CosDistanceNode (left, right)
@ -1587,6 +1596,7 @@ private:
template class CosDistanceNode<float>;
template class CosDistanceNode<double>;
template class CosDistanceNode<half>;
// -----------------------------------------------------------------------
// KhatriRaoProductNode (left, right)
@ -1913,6 +1923,7 @@ private:
template class CosDistanceWithNegativeSamplesNode<float>;
template class CosDistanceWithNegativeSamplesNode<double>;
template class CosDistanceWithNegativeSamplesNode<half>;
template <class ElemType>
void UpdateRunningAverage(ComputationNode<ElemType>& newInput, TensorView<ElemType>& runningAverage,
@ -1992,4 +2003,48 @@ protected:
size_t m_numSamples;
};
// -----------------------------------------------------------------------
// CastNode converts data types from InputType to ElemType
// -----------------------------------------------------------------------
template <class ElemType, class InputType>
class CastNode : public UnaryElementWiseNode<ElemType>
{
typedef UnaryElementWiseNode<ElemType> Base; UsingUnaryElementwiseNodeBaseMembers;
static const std::wstring TypeName() { return L"Cast"; }
public:
CastNode(DEVICEID_TYPE deviceId, const wstring& name)
: Base(deviceId, name)
{
}
virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override
{
auto result = ValueFor(fr);
auto input = static_cast<ComputationNode<InputType>&>(*m_inputs[0].get()).ValueFor(fr);
result.CastAssignValuesOf(input);
}
virtual void /*ComputationNode::*/ BackpropTo(const size_t /*inputIndex*/, const FrameRange& fr) override
{
auto grad = GradientFor(fr);
auto inputGrad = static_cast<ComputationNode<InputType>&>(*m_inputs[0].get()).GradientFor(fr);
inputGrad.CastAssignValuesOf(grad);
}
virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override
{
ValidateUnaryMap(isFinalValidationPass);
}
virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; }
virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; }
};
template class CastNode<half, float>;
template class CastNode<half, double>;
template class CastNode<float, half>;
template class CastNode<float, double>;
template class CastNode<double, half>;
template class CastNode<double, float>;
}}}
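
CastNode's ForwardProp converts the input's elements to ElemType and its BackpropTo converts the incoming gradient back to InputType, which is how mixed-precision graphs move data between 16- and 32-bit sections. A minimal element-wise version of that idea on plain buffers (hypothetical helpers, not the node itself):

    #include <algorithm>
    #include <cassert>
    #include <vector>

    // Forward: convert values from InputType to OutputType.
    template <typename OutputType, typename InputType>
    std::vector<OutputType> CastForward(const std::vector<InputType>& in)
    {
        std::vector<OutputType> out(in.size());
        std::transform(in.begin(), in.end(), out.begin(), [](InputType v) { return (OutputType)v; });
        return out;
    }

    // Backward: the gradient w.r.t. the input is the output gradient, converted back.
    template <typename InputType, typename OutputType>
    std::vector<InputType> CastBackward(const std::vector<OutputType>& outputGradient)
    {
        return CastForward<InputType>(outputGradient);
    }

    int main()
    {
        std::vector<double> x  = { 0.25, -1.5 };
        auto y  = CastForward<float>(x);   // like CastNode<float, double>::ForwardProp
        auto dx = CastBackward<double>(y); // gradient flows back in the input's precision
        assert(dx.size() == x.size());
        return 0;
    }
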


@ -72,6 +72,7 @@ public:
protected:
vector<MemRequestInfo<float>> m_memRequestInfoFloatVec;
vector<MemRequestInfo<double>> m_memRequestInfoDoubleVec;
vector<MemRequestInfo<half>> m_memRequestInfoHalfVec;
set<DEVICEID_TYPE> m_deviceIDSet;
int m_stepCounter;
@ -151,6 +152,7 @@ public:
// MatrixPool is not templated, so we call both float and double versions here
OptimizedMemoryAllocationFunc<float>();
OptimizedMemoryAllocationFunc<double>();
OptimizedMemoryAllocationFunc<half>();
return;
}
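
MatrixPool keeps an independent request list per element type, so float, double and half buffers are tracked and recycled separately. A toy pool with the same keyed-by-type structure (hypothetical, far simpler than CNTK's MatrixPool):

    #include <cstddef>
    #include <map>
    #include <memory>
    #include <typeindex>
    #include <vector>

    class ToyMatrixPool
    {
        // One free list per element type; buffers are type-erased while parked here.
        std::map<std::type_index, std::vector<std::shared_ptr<void>>> m_freeLists;

    public:
        template <typename ElemType>
        std::shared_ptr<std::vector<ElemType>> Request(std::size_t size)
        {
            auto& freeList = m_freeLists[std::type_index(typeid(ElemType))];
            if (!freeList.empty())
            {
                auto buf = std::static_pointer_cast<std::vector<ElemType>>(freeList.back());
                freeList.pop_back();
                buf->resize(size);
                return buf;
            }
            return std::make_shared<std::vector<ElemType>>(size);
        }

        template <typename ElemType>
        void Release(const std::shared_ptr<std::vector<ElemType>>& buf)
        {
            m_freeLists[std::type_index(typeid(ElemType))].push_back(buf);
        }
    };
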


@ -446,6 +446,7 @@ public:
template class HardmaxNode<float>;
template class HardmaxNode<double>;
template class HardmaxNode<half>;
@ -517,7 +518,8 @@ public:
CreateMatrixIfNull(m_steps);
auto dim = Input(0)->GetSampleLayout().GetDimPadded(0);
auto tmp = new ElemType[numCols];
std::generate(tmp, tmp + numCols, [i = ElemType(0), dim]() mutable { auto ret = i; i += dim; return ret; });
ElemType i = ElemType(0);
std::generate(tmp, tmp + numCols, [&i, dim]() mutable { auto ret = i; i += dim; return ret; });
m_steps->SetValue(1, numCols, this->m_deviceId, tmp);
delete[] tmp;
m_sortedIndices->ScaleAndAdd(ElemType(1), *m_steps, *m_sortedIndices);
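
The generate call above fills m_steps with 0, dim, 2*dim, ..., i.e. the linear offset of the first element of each column; hoisting i out of the lambda replaces the generalized init-capture, presumably for compatibility with the full range of supported compilers. A tiny standalone version of that fill, with hypothetical sizes:

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    int main()
    {
        const std::size_t numCols = 5;
        const float dim = 3;      // rows per column in the flattened matrix
        std::vector<float> steps(numCols);
        float i = 0;
        std::generate(steps.begin(), steps.end(), [&i, dim] { float ret = i; i += dim; return ret; });
        // steps now holds 0, 3, 6, 9, 12
        return 0;
    }
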
@ -768,7 +770,8 @@ public: \
}; \
\
template class ClassName<float>; \
template class ClassName<double>;
template class ClassName<double>; \
template class ClassName<half>;
DefineComparisonNode(LessNode, -1, 0)
DefineComparisonNode(EqualNode, 0, 0)


@ -341,5 +341,6 @@ void OptimizedRNNStackNode<ElemType>::UnpackSequencesFromCuDNN(const Matrix<Elem
template class OptimizedRNNStackNode<float>;
template class OptimizedRNNStackNode<double>;
template class OptimizedRNNStackNode<half>;
}}}


@ -587,9 +587,11 @@ template<class ElemType, int direction>
// instantiate the classes that derive from the above
template class PastValueNode<float>;
template class PastValueNode<double>;
template class PastValueNode<half>;
template class FutureValueNode<float>;
template class FutureValueNode<double>;
template class FutureValueNode<half>;
// -----------------------------------------------------------------------
// DelayedValueNodeState -- helper class for exporting/importing state from/to DelayedValueNodes.


@ -373,6 +373,7 @@ template <class ElemType>
template class ReduceElementsNode<float>;
template class ReduceElementsNode<double>;
template class ReduceElementsNode<half>;
// -----------------------------------------------------------------------
// Where(bitVector) -- extract indices of non-0 values in a sequence
@ -490,6 +491,7 @@ template <class ElemType>
template class WhereNode<float>;
template class WhereNode<double>;
template class WhereNode<half>;
// -----------------------------------------------------------------------
// PackedIndexNode(targetObject, indexSequence) -- map sequence
@ -551,6 +553,7 @@ template <class ElemType>
template class PackedIndexNode<float>;
template class PackedIndexNode<double>;
template class PackedIndexNode<half>;
// -----------------------------------------------------------------------
// GatherPackedNode(packedIndex, sourceData) -- gather operation
@ -623,6 +626,7 @@ template <class ElemType>
template class GatherPackedNode<float>;
template class GatherPackedNode<double>;
template class GatherPackedNode<half>;
// -----------------------------------------------------------------------
// ScatterPackedNode(layoutData, packedIndex, sourceData) -- scatter operation
@ -686,6 +690,7 @@ template <class ElemType>
template class ScatterPackedNode<float>;
template class ScatterPackedNode<double>;
template class ScatterPackedNode<half>;
// -----------------------------------------------------------------------
// CropNode -- crop operation, crops first input according to shape of second
@ -1026,5 +1031,6 @@ bool CropNode<ElemType>::SupportsTransformOnInput(size_t inputIndex)
template class CropNode<float>;
template class CropNode<double>;
template class CropNode<half>;
}}}


@ -878,8 +878,8 @@ public:
public:
PaddingNode(DEVICEID_TYPE deviceId, const wstring& name, std::vector<size_t> head, std::vector<size_t> foot, PaddingType mode = PaddingType::CONSTANTPAD, ElemType constantValue = 0)
: Base(deviceId, name), m_head(head), m_foot(foot), m_mode(mode), m_constant_value(constantValue)
PaddingNode(DEVICEID_TYPE deviceId, const wstring& name, std::vector<size_t> head, std::vector<size_t> foot, PaddingType mode = PaddingType::CONSTANTPAD, double constantValue = 0)
: Base(deviceId, name), m_head(head), m_foot(foot), m_mode(mode), m_constant_value((ElemType)constantValue)
{
}


@ -312,8 +312,8 @@ class UnpackSequenceNode : public ComputationNodeNonLooping<ElemType>, public Mu
public:
DeclareConstructorFromConfig(UnpackSequenceNode);
UnpackSequenceNode(DEVICEID_TYPE deviceId, const wstring& name, ElemType paddingValue = 0, bool suppressMaskOutput = false)
: Base(deviceId, name), MultiOutputNode<ElemType>(suppressMaskOutput ? 1 : 2), m_paddingValue(paddingValue), m_suppressMaskOutput(suppressMaskOutput)
UnpackSequenceNode(DEVICEID_TYPE deviceId, const wstring& name, double paddingValue = 0, bool suppressMaskOutput = false)
: Base(deviceId, name), MultiOutputNode<ElemType>(suppressMaskOutput ? 1 : 2), m_paddingValue((ElemType)paddingValue), m_suppressMaskOutput(suppressMaskOutput)
{}
virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override


@ -144,5 +144,6 @@ template <class ElemType>
template class TraceNode<float>;
template class TraceNode<double>;
template class TraceNode<half>;
}}}


@ -47,6 +47,7 @@ template <class ElemType>
template class RandomDistributionNode<float>;
template class RandomDistributionNode<double>;
template class RandomDistributionNode<half>;
template<class ElemType>
void RandomSampleNodeBase<ElemType>::Validate(bool isFinalValidationPass)
@ -108,9 +109,9 @@ void RandomSampleNodeBase<ElemType>::UpdateWeightsPrefixSum()
{
ElemType currentWeight = samplingWeights.GetValue(iClass, 0);
if (currentWeight < 0)
InvalidArgument("Sampling weights contain negative number %f.", currentWeight);
InvalidArgument("Sampling weights contain negative number %f.", (float)currentWeight);
runningWeightsSum += currentWeight;
runningWeightsSum += (double)currentWeight;
m_samplingWeightsPrefixSum.push_back(runningWeightsSum);
}
}
@ -231,6 +232,7 @@ bool RandomSampleNode<ElemType>::IsOutOfDateWrtInputs() const
template class RandomSampleNode<float>;
template class RandomSampleNode<double>;
template class RandomSampleNode<half>;
template<class ElemType>
double RandomSampleInclusionFrequencyNode<ElemType>::EstimateNumberOfTries()
@ -303,6 +305,7 @@ void RandomSampleInclusionFrequencyNode<ElemType>::Validate(bool isFinalValidati
template class RandomSampleInclusionFrequencyNode<float>;
template class RandomSampleInclusionFrequencyNode<double>;
template class RandomSampleInclusionFrequencyNode<half>;
template<class ElemType>
void DropoutNode<ElemType>::Save(File& fstream) const
@ -336,8 +339,10 @@ void BatchNormalizationNode<ElemType>::AttachInputs(const std::vector<Computatio
template class DropoutNode<float>;
template class DropoutNode<double>;
template class DropoutNode<half>;
template class BatchNormalizationNode<float>;
template class BatchNormalizationNode<double>;
template class BatchNormalizationNode<half>;
}}}


@ -525,7 +525,7 @@ public:
{
Url& UrlJ = *itUrlJ;
discountJ = m_logWeights[UrlJ.m_rank];
if (abs(gainI - UrlJ.m_gain) < 0.0000001)
if (abs(gainI - UrlJ.m_gain) < (ElemType)0.0000001)
{
continue;
}
@ -534,7 +534,7 @@ public:
lambdaIJ = (gainI - UrlJ.m_gain) * (discountI - discountJ) / (discountI * discountJ);
// |delta NDCG|
lambdaIJ = (idealMetric == 0.0 ? (ElemType) 0.0 : abs(lambdaIJ / idealMetric));
lambdaIJ = (idealMetric == (ElemType)0.0 ? (ElemType) 0.0 : (ElemType)abs(lambdaIJ / idealMetric));
// Combine lambda
lambdaIJ = lambdas(0, pairsCount++) * lambdaIJ;
@ -2394,6 +2394,8 @@ class BatchNormalizationNode : public ComputationNodeNonLooping<ElemType>, publi
typedef ComputationNodeNonLooping<ElemType> Base; UsingComputationNodeMembersBoilerplate;
static const std::wstring TypeName() { return L"BatchNormalization"; }
typedef typename std::conditional<std::is_same<ElemType, half>::value, float, ElemType>::type StatType;
// inputs
// TODO: Change all of these throughout the codebase to 'class enum'. Also change all places where we still use integer constants.
static const size_t DATA = 0;
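
StatType resolves to float whenever ElemType is half, so scale, bias and the running statistics of batch normalization stay in 32-bit even inside an FP16 network; for float and double it is simply ElemType. A compile-time check of that mapping, using a stand-in for the half type:

    #include <type_traits>

    struct half16 {}; // stand-in for the 16-bit float element type

    template <typename ElemType>
    using StatTypeOf = typename std::conditional<std::is_same<ElemType, half16>::value, float, ElemType>::type;

    static_assert(std::is_same<StatTypeOf<half16>, float>::value,  "half data keeps float statistics");
    static_assert(std::is_same<StatTypeOf<float>,  float>::value,  "float data keeps float statistics");
    static_assert(std::is_same<StatTypeOf<double>, double>::value, "double data keeps double statistics");

    int main() { return 0; }
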
@ -2412,7 +2414,7 @@ public:
m_one(1, 1, deviceId),
m_convertRunningVariancePending(false)
{
m_one.SetValue((ElemType)1); // (constant value used for GPU-side update of runCount)
m_one.SetValue((StatType)1); // (constant value used for GPU-side update of runCount)
}
BatchNormalizationNode(const ScriptableObjects::IConfigRecordPtr configp) :
BatchNormalizationNode(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"spatial"),
@ -2564,14 +2566,14 @@ private: // time-constant conversions
void ResetRunCount()
{
if (HasTiedRunCount())
Input(RUN_COUNT)->Value().SetValue(0);
this->template TypedInput<StatType>(RUN_COUNT)->Value().SetValue(0);
m_runCountUntied = 0;
}
void AggregateRunCount(size_t countToAdd)
{
if (HasTiedRunCount())
{
Input(RUN_COUNT)->Value().AddWithScaleOf(/*alpha=*/(ElemType)countToAdd, m_one); // this += countToAdd * (1)
this->template TypedInput<StatType>(RUN_COUNT)->Value().AddWithScaleOf(/*alpha=*/(StatType)countToAdd, m_one); // this += countToAdd * (1)
if (countToAdd != 0)
m_runCountUntied = SIZE_MAX; // we only need this for 0 checks, this value says we only know it's not 0
}
@ -2581,7 +2583,7 @@ private: // time-constant conversions
size_t RunCount() const // const version of above; keep identical
{
if (HasTiedRunCount())
m_runCountUntied = (size_t)Input(RUN_COUNT)->Value().Get00Element(); // if needed then cache it over
m_runCountUntied = (size_t)this->template TypedInput<StatType>(RUN_COUNT)->Value().Get00Element(); // if needed then cache it over
return m_runCountUntied;
}
bool IsRunCount0() const { return m_runCountUntied == 0 && RunCount() == 0; } // tied count >= untied one, so we can ask the untied one first to avoid GPU sync
@ -2664,10 +2666,10 @@ public:
FrameRange fr(Input(DATA)->GetMBLayout());
Matrix<ElemType> sliceInputValue = Input(DATA)->MaskedValueFor(fr);
const Matrix<ElemType>& scale = Input(SCALE)->Value();
const Matrix<ElemType>& bias = Input(BIAS)->Value();
Matrix<ElemType>& runMean = Input(RUN_MEAN)->Value();
Matrix<ElemType>& runVariance = Input(RUN_VAR)->Value();
const Matrix<StatType>& scale = this->template TypedInput<StatType>(SCALE)->Value();
const Matrix<StatType>& bias = this->template TypedInput<StatType>(BIAS)->Value();
Matrix<StatType>& runMean = this->template TypedInput<StatType>(RUN_MEAN)->Value();
Matrix<StatType>& runVariance = this->template TypedInput<StatType>(RUN_VAR)->Value();
Matrix<ElemType> sliceOutputValue = ValueFor(fr);
assert(scale.GetNumRows() == bias.GetNumRows());
@ -2718,8 +2720,8 @@ public:
{
auto sliceOutputGrad = MaskedGradientFor(fr);
auto sliceInputValue = Input(DATA)->ValueFor(fr);
const Matrix<ElemType>& scale = Input(SCALE)->Value();
const Matrix<ElemType>& bias = Input(BIAS)->Value();
const Matrix<StatType>& scale = this->template TypedInput<StatType>(SCALE)->Value();
const Matrix<StatType>& bias = this->template TypedInput<StatType>(BIAS)->Value();
// If inputIndex is not DATA and we get here, then it means that DATA receives no gradient.
// However, the underlying engine does not foresee this case, and thus always needs a place
@ -2752,19 +2754,19 @@ public:
{
assert(m_gradientValid);
if (Input(SCALE)->IsGradientInitializedBy(this))
Input(SCALE)->Gradient().AssignValuesOf(*m_dScale);
if (this->template TypedInput<StatType>(SCALE)->IsGradientInitializedBy(this))
this->template TypedInput<StatType>(SCALE)->Gradient().AssignValuesOf(*m_dScale);
else
Input(SCALE)->Gradient() += *m_dScale;
this->template TypedInput<StatType>(SCALE)->Gradient() += *m_dScale;
}
else if (inputIndex == BIAS) // derivative with respect to the bias, precomputed during input derivative computation
{
assert(m_gradientValid);
if (Input(BIAS)->IsGradientInitializedBy(this))
Input(BIAS)->Gradient().AssignValuesOf(*m_dBias);
if (this->template TypedInput<StatType>(BIAS)->IsGradientInitializedBy(this))
this->template TypedInput<StatType>(BIAS)->Gradient().AssignValuesOf(*m_dBias);
else
Input(BIAS)->Gradient() += *m_dBias;
this->template TypedInput<StatType>(BIAS)->Gradient() += *m_dBias;
}
// No derivatives with respect to running mean and variance.
}
@ -2797,7 +2799,7 @@ public:
// running statistics inputs must be learnable parameters, since we update them directly here
for (size_t i = RUN_MEAN; i < GetNumInputs(); i++)
//if (!Input(i)->Is<LearnableParameter<ElemType>>()) // somehow this does not compile on gcc (works on VS)
if (!dynamic_cast<LearnableParameter<ElemType>*>(Input(i).get()))
if (!dynamic_cast<LearnableParameter<StatType>*>(this->template TypedInput<StatType>(i).get()))
InvalidArgument("%ls: Inputs [%d..%d] must be learnable parameters.", NodeDescription().c_str(), (int)RUN_MEAN, (int)GetNumInputs());
// infer dimensions of learnable parameters
@ -2807,7 +2809,7 @@ public:
#if 1 // Workaround for today's definition: Trigger on [0 x 1] and infer that 0 as the total # elements needed.
for (size_t i = SCALE; i < RUN_COUNT; i++) // scale, bias, run_mean, and run_variance
{
auto paramLayout = Input(i)->GetSampleLayout();
auto paramLayout = this->template TypedInput<StatType>(i)->GetSampleLayout();
if (paramLayout.GetRank() == 2 && paramLayout[0] == 0 && paramLayout[1] == 1 && inputLayout.GetNumElements() > 0) // [0 x 1]
{
size_t total = m_spatial ? inputLayout.GetDims().back() : inputLayout.GetNumElements();
@ -2841,10 +2843,11 @@ public:
// check inputs
for (size_t i = SCALE; i < RUN_COUNT; i++) // scale, bias, run_mean, and run_variance
{
if (Input(i)->HasMBLayout())
auto inputPtr = this->template TypedInput<StatType>(i);
if (inputPtr->HasMBLayout())
InvalidArgument("%ls: Input[%d] has a dynamic axis. BatchNormalization parameters cannot have that.", NodeDescription().c_str(), (int)i);
auto paramLayout = Input(i)->GetSampleLayout();
if (paramLayout != Input(SCALE)->GetSampleLayout())
auto paramLayout = inputPtr->GetSampleLayout();
if (paramLayout != this->template TypedInput<StatType>(SCALE)->GetSampleLayout())
InvalidArgument("%ls: Input[%d] has a layout different from Input[1]. All must be identical.", NodeDescription().c_str(), (int)i);
#if 0 // BUGBUG: For this to work, parameter shapes must be correct (cf. comment above on inference).
if (paramLayout.GetRank() > inputLayout.GetRank())
@ -2857,9 +2860,9 @@ public:
if (HasTiedRunCount()) // 0-th order statistics (count) (optional for backcompat with old code which didn't correctly share it)
{
// This must always be a [1] tensor. No inference allowed.
size_t i = RUN_COUNT;
if (Input(i)->HasMBLayout() || (Input(i)->GetSampleLayout().GetRank() > 1) || (Input(i)->GetSampleLayout().GetNumElements() != 1))
InvalidArgument("%ls: Input[%d] must be a vector of 1 element without dynamic axis.", NodeDescription().c_str(), (int)i);
auto inputPtr = this->template TypedInput<StatType>(RUN_COUNT);
if (inputPtr->HasMBLayout() || (inputPtr->GetSampleLayout().GetRank() > 1) || (inputPtr->GetSampleLayout().GetNumElements() != 1))
InvalidArgument("%ls: Input[RUN_COUNT] must be a vector of 1 element without dynamic axis.", NodeDescription().c_str());
RunCount(); // cache the shared value into the local cache, for 0 checks
}
if (m_spatial && m_imageLayoutKind != CHW)
@ -2902,7 +2905,7 @@ public:
if (m_bnEng == nullptr)
{
auto shape = GetSampleLayout();
m_bnEng = BatchNormEngine<ElemType>::Create(m_deviceId, shape, m_spatial, m_imageLayoutKind,
m_bnEng = BatchNormEngine<ElemType, StatType>::Create(m_deviceId, shape, m_spatial, m_imageLayoutKind,
m_useCntkEngine ? BatchNormEngineKind::Cntk : BatchNormEngineKind::CuDnn);
}
@ -2916,26 +2919,26 @@ public:
void RequestMatricesBeforeForwardProp(MatrixPool& matrixPool) override
{
Base::RequestMatricesBeforeForwardProp(matrixPool);
RequestMatrixFromPool(m_savedMean, matrixPool);
RequestMatrixFromPool(m_savedInvStdDev, matrixPool);
this->template TypedRequestMatrixFromPool<StatType>(m_savedMean, matrixPool);
this->template TypedRequestMatrixFromPool<StatType>(m_savedInvStdDev, matrixPool);
}
void RequestMatricesBeforeBackprop(MatrixPool& matrixPool) override
{
Base::RequestMatricesBeforeBackprop(matrixPool);
RequestMatrixFromPool(m_dDataDummy, matrixPool);
RequestMatrixFromPool(m_dScale, matrixPool);
RequestMatrixFromPool(m_dBias, matrixPool);
this->template TypedRequestMatrixFromPool<StatType>(m_dScale, matrixPool);
this->template TypedRequestMatrixFromPool<StatType>(m_dBias, matrixPool);
}
void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool) override
{
Base::ReleaseMatricesAfterBackprop(matrixPool);
ReleaseMatrixToPool(m_savedMean, matrixPool);
ReleaseMatrixToPool(m_savedInvStdDev, matrixPool);
this->template TypedReleaseMatrixToPool<StatType>(m_savedMean, matrixPool);
this->template TypedReleaseMatrixToPool<StatType>(m_savedInvStdDev, matrixPool);
ReleaseMatrixToPool(m_dDataDummy, matrixPool);
ReleaseMatrixToPool(m_dScale, matrixPool);
ReleaseMatrixToPool(m_dBias, matrixPool);
this->template TypedReleaseMatrixToPool<StatType>(m_dScale, matrixPool);
this->template TypedReleaseMatrixToPool<StatType>(m_dBias, matrixPool);
}
void SetNormalizationTimeConstants(double normalizationTimeConstant, double prevNormalizationTimeConstant,
@ -2970,8 +2973,8 @@ public:
// Turn off the L1 and L2 regularization
void DisableRegInBatchNormalization()
{
let scaleNode = dynamic_pointer_cast<LearnableParameter<ElemType>>(Input(SCALE));
let biasNode = dynamic_pointer_cast<LearnableParameter<ElemType>>(Input(BIAS));
let scaleNode = dynamic_pointer_cast<LearnableParameter<StatType>>(this->template TypedInput<StatType>(SCALE));
let biasNode = dynamic_pointer_cast<LearnableParameter<StatType>>(this->template TypedInput<StatType>(BIAS));
scaleNode->SetRegMultiplier(0.f);
biasNode->SetRegMultiplier(0.f);
}
@ -3046,20 +3049,20 @@ private:
// This value is not updated unless needed, so it may be out of date during most operations.
// It will be updated at start (Validate()), when saving models, and any time the true value is needed.
mutable size_t m_runCountUntied; // cached running sample count (mutable since it is a cache)
Matrix<ElemType> m_one; // constant [1x1] matrix that contains a 1 (used for updating the shared count)
Matrix<StatType> m_one; // constant [1x1] matrix that contains a 1 (used for updating the shared count)
// Interpolated actual mean/inverse stddev values. Pre-computed on forward pass, also used in gradient computation.
shared_ptr<Matrix<ElemType>> m_savedMean;
shared_ptr<Matrix<ElemType>> m_savedInvStdDev;
shared_ptr<Matrix<StatType>> m_savedMean;
shared_ptr<Matrix<StatType>> m_savedInvStdDev;
// Temp buffer for scale and bias derivatives. Only used in BackpropTo(), carrying info from first call to subsequent calls.
// Not used for blendFactor=1 in CNTK engine.
shared_ptr<Matrix<ElemType>> m_dDataDummy;
shared_ptr<Matrix<ElemType>> m_dScale;
shared_ptr<Matrix<ElemType>> m_dBias;
shared_ptr<Matrix<StatType>> m_dScale;
shared_ptr<Matrix<StatType>> m_dBias;
bool m_gradientValid = false;
std::unique_ptr<BatchNormEngine<ElemType>> m_bnEng;
std::unique_ptr<BatchNormEngine<ElemType, StatType>> m_bnEng;
bool m_convertRunningVariancePending;
};
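The net effect of the hunks above is that BatchNormalizationNode now carries two element types: the data path keeps the node's ElemType (which may be half), while scale, bias and the running statistics are read and written through TypedInput<StatType> (float when training in FP16). A minimal standalone sketch of that split, assuming only some 16-bit float type convertible to and from float (none of the CNTK classes are used):

    // Hedged sketch, not CNTK code: normalize half-precision activations with
    // float statistics, mirroring the ElemType/StatType split above.
    #include <cstddef>

    template <class Half>
    void ApplyBatchNorm(const Half* in, Half* out, size_t rows, size_t cols,
                        const float* scale, const float* bias,
                        const float* mean, const float* invStdDev)
    {
        for (size_t c = 0; c < cols; c++)          // one column per sample
            for (size_t r = 0; r < rows; r++)      // one row per channel element
            {
                float x = (float)in[c * rows + r]; // widen to float for the math
                float y = scale[r] * (x - mean[r]) * invStdDev[r] + bias[r];
                out[c * rows + r] = (Half)y;       // narrow only when storing
            }
    }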

View file

@ -1,5 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
@ -33,13 +33,11 @@
<PropertyGroup Condition="$(DebugBuild)" Label="Configuration">
<ConfigurationType>DynamicLibrary</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>v140</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="$(ReleaseBuild)" Label="Configuration">
<ConfigurationType>DynamicLibrary</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>v140</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>

View file

@ -1,5 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
@ -33,14 +33,12 @@
<PropertyGroup Condition="$(DebugBuild)" Label="Configuration">
<ConfigurationType>DynamicLibrary</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>v140</PlatformToolset>
<CLRSupport>true</CLRSupport>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="$(ReleaseBuild)" Label="Configuration">
<ConfigurationType>DynamicLibrary</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>v140</PlatformToolset>
<CLRSupport>true</CLRSupport>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
@ -64,7 +62,7 @@
<Link>
<AdditionalLibraryDirectories>$(OutDir)</AdditionalLibraryDirectories>
<AdditionalDependencies>Cntk.Eval-$(CntkComponentVersion).lib;Cntk.Math-$(CntkComponentVersion).lib;Cntk.Common-$(CntkComponentVersion).lib;$(MSMPI_LIB64)msmpi.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<DelayLoadDLLs>Cntk.Eval-$(CntkComponentVersion).dll;Cntk.Math-$(CntkComponentVersion).dll</DelayLoadDLLs>
<DelayLoadDLLs>Cntk.Eval-$(CntkComponentVersion).dll;Cntk.Math-$(CntkComponentVersion).dll;msmpi.dll</DelayLoadDLLs>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(DebugBuild)">

View file

@ -1,5 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
@ -33,13 +33,11 @@
<PropertyGroup Condition="$(DebugBuild)" Label="Configuration">
<ConfigurationType>DynamicLibrary</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>v140</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="$(ReleaseBuild)" Label="Configuration">
<ConfigurationType>DynamicLibrary</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>v140</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
@ -64,7 +62,7 @@
</ClCompile>
<Link>
<AdditionalLibraryDirectories>$(SolutionDir)$(Platform)\$(Configuration);$(OpenCvLibPath)</AdditionalLibraryDirectories>
<AdditionalDependencies>Cntk.Common-$(CntkComponentVersion).lib;$(OpenCvLib);%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>$(OpenCvLib);%(AdditionalDependencies)</AdditionalDependencies>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(DebugBuild)">

View file

@ -2,9 +2,7 @@
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<ClCompile Include="ImageWriter.cpp" />
<ClCompile Include="dllmain.cpp">
<Filter>Misc</Filter>
</ClCompile>
<ClCompile Include="dllmain.cpp" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="ImageWriter.h" />

View file

@ -6,13 +6,13 @@
#include "stdafx.h"
#include "BatchNormalizationEngine.h"
#include "CuDnnFactories.h"
#include "Mkl2017DnnCommon.h"
#include "MklDnnCommon.h"
namespace Microsoft { namespace MSR { namespace CNTK {
template <class ElemType>
void BatchNormEngine<ElemType>::Forward(const Mat& in, const Mat& scale, const Mat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance,
Mat& out, double epsilon, Mat& savedMean, Mat& savedInvStdDev)
template <class InoutType, class StatType>
void BatchNormEngine<InoutType, StatType>::Forward(const InoutMat& in, const StatMat& scale, const StatMat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, StatMat& runMean, StatMat& runVariance,
InoutMat& out, double epsilon, StatMat& savedMean, StatMat& savedInvStdDev)
{
assert(in.GetNumRows() == m_inOutT.GetNumElements());
assert(out.GetNumRows() == m_inOutT.GetNumElements());
@ -63,9 +63,9 @@ void BatchNormEngine<ElemType>::Forward(const Mat& in, const Mat& scale, const M
}
}
template <class ElemType>
void BatchNormEngine<ElemType>::Backward(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor,
const Mat& savedMean, const Mat& savedInvStdDev, Mat& scaleGrad, Mat& biasGrad, bool accumulateDataGrad)
template <class InoutType, class StatType>
void BatchNormEngine<InoutType, StatType>::Backward(const InoutMat& in, const InoutMat& srcGrad, InoutMat& grad, const StatMat& scale, double blendFactor,
const StatMat& savedMean, const StatMat& savedInvStdDev, StatMat& scaleGrad, StatMat& biasGrad, bool accumulateDataGrad)
{
assert(!savedMean.IsEmpty());
assert(!savedInvStdDev.IsEmpty());
@ -73,12 +73,13 @@ void BatchNormEngine<ElemType>::Backward(const Mat& in, const Mat& srcGrad, Mat&
BackwardCore(in, srcGrad, grad, scale, blendFactor, savedMean, savedInvStdDev, scaleGrad, biasGrad, accumulateDataGrad);
}
template <class ElemType>
class CntkBatchNormEngine : public BatchNormEngine<ElemType>
template <class InoutType, class StatType>
class CntkBatchNormEngine : public BatchNormEngine<InoutType, StatType>
{
public:
using Base = BatchNormEngine<ElemType>;
using typename Base::Mat;
using Base = BatchNormEngine<InoutType, StatType>;
using typename Base::InoutMat;
using typename Base::StatMat;
public:
CntkBatchNormEngine(DEVICEID_TYPE deviceId, const TensorShape& inOutT,
@ -99,28 +100,30 @@ protected:
InvalidArgument("CNTK batch normalization supports only cudnn(CHW) layout.");
}
void ForwardCore(const Mat& in, const Mat& scale, const Mat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance,
Mat& out, double epsilon, Mat& savedMean, Mat& savedInvStdDev) override
void ForwardCore(const InoutMat& in, const StatMat& scale, const StatMat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, StatMat& runMean, StatMat& runVariance,
InoutMat& out, double epsilon, StatMat& savedMean, StatMat& savedInvStdDev) override
{
#ifdef USE_MKL2017DNN
if (in.GetCurrentMatrixLocation() == CPU &&
ForwardCoreMKL(in, scale, bias, inferenceOnly, expAvgFactor, runMean, runVariance, out, epsilon, savedMean, savedInvStdDev))
std::is_same<InoutType, StatType>::value &&
ForwardCoreMKL(*(const StatMat*)&in, scale, bias, inferenceOnly, expAvgFactor, runMean, runVariance, *(StatMat*)&out, epsilon, savedMean, savedInvStdDev))
return;
#endif
in.BatchNormalizationForward(scale, bias, inferenceOnly, expAvgFactor, blendFactor, runMean, runVariance, out, epsilon, savedMean, savedInvStdDev);
}
void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, const Mat& savedMean, const Mat& savedInvStdDev,
Mat& scaleGrad, Mat& biasGrad, bool accumulateDataGrad) override
void BackwardCore(const InoutMat& in, const InoutMat& srcGrad, InoutMat& grad, const StatMat& scale, double blendFactor, const StatMat& savedMean, const StatMat& savedInvStdDev,
StatMat& scaleGrad, StatMat& biasGrad, bool accumulateDataGrad) override
{
#ifdef USE_MKL2017DNN
if (srcGrad.GetCurrentMatrixLocation() == CPU &&
BackwardCoreMKL(in, srcGrad, grad, scale, savedMean, savedInvStdDev, scaleGrad, biasGrad, accumulateDataGrad))
std::is_same<InoutType, StatType>::value &&
BackwardCoreMKL(*(const StatMat*)&in, *(const StatMat*)&srcGrad, *(StatMat*)&grad, scale, savedMean, savedInvStdDev, scaleGrad, biasGrad, accumulateDataGrad))
return;
#endif
if (!accumulateDataGrad)
grad.SetValue((ElemType)0);
grad.SetValue((InoutType)0);
srcGrad.BatchNormalizationBackward(in, grad, scale, blendFactor, savedMean, savedInvStdDev, scaleGrad, biasGrad);
}
@ -147,7 +150,7 @@ private:
struct MKLScaleShiftAdapter
{
bool isInput;
std::shared_ptr<Matrix<ElemType>> mat;
std::shared_ptr<Matrix<StatType>> mat;
dnnResourceType_t resourceType;
size_t numChannels;
@ -155,19 +158,19 @@ private:
{
Clear();
numChannels = n;
mat = std::make_shared<Matrix<ElemType>>(numChannels, 2, CPUDEVICE);
mat = std::make_shared<Matrix<StatType>>(numChannels, 2, CPUDEVICE);
isInput = userToPrim;
resourceType = rt;
}
void PrepareForExecution(void* scale, void* bias, void* resources[dnnResourceNumber])
{
ElemType* buffer = mat->Data();
StatType* buffer = mat->Data();
resources[resourceType] = buffer;
if (isInput)
{
memcpy(buffer, scale, sizeof(ElemType) * numChannels);
memcpy(buffer + numChannels, bias, sizeof(ElemType) * numChannels);
memcpy(buffer, scale, sizeof(StatType) * numChannels);
memcpy(buffer + numChannels, bias, sizeof(StatType) * numChannels);
}
}
@ -176,9 +179,9 @@ private:
if (isInput)
RuntimeError("Cannot execute output ResourceAdapter for input");
ElemType* buffer = mat->Data();
memcpy(scale, buffer, sizeof(ElemType) * numChannels);
memcpy(bias, buffer + numChannels, sizeof(ElemType) * numChannels);
StatType* buffer = mat->Data();
memcpy(scale, buffer, sizeof(StatType) * numChannels);
memcpy(bias, buffer + numChannels, sizeof(StatType) * numChannels);
}
void Clear()
@ -194,21 +197,21 @@ private:
struct PrimitiveContext
{
MKLDnnResourceAdapter<ElemType> input;
MKLDnnResourceAdapter<ElemType> output;
MKLDnnResourceAdapter<StatType> input;
MKLDnnResourceAdapter<StatType> output;
MKLScaleShiftAdapter scaleShift;
std::shared_ptr<Mat> varianceMat; // variance matrix used for converting InvStdDev
std::shared_ptr<StatMat> varianceMat; // variance matrix used for converting InvStdDev
dnnPrimitive_t primitive = nullptr;
dnnPrimitiveAttributes_t attributes = nullptr;
void Clear()
{
if (primitive) { dnnDelete<ElemType>(primitive); primitive = nullptr; }
if (primitive) { dnnDelete<StatType>(primitive); primitive = nullptr; }
input.Clear();
scaleShift.Clear();
output.Clear();
if (attributes) { dnnPrimitiveAttributesDestroy<ElemType>(attributes); attributes = nullptr; }
if (attributes) { dnnPrimitiveAttributesDestroy<StatType>(attributes); attributes = nullptr; }
}
~PrimitiveContext()
@ -219,7 +222,7 @@ private:
TensorShape m_shape;
size_t m_numSamples;
ElemType m_epsilon;
StatType m_epsilon;
public:
MKLBatchNormalizationContext() :
@ -233,12 +236,12 @@ private:
return !!(m_contextFlags & (1 << contextIndex));
}
void Prepare(const TensorShape& shape, bool spatial, size_t numSamples, ContextIndex contextIndex, ElemType epsilon = 0)
void Prepare(const TensorShape& shape, bool spatial, size_t numSamples, ContextIndex contextIndex, StatType epsilon = 0)
{
int flag = (1 << contextIndex);
if (contextIndex == ContextIndex_Backward)
{
epsilon = HasPreparedFor(ContextIndex_ForwardTrain) ? m_epsilon : (ElemType)DEFAULT_EPSILON;
epsilon = HasPreparedFor(ContextIndex_ForwardTrain) ? m_epsilon : (StatType)DEFAULT_EPSILON;
}
bool same = (shape == m_shape) && (numSamples == m_numSamples) && (epsilon == m_epsilon);
@ -285,10 +288,10 @@ private:
{
case ContextIndex_ForwardInfer:
case ContextIndex_ForwardTrain:
CHECK_MKL(dnnLayoutCreate<ElemType>(&ltUserInput, inoutDim, inoutSizes, inoutStrides));
CHECK_MKL(dnnLayoutCreate<ElemType>(&ltUserOutput, inoutDim, inoutSizes, inoutStrides));
CHECK_MKL(dnnPrimitiveAttributesCreate<ElemType>(&ctx.attributes));
CHECK_MKL(dnnBatchNormalizationCreateForward_v2<ElemType>(
CHECK_MKL(dnnLayoutCreate<StatType>(&ltUserInput, inoutDim, inoutSizes, inoutStrides));
CHECK_MKL(dnnLayoutCreate<StatType>(&ltUserOutput, inoutDim, inoutSizes, inoutStrides));
CHECK_MKL(dnnPrimitiveAttributesCreate<StatType>(&ctx.attributes));
CHECK_MKL(dnnBatchNormalizationCreateForward_v2<StatType>(
&ctx.primitive,
ctx.attributes,
ltUserInput,
@ -299,10 +302,10 @@ private:
scaleShiftType = dnnResourceScaleShift;
break;
case ContextIndex_Backward:
CHECK_MKL(dnnLayoutCreate<ElemType>(&ltUserInput, inoutDim, inoutSizes, inoutStrides));
CHECK_MKL(dnnLayoutCreate<ElemType>(&ltUserOutput, inoutDim, inoutSizes, inoutStrides));
CHECK_MKL(dnnPrimitiveAttributesCreate<ElemType>(&ctx.attributes));
CHECK_MKL(dnnBatchNormalizationCreateBackward_v2<ElemType>(
CHECK_MKL(dnnLayoutCreate<StatType>(&ltUserInput, inoutDim, inoutSizes, inoutStrides));
CHECK_MKL(dnnLayoutCreate<StatType>(&ltUserOutput, inoutDim, inoutSizes, inoutStrides));
CHECK_MKL(dnnPrimitiveAttributesCreate<StatType>(&ctx.attributes));
CHECK_MKL(dnnBatchNormalizationCreateBackward_v2<StatType>(
&ctx.primitive,
ctx.attributes,
ltUserInput,
@ -311,16 +314,16 @@ private:
inputType = dnnResourceDiffDst;
outputType = dnnResourceDiffSrc;
scaleShiftType = dnnResourceDiffScaleShift;
ctx.varianceMat = std::make_shared<Mat>(numChannels, 1, CPUDEVICE);
ctx.varianceMat = std::make_shared<StatMat>(numChannels, 1, CPUDEVICE);
break;
default:
RuntimeError("Unexpected context type %d", (int)contextIndex);
}
CHECK_MKL(dnnLayoutCreateFromPrimitive<ElemType>(&ltPrimInput, ctx.primitive, inputType));
CHECK_MKL(dnnLayoutCreateFromPrimitive<StatType>(&ltPrimInput, ctx.primitive, inputType));
ctx.input.Create(ltUserInput, ltPrimInput, inputType, true);
CHECK_MKL(dnnLayoutCreateFromPrimitive<ElemType>(&ltPrimOutput, ctx.primitive, outputType));
CHECK_MKL(dnnLayoutCreateFromPrimitive<StatType>(&ltPrimOutput, ctx.primitive, outputType));
ctx.output.Create(ltUserOutput, ltPrimOutput, outputType, false);
ctx.scaleShift.Create(scaleShiftType, contextIndex != ContextIndex_Backward, numChannels);
@ -338,7 +341,7 @@ private:
resources[dnnResourceMean] = runMean;
resources[dnnResourceVariance] = runVariance;
CHECK_MKL(dnnExecute<ElemType>(ctx.primitive, resources));
CHECK_MKL(dnnExecute<StatType>(ctx.primitive, resources));
ctx.output.ConvertOutput(output);
}
@ -352,16 +355,16 @@ private:
ctx.output.PrepareForExecution(grad, resources);
ctx.scaleShift.PrepareForExecution(scaleGrad, biasGrad, resources);
std::shared_ptr<Mat> scaleShiftMat;
scaleShiftMat = std::make_shared<Mat>(ctx.scaleShift.numChannels, 2, CPUDEVICE);
memcpy(scaleShiftMat->Data(), scale, ctx.scaleShift.numChannels * sizeof(ElemType));
std::shared_ptr<StatMat> scaleShiftMat;
scaleShiftMat = std::make_shared<StatMat>(ctx.scaleShift.numChannels, 2, CPUDEVICE);
memcpy(scaleShiftMat->Data(), scale, ctx.scaleShift.numChannels * sizeof(StatType));
resources[dnnResourceScaleShift] = scaleShiftMat->Data();
// convert from InvStdDev to variance
for (size_t i = 0; i < ctx.scaleShift.numChannels; i++)
{
ElemType& v = ctx.varianceMat->Data()[i];
ElemType& s = ((ElemType*)savedInvStdDev)[i];
StatType& v = ctx.varianceMat->Data()[i];
StatType& s = ((StatType*)savedInvStdDev)[i];
v = (1 / (s * s) - m_epsilon);
}
@ -369,7 +372,7 @@ private:
resources[dnnResourceMean] = savedMean;
resources[dnnResourceVariance] = ctx.varianceMat->Data();
CHECK_MKL(dnnExecute<ElemType>(ctx.primitive, resources));
CHECK_MKL(dnnExecute<StatType>(ctx.primitive, resources));
ctx.output.ConvertOutput(grad);
ctx.scaleShift.ConvertOutput(scaleGrad, biasGrad);
@ -377,15 +380,15 @@ private:
};
MKLBatchNormalizationContext m_mklContext;
std::shared_ptr<Mat> m_dataGradWorkspace;
std::shared_ptr<StatMat> m_dataGradWorkspace;
bool ForwardCoreMKL(const Mat& in, const Mat& scale, const Mat& bias, bool inferenceOnly, double expAvgFactor, Mat& runMean, Mat& runVariance,
Mat& out, double epsilon, Mat& savedMean, Mat& savedInvStdDev)
bool ForwardCoreMKL(const StatMat& in, const StatMat& scale, const StatMat& bias, bool inferenceOnly, double expAvgFactor, StatMat& runMean, StatMat& runVariance,
StatMat& out, double epsilon, StatMat& savedMean, StatMat& savedInvStdDev)
{
ContextIndex contextIndex = inferenceOnly ?
ContextIndex_ForwardInfer :
ContextIndex_ForwardTrain;
m_mklContext.Prepare(m_inOutT, m_spatial, in.GetNumCols(), contextIndex, (ElemType)epsilon);
m_mklContext.Prepare(m_inOutT, m_spatial, in.GetNumCols(), contextIndex, (StatType)epsilon);
if (inferenceOnly)
{
@ -398,33 +401,33 @@ private:
m_mklContext.Forward(in.Data(), out.Data(), scale.Data(), bias.Data(), savedMean.Data(), savedInvStdDev.Data(), contextIndex);
// update savedMean, savedInvStdDev
ElemType OneMinusExpAvgFactor = (ElemType)(1.0 - expAvgFactor);
cblas_axpby((MKL_INT)runMean.GetNumElements(), (ElemType)expAvgFactor, savedMean.Data(), OneMinusExpAvgFactor, runMean.Data());
StatType OneMinusExpAvgFactor = (StatType)(1.0 - expAvgFactor);
cblas_axpby((MKL_INT)runMean.GetNumElements(), (StatType)expAvgFactor, savedMean.Data(), OneMinusExpAvgFactor, runMean.Data());
// note: savedInvStdDev currently holds the variance of in.Data(); it needs to be converted to InvStdDev and interpolated
ElemType numReduced = (ElemType)(in.GetNumElements() / runVariance.GetNumElements());
ElemType bcf = numReduced / (numReduced - 1);
StatType numReduced = (StatType)(in.GetNumElements() / runVariance.GetNumElements());
StatType bcf = numReduced / (numReduced - 1);
for (size_t i = 0; i < runVariance.GetNumElements(); i++)
{
ElemType& v = runVariance.Data()[i];
ElemType& s = savedInvStdDev.Data()[i];
v = v * OneMinusExpAvgFactor + bcf * s * (ElemType)expAvgFactor;
s = (ElemType)1 / sqrt(s + (ElemType)epsilon);
StatType& v = runVariance.Data()[i];
StatType& s = savedInvStdDev.Data()[i];
v = v * OneMinusExpAvgFactor + bcf * s * (StatType)expAvgFactor;
s = (StatType)1 / sqrt(s + (StatType)epsilon);
}
}
return true;
}
bool BackwardCoreMKL(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale,
const Mat& savedMean, const Mat& savedInvStdDev, Mat& scaleGrad, Mat& biasGrad, bool accumulateDataGrad)
bool BackwardCoreMKL(const StatMat& in, const StatMat& srcGrad, StatMat& grad, const StatMat& scale,
const StatMat& savedMean, const StatMat& savedInvStdDev, StatMat& scaleGrad, StatMat& biasGrad, bool accumulateDataGrad)
{
m_mklContext.Prepare(m_inOutT, m_spatial, srcGrad.GetNumCols(), ContextIndex_Backward);
if (accumulateDataGrad)
{
if (!m_dataGradWorkspace)
m_dataGradWorkspace = std::make_shared<Matrix<ElemType>>(0, 0, CPUDEVICE);
m_dataGradWorkspace = std::make_shared<Matrix<StatType>>(0, 0, CPUDEVICE);
m_dataGradWorkspace->SetValue(grad);
}
@ -432,23 +435,24 @@ private:
m_mklContext.Backward(in.Data(), srcGrad.Data(), grad.Data(), scale.Data(), savedMean.Data(), savedInvStdDev.Data(), scaleGrad.Data(), biasGrad.Data());
if (accumulateDataGrad)
cblas_axpby((MKL_INT)grad.GetNumElements(), (ElemType)1.0, m_dataGradWorkspace->Data(), (ElemType)1.0, grad.Data());
cblas_axpby((MKL_INT)grad.GetNumElements(), (StatType)1.0, m_dataGradWorkspace->Data(), (StatType)1.0, grad.Data());
return true;
}
#endif
};
template class CntkBatchNormEngine<float>;
template class CntkBatchNormEngine<double>;
template class CntkBatchNormEngine<float, float>;
template class CntkBatchNormEngine<double, double>;
template class CntkBatchNormEngine<half, float>;
template <typename T> bool HasFlag(T src, T testFlag)
{
return ((int)src & (int)testFlag) != 0;
}
template <class ElemType>
std::unique_ptr<BatchNormEngine<ElemType>> BatchNormEngine<ElemType>::Create(DEVICEID_TYPE deviceId, const TensorShape& inOutT,
template <class InoutType, class StatType>
std::unique_ptr<BatchNormEngine<InoutType, StatType>> BatchNormEngine<InoutType, StatType>::Create(DEVICEID_TYPE deviceId, const TensorShape& inOutT,
bool spatial, ImageLayoutKind imageLayout,
BatchNormEngineKind enabledEngines)
{
@ -458,7 +462,7 @@ std::unique_ptr<BatchNormEngine<ElemType>> BatchNormEngine<ElemType>::Create(DEV
if (GetMathLibTraceLevel() > 0)
fprintf(stderr, "Using CNTK batch normalization engine.\n");
return std::make_unique<CntkBatchNormEngine<ElemType>>(deviceId, inOutT, spatial, imageLayout);
return std::make_unique<CntkBatchNormEngine<InoutType, StatType>>(deviceId, inOutT, spatial, imageLayout);
}
if (HasFlag(enabledEngines, BatchNormEngineKind::CuDnn))
@ -466,13 +470,14 @@ std::unique_ptr<BatchNormEngine<ElemType>> BatchNormEngine<ElemType>::Create(DEV
if (GetMathLibTraceLevel() > 0)
fprintf(stderr, "Using cuDNN batch normalization engine.\n");
return CuDnnBatchNormEngineFactory<ElemType>::Create(deviceId, inOutT, spatial, imageLayout);
return CuDnnBatchNormEngineFactory<InoutType, StatType>::Create(deviceId, inOutT, spatial, imageLayout);
}
RuntimeError("Could not find appropriate batch normalization engine.");
}
template class BatchNormEngine<float>;
template class BatchNormEngine<double>;
template class BatchNormEngine<float, float>;
template class BatchNormEngine<double, double>;
template class BatchNormEngine<half, float>;
}}}
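One detail of the MKL path above that is easy to miss: CNTK caches the saved statistics as inverse standard deviations, while MKL-DNN produces and consumes plain variances, so the engine converts in both directions (and applies the Bessel correction numReduced / (numReduced - 1) when folding the batch variance into the running variance). The two conversions in isolation, under the assumption that they mirror the loops in ForwardCoreMKL and the backward preparation:

    // Hedged sketch of the conversions used around the MKL calls above.
    #include <cmath>

    inline float InvStdDevFromVariance(float variance, float epsilon)
    {
        return 1.0f / std::sqrt(variance + epsilon);     // forward: store 1/sqrt(var + eps)
    }

    inline float VarianceFromInvStdDev(float invStdDev, float epsilon)
    {
        return 1.0f / (invStdDev * invStdDev) - epsilon; // backward: MKL needs the variance back
    }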

View file

@ -25,22 +25,23 @@ enum class BatchNormEngineKind
#pragma warning(push)
#pragma warning(disable : 4251)
template <class ElemType>
template <class InoutType, class StatType = InoutType>
class MATH_API BatchNormEngine
{
public:
using Mat = Matrix<ElemType>;
using InoutMat = Matrix<InoutType>;
using StatMat = Matrix<StatType>;
public:
virtual ~BatchNormEngine() {};
void Forward(const Mat& in, const Mat& scale, const Mat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance,
Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev);
void Forward(const InoutMat& in, const StatMat& scale, const StatMat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, StatMat& runMean, StatMat& runVariance,
InoutMat& out, double epsilon, StatMat& saveMean, StatMat& saveInvStdDev);
void Backward(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, const Mat& saveMean, const Mat& saveInvStdDev,
Mat& scaleGrad, Mat& biasGrad, bool accumulateDataGrad);
void Backward(const InoutMat& in, const InoutMat& srcGrad, InoutMat& grad, const StatMat& scale, double blendFactor, const StatMat& saveMean, const StatMat& saveInvStdDev,
StatMat& scaleGrad, StatMat& biasGrad, bool accumulateDataGrad);
static std::unique_ptr<BatchNormEngine<ElemType>> Create(DEVICEID_TYPE deviceId, const TensorShape& inOutT,
static std::unique_ptr<BatchNormEngine<InoutType, StatType>> Create(DEVICEID_TYPE deviceId, const TensorShape& inOutT,
bool spatial, ImageLayoutKind imageLayout,
BatchNormEngineKind enabledEngines = BatchNormEngineKind::All);
@ -56,11 +57,11 @@ protected:
virtual void EnsureCompatible() = 0;
// saveMean/saveInvStdDev return the actual mean/stddev used for normalization, except for blendFactor=1, these are unused and untouched
virtual void ForwardCore(const Mat& in, const Mat& scale, const Mat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance,
Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev) = 0;
virtual void ForwardCore(const InoutMat& in, const StatMat& scale, const StatMat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, StatMat& runMean, StatMat& runVariance,
InoutMat& out, double epsilon, StatMat& saveMean, StatMat& saveInvStdDev) = 0;
virtual void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, const Mat& saveMean, const Mat& saveInvStdDev,
Mat& scaleGrad, Mat& biasGrad, bool accumulateDataGrad) = 0;
virtual void BackwardCore(const InoutMat& in, const InoutMat& srcGrad, InoutMat& grad, const StatMat& scale, double blendFactor, const StatMat& saveMean, const StatMat& saveInvStdDev,
StatMat& scaleGrad, StatMat& biasGrad, bool accumulateDataGrad) = 0;
protected:
DEVICEID_TYPE m_deviceId;

View file

@ -14,6 +14,7 @@
#include <ctime>
#include <limits.h>
#include "QuantizedOperations.h"
#include "half.hpp"
//#include "GPUMatrix.h"
//#include "CPUSparseMatrix.h"
@ -108,7 +109,9 @@ public:
const bool needAveMultiplier,
const bool initialized);
void AdaDelta(CPUMatrix<ElemType>& gradients, CPUMatrix<ElemType>& functionValues, ElemType learningRate, ElemType rho, ElemType epsilon);
template<typename GradType>
void AdaDelta(CPUMatrix<GradType>& gradients, CPUMatrix<ElemType>& functionValues, ElemType learningRate, ElemType rho, ElemType epsilon);
void AdaDeltaFlushTimestamps(size_t cols, ElemType rho, int* timestamps, int currentTimestamp);
void Reshape(const size_t numRows, const size_t numCols);
@ -417,10 +420,13 @@ public:
void AveragePoolingBackward(const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIndices, const CPUMatrix<int>& indices,
CPUMatrix<ElemType>& grad, const bool poolIncludePad, bool accumulateGradient) const;
void BatchNormalizationForward(const CPUMatrix<ElemType>& scale, const CPUMatrix<ElemType>& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, CPUMatrix<ElemType>& runMean, CPUMatrix<ElemType>& runVariance,
CPUMatrix<ElemType>& out, double epsilon, CPUMatrix<ElemType>& saveMean, CPUMatrix<ElemType>& saveInvStdDev) const;
void BatchNormalizationBackward(const CPUMatrix<ElemType>& in, CPUMatrix<ElemType>& grad, const CPUMatrix<ElemType>& scale, double blendFactor, const CPUMatrix<ElemType>& saveMean, const CPUMatrix<ElemType>& saveInvStdDev,
CPUMatrix<ElemType>& scaleGrad, CPUMatrix<ElemType>& biasGrad) const;
template<class StatType>
void BatchNormalizationForward(const CPUMatrix<StatType>& scale, const CPUMatrix<StatType>& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, CPUMatrix<StatType>& runMean, CPUMatrix<StatType>& runVariance,
CPUMatrix<ElemType>& out, double epsilon, CPUMatrix<StatType>& saveMean, CPUMatrix<StatType>& saveInvStdDev) const;
template<class StatType>
void BatchNormalizationBackward(const CPUMatrix<ElemType>& in, CPUMatrix<ElemType>& grad, const CPUMatrix<StatType>& scale, double blendFactor, const CPUMatrix<StatType>& saveMean, const CPUMatrix<StatType>& saveInvStdDev,
CPUMatrix<StatType>& scaleGrad, CPUMatrix<StatType>& biasGrad) const;
public:
// This functions do not depend on <ElemType>, i.e. you can call them on any <ElemType>
@ -577,5 +583,30 @@ private:
typedef CPUMatrix<float> CPUSingleMatrix;
typedef CPUMatrix<double> CPUDoubleMatrix;
typedef CPUMatrix<half> CPUHalfMatrix;
template<typename ElemType>
void CPUMatrixTensorOpImpl(ElemType beta, const CPUMatrix<ElemType>& a, CPUMatrix<ElemType>& o, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides);
template<typename ElemType>
void CPUMatrixTensorOpImpl(ElemType beta, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, CPUMatrix<ElemType>& o, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 3>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 3>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 3>& reducingStrides);
template<typename ElemType>
void CPUMatrixTensorOpImpl(ElemType beta, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, const CPUMatrix<ElemType>& c, CPUMatrix<ElemType>& o, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 4>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 4>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 4>& reducingStrides);
template<typename ElemType>
void CPUMatrixTensorArgOpImpl(const CPUMatrix<ElemType>& a, CPUMatrix<ElemType>& o, ElementWiseOperator reductionOp,
const array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides);
}}}
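The templated AdaDelta declaration above is the CPU side of mixed-precision training: gradients may arrive as half while the parameters and both accumulators stay in the node's ElemType. A hedged sketch of that idea using the standard AdaDelta update rule; the exact member-function implementation may order the updates differently:

    // Standalone sketch, not CPUMatrix::AdaDelta itself.
    #include <cmath>
    #include <cstddef>

    template <typename GradType, typename ElemType>
    void AdaDeltaStep(const GradType* grad, ElemType* value,
                      ElemType* smoothedGrad2, ElemType* smoothedDelta2, size_t n,
                      ElemType learningRate, ElemType rho, ElemType epsilon)
    {
        for (size_t i = 0; i < n; i++)
        {
            ElemType g = (ElemType)grad[i];                        // widen the (possibly half) gradient
            smoothedGrad2[i] = rho * smoothedGrad2[i] + (1 - rho) * g * g;
            ElemType delta = -std::sqrt(smoothedDelta2[i] + epsilon)
                             / std::sqrt(smoothedGrad2[i] + epsilon) * g;
            smoothedDelta2[i] = rho * smoothedDelta2[i] + (1 - rho) * delta * delta;
            value[i] += learningRate * delta;
        }
    }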

View file

@ -0,0 +1,132 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#include "stdafx.h"
#include "CPUMatrixImpl.h"
namespace Microsoft { namespace MSR { namespace CNTK {
// General conversion function with no performance optimization
// This should only be used for half precision on the CPU.
// For CPU inference performance, users should convert the fp16 model to fp32 first, unless MKL supports half precision.
template<typename SrcT, typename DstT>
static void ConvertBuffer(DstT* dst, const SrcT* src, size_t count)
{
for (size_t i = 0; i < count; i++)
{
dst[i] = (DstT)src[i];
}
}
// specialization to convert from half to float for computation, and then store in half
template <>
void CPUMatrix<half>::MultiplyAndWeightedAdd(half alpha, const CPUMatrix<half>& a, const bool transposeA, const CPUMatrix<half>& b, const bool transposeB,
half beta, CPUMatrix<half>& c, shared_ptr<QuantizedMultiplier<half>> pQuantizedMultiplier)
{
CPUMatrix<float> af(a.GetNumRows(), a.GetNumCols());
CPUMatrix<float> bf(b.GetNumRows(), b.GetNumCols());
CPUMatrix<float> cf(c.GetNumRows(), c.GetNumCols());
if (alpha != 0)
{
ConvertBuffer<half, float>(af.Data(), a.Data(), a.GetNumElements());
ConvertBuffer<half, float>(bf.Data(), b.Data(), b.GetNumElements());
}
if (beta != 0)
{
ConvertBuffer<half, float>(cf.Data(), c.Data(), c.GetNumElements());
}
if (pQuantizedMultiplier)
RuntimeError("Quantized matrix multiply not supported for Half");
CPUMatrix<float>::MultiplyAndWeightedAdd((float)alpha, af, transposeA, bf, transposeB, (float)beta, cf, nullptr);
ConvertBuffer<float, half>(c.Data(), cf.Data(), c.GetNumElements());
}
// specializations that raise RuntimeError for now, because the OpenMP implementation only supports built-in types
template <>
void CPUMatrix<half>::AssignSoftmaxSum(const CPUMatrix<half>& softmax, CPUMatrix<half>& c)
{
RuntimeError("half AssignSoftmaxSum not supported.");
}
template <>
void CPUMatrix<half>::AssignNCEUnnormalizedEval(const CPUMatrix<half>& a,
const CPUMatrix<half>& b, const CPUMatrix<half>& bias, CPUMatrix<half>& c)
{
RuntimeError("half AssignNCEUnnormalizedEval not supported.");
}
template <>
void CPUMatrix<half>::VectorSum(const CPUMatrix<half>& a, CPUMatrix<half>& c, const bool isColWise)
{
RuntimeError("half VectorSum not supported.");
}
template <>
void CPUMatrix<half>::VectorNorm1(CPUMatrix<half>& c, const bool isColWise) const
{
RuntimeError("half VectorNorm1 not supported.");
}
template <>
half CPUMatrix<half>::SumOfElements() const
{
RuntimeError("half SumOfElements not supported.");
}
template <>
half CPUMatrix<half>::MatrixNorm1() const
{
RuntimeError("half MatrixNorm1 not supported.");
}
template <>
half CPUMatrix<half>::FrobeniusNorm() const
{
RuntimeError("half FrobeniusNorm not supported.");
}
template <>
void CPUMatrix<half>::MaxPoolingBackward(const CPUMatrix<half>& out, const CPUMatrix<half>& in,
const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIndices, const CPUMatrix<int>& indices,
CPUMatrix<half>& grad, bool accumulateGradient) const
{
RuntimeError("half MaxPoolingBackward not supported.");
}
template <>
void CPUMatrix<half>::MaxROIPoolingBackward(const size_t numRois, const size_t numImg, const size_t channels, const size_t width, const size_t height,
const size_t pooledWidth, const size_t pooledHeight, const CPUMatrix<half>& roiData, CPUMatrix<half>& grad,
CPUMatrix<half>& argmax, double spatialScale) const
{
RuntimeError("half MaxROIPoolingBackward not supported.");
}
template <>
void CPUMatrix<half>::AveragePoolingBackward(const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIndices, const CPUMatrix<int>& indices, CPUMatrix<half>& grad, const bool poolIncludePad, bool accumulateGradient) const
{
RuntimeError("half AveragePoolingBackward not supported.");
}
// explicit instantiations; CPUMatrix is too big and would otherwise crash the VS2015 compiler.
template class MATH_API CPUMatrix<half>;
// instantiate templated methods
template void CPUMatrix<float>::AdaDelta(CPUMatrix<float>& gradients, CPUMatrix<float>& functionValues, float learningRate, float rho, float epsilon);
template void CPUMatrix<double>::AdaDelta(CPUMatrix<double>& gradients, CPUMatrix<double>& functionValues, double learningRate, double rho, double epsilon);
template void CPUMatrix<float>::AdaDelta(CPUMatrix<half>& gradients, CPUMatrix<float>& functionValues, float learningRate, float rho, float epsilon);
template void CPUMatrix<float>::BatchNormalizationForward(const CPUMatrix<float>& scale, const CPUMatrix<float>& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, CPUMatrix<float>& runMean, CPUMatrix<float>& runVariance, CPUMatrix<float>& out, double epsilon, CPUMatrix<float>& saveMean, CPUMatrix<float>& saveInvStdDev) const;
template void CPUMatrix<double>::BatchNormalizationForward(const CPUMatrix<double>& scale, const CPUMatrix<double>& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, CPUMatrix<double>& runMean, CPUMatrix<double>& runVariance, CPUMatrix<double>& out, double epsilon, CPUMatrix<double>& saveMean, CPUMatrix<double>& saveInvStdDev) const;
template void CPUMatrix<half>::BatchNormalizationForward(const CPUMatrix<float>& scale, const CPUMatrix<float>& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, CPUMatrix<float>& runMean, CPUMatrix<float>& runVariance, CPUMatrix<half>& out, double epsilon, CPUMatrix<float>& saveMean, CPUMatrix<float>& saveInvStdDev) const;
template void CPUMatrix<float>::BatchNormalizationBackward(const CPUMatrix<float>& in, CPUMatrix<float>& grad, const CPUMatrix<float>& scale, double blendFactor, const CPUMatrix<float>& saveMean, const CPUMatrix<float>& saveInvStdDev, CPUMatrix<float>& scaleGrad, CPUMatrix<float>& biasGrad) const;
template void CPUMatrix<double>::BatchNormalizationBackward(const CPUMatrix<double>& in, CPUMatrix<double>& grad, const CPUMatrix<double>& scale, double blendFactor, const CPUMatrix<double>& saveMean, const CPUMatrix<double>& saveInvStdDev, CPUMatrix<double>& scaleGrad, CPUMatrix<double>& biasGrad) const;
template void CPUMatrix<half>::BatchNormalizationBackward(const CPUMatrix<half>& in, CPUMatrix<half>& grad, const CPUMatrix<float>& scale, double blendFactor, const CPUMatrix<float>& saveMean, const CPUMatrix<float>& saveInvStdDev, CPUMatrix<float>& scaleGrad, CPUMatrix<float>& biasGrad) const;
}}}
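All of these half specializations follow the same pattern: since there is no optimized half-precision math on the CPU, inputs are widened to float, the existing float kernel does the actual work, and the result is narrowed back to half; operations that are not worth emulating simply raise RuntimeError. The pattern in isolation (FloatKernel is a placeholder for whatever optimized float routine does the work, e.g. a GEMM):

    // Hedged sketch of the widen / compute-in-float / narrow pattern used above.
    #include <cstddef>
    #include <vector>

    template <class Half, class FloatKernel>
    void RunHalfViaFloat(const Half* in, Half* out, size_t n, FloatKernel kernel)
    {
        std::vector<float> inF(n), outF(n);
        for (size_t i = 0; i < n; i++) inF[i] = (float)in[i];  // half -> float
        kernel(inF.data(), outF.data(), n);                    // compute in float
        for (size_t i = 0; i < n; i++) out[i] = (Half)outF[i]; // float -> half
    }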

Diff not shown because of its large size.

View file

@ -0,0 +1,30 @@
#include "stdafx.h"
#include "CPUMatrixTensorImpl.h"
namespace Microsoft { namespace MSR { namespace CNTK {
template
void CPUMatrixTensorOpImpl(double beta, const CPUMatrix<double>& a, CPUMatrix<double>& o, double alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides);
template
void CPUMatrixTensorOpImpl(double beta, const CPUMatrix<double>& a, const CPUMatrix<double>& b, CPUMatrix<double>& o, double alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 3>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 3>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 3>& reducingStrides);
template
void CPUMatrixTensorOpImpl(double beta, const CPUMatrix<double>& a, const CPUMatrix<double>& b, const CPUMatrix<double>& c, CPUMatrix<double>& o, double alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 4>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 4>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 4>& reducingStrides);
template
void CPUMatrixTensorArgOpImpl(const CPUMatrix<double>& a, CPUMatrix<double>& o, ElementWiseOperator reductionOp,
const array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides);
}}}

View file

@ -0,0 +1,30 @@
#include "stdafx.h"
#include "CPUMatrixTensorImpl.h"
namespace Microsoft { namespace MSR { namespace CNTK {
template
void CPUMatrixTensorOpImpl(float beta, const CPUMatrix<float>& a, CPUMatrix<float>& o, float alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides);
template
void CPUMatrixTensorOpImpl(float beta, const CPUMatrix<float>& a, const CPUMatrix<float>& b, CPUMatrix<float>& o, float alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 3>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 3>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 3>& reducingStrides);
template
void CPUMatrixTensorOpImpl(float beta, const CPUMatrix<float>& a, const CPUMatrix<float>& b, const CPUMatrix<float>& c, CPUMatrix<float>& o, float alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 4>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 4>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 4>& reducingStrides);
template
void CPUMatrixTensorArgOpImpl(const CPUMatrix<float>& a, CPUMatrix<float>& o, ElementWiseOperator reductionOp,
const array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides);
}}}

View file

@ -0,0 +1,30 @@
#include "stdafx.h"
#include "CPUMatrixTensorImpl.h"
namespace Microsoft { namespace MSR { namespace CNTK {
template
void CPUMatrixTensorOpImpl(half beta, const CPUMatrix<half>& a, CPUMatrix<half>& o, half alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides);
template
void CPUMatrixTensorOpImpl(half beta, const CPUMatrix<half>& a, const CPUMatrix<half>& b, CPUMatrix<half>& o, half alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 3>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 3>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 3>& reducingStrides);
template
void CPUMatrixTensorOpImpl(half beta, const CPUMatrix<half>& a, const CPUMatrix<half>& b, const CPUMatrix<half>& c, CPUMatrix<half>& o, half alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 4>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 4>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 4>& reducingStrides);
template
void CPUMatrixTensorArgOpImpl(const CPUMatrix<half>& a, CPUMatrix<half>& o, ElementWiseOperator reductionOp,
const array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides);
}}}

View file

@ -0,0 +1,519 @@
// Moved out of CPUMatrixImpl.h to keep the compiler from running out of heap and crashing
#include "CPUMatrix.h"
#include "TensorOps.h"
namespace Microsoft { namespace MSR { namespace CNTK {
// =======================================================================
// TensorView support
// =======================================================================
// To save time, this makes extensive use of templates and macros.
// -----------------------------------------------------------------------
// function to compute the value for a given output location (perform reduction if needed)
// -----------------------------------------------------------------------
// perform loop over reduction index m
// This function is declared inside a wrapper struct to allow partial specialization (m = -1).
template <class ElemType, typename OPFN, typename ReductionOp, size_t N, int m>
struct TensorOpReduction
{
// reduction case (non-reduction case is specialized)
static inline ElemType Loop(array<ElemType*, N> pointers, const OPFN& opfn, const ReductionOp& reductionOp,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides)
{
array<ptrdiff_t, N - 1> strides; // N-1 because last one is the result pointer, which is unused in reduction
for (size_t i = 0; i < N - 1; i++) // N = a small constant, this will be unrolled
strides[i] = reducingStrides[i][(size_t) m];
double aggregate = TensorOpReduction<ElemType, OPFN, ReductionOp, N, m - 1>::Loop(pointers, opfn, reductionOp, reducingOpDims, reducingStrides);
for (size_t dim = reducingOpDims[(size_t)m] - 1; dim-- > 0;)
{
// advance the pointers
for (size_t i = 0; i < N - 1; i++)
pointers[i] += strides[i]; // note: last pointer (result) is unused and untouched here
// need to descend into one loop deeper
aggregate = reductionOp(aggregate, TensorOpReduction<ElemType, OPFN, ReductionOp, N, m - 1>::Loop(pointers, opfn, reductionOp, reducingOpDims, reducingStrides));
}
// It would actually be nicer to return double, but we keep ElemType so that tests don't return different numbers than the previous implementation.
return static_cast<ElemType>(aggregate);
}
};
// perform loop over reduction index m
// This is the specialized version for m = -1, which terminates the recursion.
template <class ElemType, typename OPFN, typename ReductionOp, size_t N>
struct TensorOpReduction<ElemType, OPFN, ReductionOp, N, -1>
{
static inline ElemType Loop(array<ElemType*, N> pointers, const OPFN& opfn, const ReductionOp& /*reductionOp*/,
const SmallVector<size_t>&, const array<SmallVector<ptrdiff_t>, N>&)
{
return opfn(pointers); // finally we are doing some work!!!
}
};
// perform loop over reduction index m, while keeping track of the number of elements and their corresponding indices.
// This function is declared inside a wrapper struct to allow partial specialization (m = -1).
template <class ElemType, size_t N, int m>
struct TensorArgOpReduction
{
static inline std::pair<ElemType, size_t> ReduceAll(array<ElemType*, N> pointers, const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides,
ElementWiseOperator reductionOp)
{
size_t counter = 0;
size_t index = 0;
ElemType val = (ElemType)0;
switch (reducingOpDims.size())
{
case 3:
val = TensorArgOpReduction<ElemType, N, 2>::Loop(pointers, reducingOpDims, reducingStrides, reductionOp, counter, index);
break;
case 2:
val = TensorArgOpReduction<ElemType, N, 1>::Loop(pointers, reducingOpDims, reducingStrides, reductionOp, counter, index);
break;
case 1:
val = TensorArgOpReduction<ElemType, N, 0>::Loop(pointers, reducingOpDims, reducingStrides, reductionOp, counter, index);
break;
case 0:
val = TensorArgOpReduction<ElemType, N, -1>::Loop(pointers, reducingOpDims, reducingStrides, reductionOp, counter, index);
break;
default:
LogicError("TensorOp: %d non-flattened input dimensions are not supported.", (int)reducingOpDims.size());
}
return make_pair(val, index);
}
// reduction case (non-reduction case is specialized)
static inline ElemType Loop(array<ElemType*, N> pointers, const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides,
ElementWiseOperator reductionOp, size_t& counter, size_t& index)
{
array<ptrdiff_t, N - 1> strides; // N-1 because last one is the result pointer, which is unused in reduction
for (size_t i = 0; i < N - 1; i++) // N = a small constant, this will be unrolled
strides[i] = reducingStrides[i][(size_t)m];
ElemType aggregate = TensorArgOpReduction<ElemType, N, m - 1>::Loop(pointers, reducingOpDims, reducingStrides, reductionOp, counter, index);
for (size_t dim = reducingOpDims[(size_t)m] - 1; dim-- > 0;)
{
// advance the pointers
for (size_t i = 0; i < N - 1; i++)
pointers[i] += strides[i]; // note: last pointer (result) is unused and untouched here
ElemType val = TensorArgOpReduction<ElemType, N, m - 1>::Loop(pointers, reducingOpDims, reducingStrides, reductionOp, counter, index);
bool update = false;
switch (reductionOp)
{
case ElementWiseOperator::opArgmin:
update = (aggregate > val);
break;
case ElementWiseOperator::opArgmax:
update = (aggregate < val);
break;
}
if (update)
{
aggregate = val;
index = counter - 1;
}
}
return aggregate;
}
};
// perform loop over reduction index m
// This is the specialized version for m = -1, which terminates the recursion.
template <class ElemType, size_t N>
struct TensorArgOpReduction<ElemType, N, -1>
{
static inline ElemType Loop(array<ElemType*, N> pointers,
const SmallVector<size_t>&, const array<SmallVector<ptrdiff_t>, N>&, ElementWiseOperator /*reductionOp*/, size_t& counter, size_t& /*index*/)
{
counter++;
return *pointers[0]; // finally we are doing some work!!!
}
};
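TensorArgOpReduction is the machinery behind argmin/argmax: it walks the reduction dimensions, keeps a running best value, and records the flat counter at which that value was seen. Stripped of the stride bookkeeping, the computation per output element reduces to something like:

    // Hedged flat sketch of the argmax case handled above.
    #include <cstddef>

    template <class ElemType>
    size_t ArgMax(const ElemType* p, size_t n)
    {
        size_t best = 0;
        for (size_t i = 1; i < n; i++)
            if (p[best] < p[i])  // opArgmax keeps the larger value; opArgmin flips the comparison
                best = i;
        return best;
    }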
// -----------------------------------------------------------------------
// perform loop over regular index k for N-nary operations (N counting the output)
// -----------------------------------------------------------------------
// perform loop over regular index k and reducing index m for N operands (counting the output)
template <class ElemType, typename OPFN, typename ReductionOp, size_t N, bool vectorizable, int m, int k>
struct TensorOpIteration
{
static inline void Loop(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, const OPFN& opfn, const ReductionOp& reductionOp,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, N>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides)
{
// non-scalar case: still nested result loops left
array<ptrdiff_t, N> strides;
for (size_t i = 0; i < N; i++) // N = a small constant, this will be unrolled
strides[i] = regularStrides[i][(size_t) k];
for (size_t dim = regularOpDims[(size_t) k]; dim-- > 0;)
{
// need to descend into one loop deeper
TensorOpIteration<ElemType, OPFN, ReductionOp, N, vectorizable, m, k - 1>::Loop(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
// advance the pointers
for (size_t i = 0; i < N; i++)
pointers[i] += strides[i];
}
}
};
// Special version for innermost loop with strides all being 1 and no further reduction. Compiler can use SSE.
// This is a very common case, e.g. adding vectors or computing the Sigmoid.
template <class ElemType, typename OPFN, typename ReductionOp>
struct TensorOpIteration<ElemType, OPFN, ReductionOp, 3, true /*vectorizable*/, -1 /*no reduction*/, 0 /*innermost loop*/>
{
static inline void Loop(ElemType beta, array<ElemType*, 3> pointers, ElemType alpha, const OPFN& opfn, const ReductionOp& reductionOp,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 3>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 3>& reducingStrides)
{
ElemType* pa = pointers[0];
ElemType* pb = pointers[1];
ElemType* pc = pointers[2];
size_t K = regularOpDims[0];
// special-case beta and alpha to allow the compiler to short-circuit it
if (beta != 0)
#pragma omp parallel for
for (int k = 0; k < (int) K; k++)
TensorOpIteration<ElemType, OPFN, ReductionOp, 3, true /*vectorizable*/, -1 /*no reduction*/, -1 /*scalar*/>::Loop(beta, array<ElemType*, 3>{pa + k, pb + k, pc + k}, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
else if (alpha != 1)
#pragma omp parallel for
for (int k = 0; k < (int) K; k++)
TensorOpIteration<ElemType, OPFN, ReductionOp, 3, true /*vectorizable*/, -1 /*no reduction*/, -1 /*scalar*/>::Loop(0, array<ElemType*, 3>{pa + k, pb + k, pc + k}, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
else
#pragma omp parallel for
for (int k = 0; k < (int) K; k++)
TensorOpIteration<ElemType, OPFN, ReductionOp, 3, true /*vectorizable*/, -1 /*no reduction*/, -1 /*scalar*/>::Loop(0, array<ElemType*, 3>{pa + k, pb + k, pc + k}, 1, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
// TODO: According to Amit, the VS compiler is not able to vectorize into lambdas. Solution: change the lambda to take an N, or to implement the loop inside (with 1 element by default).
// TODO: The signedness of k (required for omp) causes an extra sign-extend.
// TODO: OMP adds LOTS of overhead. Do we need a guard, a min size when to use it?
}
};
// and unary
template <class ElemType, typename OPFN, typename ReductionOp>
struct TensorOpIteration<ElemType, OPFN, ReductionOp, 2, true /*vectorizable*/, -1 /*no reduction*/, 0 /*innermost loop*/>
{
static inline void Loop(ElemType beta, array<ElemType*, 2> pointers, ElemType alpha, const OPFN& opfn, const ReductionOp& reductionOp,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides)
{
ElemType* pa = pointers[0];
ElemType* pb = pointers[1];
size_t K = regularOpDims[0];
// special-case beta and alpha to allow the compiler to short-circuit it
if (beta != 0)
#pragma omp parallel for
for (int k = 0; k < (int) K; k++)
TensorOpIteration<ElemType, OPFN, ReductionOp, 2, true /*vectorizable*/, -1 /*no reduction*/, -1 /*scalar*/>::Loop(beta, array<ElemType*, 2>{pa + k, pb + k}, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
else if (alpha != 1)
#pragma omp parallel for
for (int k = 0; k < (int) K; k++)
TensorOpIteration<ElemType, OPFN, ReductionOp, 2, true /*vectorizable*/, -1 /*no reduction*/, -1 /*scalar*/>::Loop(0, array<ElemType*, 2>{pa + k, pb + k}, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
else
#pragma omp parallel for
for (int k = 0; k < (int) K; k++)
TensorOpIteration<ElemType, OPFN, ReductionOp, 2, true /*vectorizable*/, -1 /*no reduction*/, -1 /*scalar*/>::Loop(0, array<ElemType*, 2>{pa + k, pb + k}, 1, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
}
};
template <class ElemType, typename OPFN, typename ReductionOp, size_t N, bool vectorizable, int m>
struct TensorOpIteration<ElemType, OPFN, ReductionOp, N, vectorizable, m, -1>
{
static inline void Loop(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, const OPFN& opfn, const ReductionOp& reductionOp,
const SmallVector<size_t>&, const array<SmallVector<ptrdiff_t>, N>&,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides)
{
// we are at element level for the result: perform the op (there may still be reduction)
ElemType val = TensorOpReduction<ElemType, OPFN, ReductionOp, N, m>::Loop(pointers, opfn, reductionOp, reducingOpDims, reducingStrides);
// scale
val *= alpha;
// combine with previous value in target matrix, then write it out
auto* pout = pointers.back();
if (beta != 0)
val += beta * *pout;
// save
*pout = val;
return;
}
};
// Perform the loop over regular index k and reducing index m for N operands (counting the output). The difference
// between TensorOpIteration and TensorArgOpIteration is that the latter stores the index of the result instead of
// the result itself. They are kept separate for performance reasons.
template <class ElemType, size_t N, int k>
struct TensorArgOpIteration
{
static inline void Loop(array<ElemType*, N> pointers,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, N>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides, ElementWiseOperator reductionOp)
{
// non-scalar case: still nested result loops left
array<ptrdiff_t, N> strides;
for (size_t i = 0; i < N; i++) // N = a small constant, this will be unrolled
strides[i] = regularStrides[i][(size_t)k];
for (size_t dim = regularOpDims[(size_t)k]; dim-- > 0;)
{
// need to descend into one loop deeper
TensorArgOpIteration<ElemType, N, k - 1>::Loop(pointers, regularOpDims, regularStrides, reducingOpDims, reducingStrides, reductionOp);
// advance the pointers
for (size_t i = 0; i < N; i++)
pointers[i] += strides[i];
}
}
};
template <class ElemType, size_t N>
struct TensorArgOpIteration<ElemType, N, -1>
{
static inline void Loop(array<ElemType*, N> pointers,
const SmallVector<size_t>&, const array<SmallVector<ptrdiff_t>, N>&,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides, ElementWiseOperator reductionOp)
{
// we are at element level for the result: perform the op (there may still be reduction)
auto val = TensorArgOpReduction<ElemType, N, 2>::ReduceAll(pointers, reducingOpDims, reducingStrides, reductionOp);
auto* pout = pointers.back();
*pout = (ElemType)val.second;
return;
}
};
// -----------------------------------------------------------------------
// map runtime parameters N to template parameters
// -----------------------------------------------------------------------
// tensor operation with k+1 dimensions (-1 means scalar)
template <class ElemType, typename OPFN, typename ReductionOp, size_t N, int k>
static void TensorOpWithRegularLoop(ElemType beta, const array<ElemType*, N>& pointers, ElemType alpha, const OPFN& opfn, ReductionOp reductionOp,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, N>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides)
{
size_t dims = reducingOpDims.size();
switch (dims)
{
case 2:
return TensorOpIteration<ElemType, OPFN, ReductionOp, N, false /*vectorizable*/, 1, k>::Loop(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 1:
return TensorOpIteration<ElemType, OPFN, ReductionOp, N, false /*vectorizable*/, 0, k>::Loop(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 0:
{
// if all leading dimensions are 1, we can let the compiler do some unrolling
bool leadingAllOne = true;
for (size_t i = 0; i < N; i++)
leadingAllOne &= k >= 0 && regularStrides[i][0] == 1;
if (leadingAllOne) // special version that uses a hard-coded increment of 1 for all leading dimensions
return TensorOpIteration<ElemType, OPFN, ReductionOp, N, true /*vectorizable*/, -1, k>::Loop(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
else
return TensorOpIteration<ElemType, OPFN, ReductionOp, N, false /*vectorizable*/, -1, k>::Loop(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
}
default:
LogicError("TensorOp: %d non-flattened reduction dimensions are not supported.", (int) dims);
}
}
// tensor operation, generalized in number of arguments, operation already provided as a lambda
// This function now expands into different k.
template <class ElemType, typename OPFN, typename ReductionOp, size_t N>
static void TensorOpWithFnAndReduction(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, const OPFN& opfn, const ReductionOp& reductionOp,
const array<size_t, N>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, N>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides)
{
for (size_t i = 0; i < N; i++) // N = a small constant, this will be unrolled
pointers[i] += offsets[i];
size_t dims = regularOpDims.size();
switch (dims)
{
// N.B. consider code size impact when adding more cases.
case 5:
return TensorOpWithRegularLoop<ElemType, OPFN, ReductionOp, N, 4>(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 4:
return TensorOpWithRegularLoop<ElemType, OPFN, ReductionOp, N, 3>(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 3:
return TensorOpWithRegularLoop<ElemType, OPFN, ReductionOp, N, 2>(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 2:
return TensorOpWithRegularLoop<ElemType, OPFN, ReductionOp, N, 1>(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 1:
return TensorOpWithRegularLoop<ElemType, OPFN, ReductionOp, N, 0>(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 0:
return TensorOpWithRegularLoop<ElemType, OPFN, ReductionOp, N, -1>(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
default:
LogicError("TensorOp: %d non-flattened input dimensions are not supported.", (int)dims);
}
}
// tensor operation, generalized in number of arguments, operation already provided as a lambda
// This function now expands into different reductionOps
template <class ElemType, typename OPFN, size_t N>
static void TensorOpWithFn(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, const OPFN& opfn, ElementWiseOperator reductionOp,
const array<size_t, N>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, N>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides)
{
// BUGBUG: We always use 'double' as the aggregator type, even for ElemType==float. Reason: otherwise some e2e tests would fail, as historically we
// used double as the aggregator for sum. But:
// * for min and max reductions this is meaningless.
// * It is not consistent with what we do on GPU, where we aggregate on ElemType.
// * It costs performance.
// TODO: adapt the e2e tests to run with an aggregator of type ElemType.
#define CaseTensorOpWithFnAndReduction(oper) \
case ElementWiseOperator::op##oper: \
return TensorOpWithFnAndReduction(beta, pointers, alpha, opfn, [](double a, double b) \
{ \
return Op##oper(a, b); \
}, \
offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides)
switch (reductionOp)
{
CaseTensorOpWithFnAndReduction(Sum);
CaseTensorOpWithFnAndReduction(LogSum);
CaseTensorOpWithFnAndReduction(Min);
CaseTensorOpWithFnAndReduction(Max);
CaseTensorOpWithFnAndReduction(ElementwiseProduct);
default:
LogicError("Specified ElementWiseOperator op %d not supported as reduction operation.", (int)reductionOp);
}
}
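// For reference, CaseTensorOpWithFnAndReduction(Sum) expands to
//     case ElementWiseOperator::opSum:
//         return TensorOpWithFnAndReduction(beta, pointers, alpha, opfn,
//                                           [](double a, double b) { return OpSum(a, b); },
//                                           offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);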
// -----------------------------------------------------------------------
// entry points from Matrix.cpp; also map op to a lambda
// -----------------------------------------------------------------------
// perform unary operation 'op' on a, giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides
// This maps 'op' to a lambda.
template <class ElemType>
void CPUMatrixTensorOpImpl(ElemType beta, const CPUMatrix<ElemType>& a, CPUMatrix<ElemType>& o, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides)
{
if (reductionOp != ElementWiseOperator::opSum &&
reductionOp != ElementWiseOperator::opLogSum &&
reductionOp != ElementWiseOperator::opMin &&
reductionOp != ElementWiseOperator::opMax &&
reductionOp != ElementWiseOperator::opElementwiseProduct)
InvalidArgument("TensorOp: Unary reduction operations other than opMax, opMin, opSum, and opLogSum are not implemented.");
// TODO: Change the lambda to take a pointer and a number of elements, so that we can pass it 1 or 4 elements, in order for it to SSE-vectorize.
#define CaseUnaryTensorOp(oper) \
case ElementWiseOperator::op##oper: \
return TensorOpWithFn(beta, pointers, alpha, [](const array<ElemType*, 2>& pp) \
{ \
return Op##oper((*(pp[0]))); \
}, \
reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides)
array<ElemType*, 2> pointers = {a.Data(), o.Data()};
switch (op)
{
ForAllUnaryOps(CaseUnaryTensorOp);
default:
LogicError("TensorOp: Unknown unary op code %d.", (int) op);
}
}
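// For reference (assuming Sigmoid is among ForAllUnaryOps), CaseUnaryTensorOp(Sigmoid) expands to
//     case ElementWiseOperator::opSigmoid:
//         return TensorOpWithFn(beta, pointers, alpha,
//                               [](const array<ElemType*, 2>& pp) { return OpSigmoid((*(pp[0]))); },
//                               reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);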
// perform binary operation 'op' on a and b giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides
// This maps 'op' to a lambda.
template <class ElemType>
void CPUMatrixTensorOpImpl(ElemType beta, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, CPUMatrix<ElemType>& o, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 3>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 3>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 3>& reducingStrides)
{
if (reductionOp != ElementWiseOperator::opSum)
InvalidArgument("TensorOp (binary): The only permitted binary reduction operation is opSum.");
#define CaseBinaryTensorOp(oper) \
case ElementWiseOperator::op##oper: \
return TensorOpWithFn(beta, pointers, alpha, [](const array<ElemType*, 3>& pp) \
{ \
return Op##oper((*(pp[0])), (*(pp[1]))); \
}, \
reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides)
array<ElemType*, 3> pointers = {a.Data(), b.Data(), o.Data()};
switch (op)
{
ForAllBinaryOps(CaseBinaryTensorOp);
default:
LogicError("TensorOp: Unknown op binary code %d.", (int) op);
}
}
// perform ternary operation 'op' on a, b, and c, giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides
// This maps 'op' to a lambda.
template <class ElemType>
void CPUMatrixTensorOpImpl(ElemType beta, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, const CPUMatrix<ElemType>& c, CPUMatrix<ElemType>& o, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 4>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 4>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 4>& reducingStrides)
{
if (reductionOp != ElementWiseOperator::opSum)
InvalidArgument("TensorOp: The only permitted ternary reduction operation is opSum.");
#define CaseTernaryTensorOp(oper) \
case ElementWiseOperator::op##oper: \
return TensorOpWithFn(beta, pointers, alpha, [](const array<ElemType*, 4>& pp) \
{ \
return Op##oper((*(pp[0])), (*(pp[1])), (*(pp[2]))); \
}, \
reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides)
array<ElemType*, 4> pointers = {a.Data(), b.Data(), c.Data(), o.Data()};
switch (op)
{
ForAllTernaryOps(CaseTernaryTensorOp);
default:
LogicError("TensorOp: Unknown ternary op code %d.", (int) op);
}
}
template <class ElemType>
void CPUMatrixTensorArgOpImpl(const CPUMatrix<ElemType>& a, CPUMatrix<ElemType>& o, ElementWiseOperator reductionOp,
const array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides)
{
if (reductionOp != ElementWiseOperator::opArgmin &&
reductionOp != ElementWiseOperator::opArgmax)
InvalidArgument("TensorOp: Arg reduction operations other than opArgmax, and opArgmin are not implemented.");
if (o.GetNumElements() == 1)
{
o.Data()[0] = (ElemType) a.ArgOp(reductionOp);
}
else
{
const size_t N = 2;
array<ElemType*, N> pointers = { a.Data(), o.Data() };
for (size_t i = 0; i < N; i++)
pointers[i] += offsets[i];
switch (regularOpDims.size())
{
case 2:
TensorArgOpIteration<ElemType, N, 1>::Loop(pointers, regularOpDims, regularStrides, reducingOpDims, reducingStrides, reductionOp);
break;
case 1:
TensorArgOpIteration<ElemType, N, 0>::Loop(pointers, regularOpDims, regularStrides, reducingOpDims, reducingStrides, reductionOp);
break;
case 0:
TensorArgOpIteration<ElemType, N, -1>::Loop(pointers, regularOpDims, regularStrides, reducingOpDims, reducingStrides, reductionOp);
break;
default:
LogicError("TensorOp: %d non-flattened input dimensions are not supported.", (int)regularOpDims.size());
}
}
}
}}}
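All of the dispatch above eventually reduces to walking the N operand pointers by their per-dimension strides and applying the op lambda at each output element. The following self-contained sketch (plain C++, not CNTK code; all names are illustrative) distills that innermost pattern for the two-operand case:

    #include <array>
    #include <cstddef>
    #include <cstdio>
    #include <vector>

    // Two-operand elementwise loop in the style of TensorOpIteration: 'opfn' receives the
    // pointer array, alpha scales the result, beta blends with the previous output value.
    template <typename ElemType, typename OPFN>
    void ElementwiseLoop(ElemType beta, std::array<ElemType*, 2> pointers, ElemType alpha,
                         const OPFN& opfn, std::size_t K, const std::array<std::ptrdiff_t, 2>& strides)
    {
        for (std::size_t k = 0; k < K; k++)
        {
            ElemType val = alpha * opfn(pointers); // apply the op at the current element
            ElemType* pout = pointers.back();
            if (beta != 0)
                val += beta * *pout;               // blend with the previous output value
            *pout = val;
            for (std::size_t i = 0; i < 2; i++)    // advance each operand by its stride
                pointers[i] += strides[i];
        }
    }

    int main()
    {
        std::vector<float> a = {1, 2, 3, 4}, o(4, 0.0f);
        std::array<float*, 2> pointers = {a.data(), o.data()};
        std::array<std::ptrdiff_t, 2> strides = {1, 1};
        // o = 2 * a: alpha == 2, beta == 0, opfn just reads the first operand
        ElementwiseLoop<float>(0.0f, pointers, 2.0f,
                               [](const std::array<float*, 2>& pp) { return *(pp[0]); },
                               a.size(), strides);
        for (float v : o)
            std::printf("%g ", v); // prints: 2 4 6 8
        std::printf("\n");
        return 0;
    }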

View file

@ -20,6 +20,7 @@
#ifdef LEAKDETECT
#include <vld.h>
#endif
#include "half.hpp"
#pragma warning(disable : 4127) // conditional expression is constant; "if (sizeof(ElemType)==sizeof(float))" triggers this
@ -480,7 +481,7 @@ CPUSparseMatrix<ElemType>& CPUSparseMatrix<ElemType>::DoScatterColumnsOf(ElemTyp
// TODO: Replace with std::exclusive_scan when we switch to C++17
for (size_t i = 1; i <= GetNumCols(); ++i)
SecondaryIndexLocation()[i] = SecondaryIndexLocation()[i - 1] + columnElementCounts[i - 1];
size_t offset = a.SecondaryIndexLocation()[0];
// TODO: Does it make sense to parallelize this?
for (long j = 0; j < numColsToWrite; j++)
@ -531,7 +532,7 @@ void CPUSparseMatrix<ElemType>::Print(const char* matrixName, ptrdiff_t /*rowSta
fprintf(stderr, "\n");
j++;
}
fprintf(stderr, "%d:%.f ", unCompressedIndex[i], dataBuffer[i]);
fprintf(stderr, "%d:%.f ", unCompressedIndex[i], (double)dataBuffer[i]);
}
fprintf(stderr, "\n");
}
@ -721,7 +722,7 @@ void CPUSparseMatrix<ElemType>::SetMatrixFromSBCFormat(const size_t* blockIds, c
template <class ElemType>
ElemType* CPUSparseMatrix<ElemType>::Data() const
{
return (Buffer() +
((GetFormat() == matrixFormatSparseCSC || GetFormat() == matrixFormatSparseCSR) ? GetCompIndex()[m_sliceViewOffset] : 0));
}
@ -810,7 +811,7 @@ template <class ElemType>
void CPUSparseMatrix<ElemType>::RequireSizeAndAllocate(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve, const MatrixFormat matrixFormat, const bool growOnly /*= true*/, bool keepExistingValues /*= true*/)
{
RequireSize(numRows, numCols, numNZElemToReserve, matrixFormat, growOnly);
size_t newCompIndexSize = (numCols > numRows ? numCols : numRows) + 1;
bool reallocate = (GetSizeAllocated() < numNZElemToReserve || (GetSizeAllocated() > numNZElemToReserve && !growOnly) || GetCompIndexSize() < newCompIndexSize);
@ -964,7 +965,7 @@ public:
else if ( denseTimesSparse && transposeA) denseVal = dense( innerIndex, outerIndexDense);
else if (!denseTimesSparse && !transposeB) denseVal = dense( innerIndex, outerIndexDense);
else if (!denseTimesSparse && transposeB) denseVal = dense(outerIndexDense, innerIndex);
// Update matrix c.
if (denseTimesSparse)
@ -1312,7 +1313,7 @@ void CPUSparseMatrix<ElemType>::InnerProduct(const CPUSparseMatrix<ElemType>& a,
}
// A helper method used in MomentumSGDUpdate and NesterovAcceleratedMomentumSGDUpdate.
// Modifies the smoothed gradients "c", as well as the current gradients "this" on which this method is invoked.
// Classic momentum (unitGainFactor == 1.0):
// 1) c = momentum * c + this
// Unit-gain momentum (unitGainFactor == 1.0 - momentum):
@ -1423,7 +1424,8 @@ ElemType CPUSparseMatrix<ElemType>::Adagrad(CPUMatrix<ElemType>& c, const bool n
}
template <class ElemType>
void CPUSparseMatrix<ElemType>::AdaDelta(CPUMatrix<ElemType>& c, CPUMatrix<ElemType>& functionValues, ElemType learningRate, ElemType rho, ElemType epsilon, int* timestamps, int currentTimestamp)
template <class AccumType>
void CPUSparseMatrix<ElemType>::AdaDelta(CPUMatrix<AccumType>& c, CPUMatrix<AccumType>& functionValues, AccumType learningRate, AccumType rho, AccumType epsilon, int* timestamps, int currentTimestamp)
{
size_t numColsNeeded = 2 * GetNumCols();
@ -1441,9 +1443,9 @@ void CPUSparseMatrix<ElemType>::AdaDelta(CPUMatrix<ElemType>& c, CPUMatrix<ElemT
size_t n = GetNumElements();
ElemType* grad = Data();
ElemType* smoothAda = c.Data();
ElemType* smoothX2 = c.Data() + n;
ElemType* val = functionValues.Data();
AccumType* smoothAda = c.Data();
AccumType* smoothX2 = c.Data() + n;
AccumType* val = functionValues.Data();
auto rows = GetNumRows();
#pragma omp parallel for
@ -1459,10 +1461,10 @@ void CPUSparseMatrix<ElemType>::AdaDelta(CPUMatrix<ElemType>& c, CPUMatrix<ElemT
{
size_t denseIndex = columnOffset + row;
ElemType g = grad[blockOffset + row];
ElemType adaSqr = rho * decay * smoothAda[denseIndex] + (1 - rho) * g * g;
AccumType adaSqr = rho * decay * smoothAda[denseIndex] + (1 - rho) * g * g;
smoothAda[denseIndex] = adaSqr;
ElemType x2 = decay * smoothX2[denseIndex];
ElemType deltaX = -sqrt(x2 + epsilon) / sqrt(adaSqr + epsilon) * g;
AccumType x2 = decay * smoothX2[denseIndex];
AccumType deltaX = -sqrt(x2 + epsilon) / sqrt(adaSqr + epsilon) * g;
smoothX2[denseIndex] = rho * x2 + (1 - rho) * deltaX * deltaX;
val[denseIndex] += learningRate * deltaX;
}
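For reference, apart from the decay factor that ages the accumulators of columns not touched since their last update, the loop above implements the standard AdaDelta rule, with an additional learning-rate scale on the final step:

    E[g^2]_t = \rho\, E[g^2]_{t-1} + (1-\rho)\, g_t^2
    \Delta x_t = -\frac{\sqrt{E[\Delta x^2]_{t-1} + \epsilon}}{\sqrt{E[g^2]_t + \epsilon}}\, g_t
    E[\Delta x^2]_t = \rho\, E[\Delta x^2]_{t-1} + (1-\rho)\, \Delta x_t^2
    x_{t+1} = x_t + \eta\, \Delta x_t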
@ -1708,6 +1710,18 @@ ElemType CPUSparseMatrix<ElemType>::SumOfElements() const
return sum;
}
// Specialization that raises RuntimeError for now, because the OMP implementation only supports built-in types
template <>
half CPUSparseMatrix<half>::FrobeniusNorm() const
{
RuntimeError("half FrobeniusNorm not supported.");
}
template <>
half CPUSparseMatrix<half>::SumOfElements() const
{
RuntimeError("half SumOfElements not supported.");
}
template <typename ElemType>
MATH_API File& operator>>(File& stream, CPUSparseMatrix<ElemType>& us)
{
@ -1763,8 +1777,54 @@ MATH_API File& operator>>(File& stream, CPUSparseMatrix<ElemType>& us)
template MATH_API File& operator>>(File& stream, CPUSparseMatrix<float>& us);
template MATH_API File& operator>>(File& stream, CPUSparseMatrix<double>& us);
template <typename ElemType>
MATH_API File& operator<<(File& stream, const CPUSparseMatrix<ElemType>& us)
{
if (us.GetFormat() != matrixFormatSparseCSC && us.GetFormat() != matrixFormatSparseCSR)
NOT_IMPLEMENTED;
stream.PutMarker(fileMarkerBeginSection, std::wstring(L"BMAT"));
stream << sizeof(ElemType);
stream << std::wstring(L"nnmatrix"); // Note this is needed for compatability, and could potentially be an empty string
size_t nz = us.NzCount(), numRows = us.GetNumRows(), numCols = us.GetNumCols();
size_t compressedSize = us.SecondaryIndexCount();
int format = us.GetFormat();
stream << format << nz << numCols << numRows;
if (nz > 0)
{
ElemType* dataBuffer = us.NzValues();
CPUSPARSE_INDEX_TYPE* unCompressedIndex = us.MajorIndexLocation();
CPUSPARSE_INDEX_TYPE* compressedIndex = us.SecondaryIndexLocation();
for (size_t i = 0; i < nz; ++i)
{
stream << dataBuffer[i];
}
for (size_t i = 0; i < nz; ++i)
{
stream << unCompressedIndex[i];
}
for (size_t i = 0; i < compressedSize; ++i)
{
stream << compressedIndex[i];
}
}
stream.PutMarker(fileMarkerEndSection, std::wstring(L"EMAT"));
return stream;
}
template class CPUSparseMatrix<float>;
template class CPUSparseMatrix<double>;
template class CPUSparseMatrix<half>;
// instantiate learner methods
template void CPUSparseMatrix<float>::AdaDelta(CPUMatrix<float>& c, CPUMatrix<float>& functionValues, float learningRate, float rho, float epsilon, int* timestamps, int currentTimestamp);
template void CPUSparseMatrix<double>::AdaDelta(CPUMatrix<double>& c, CPUMatrix<double>& functionValues, double learningRate, double rho, double epsilon, int* timestamps, int currentTimestamp);
template void CPUSparseMatrix<half>::AdaDelta(CPUMatrix<float>& c, CPUMatrix<float>& functionValues, float learningRate, float rho, float epsilon, int* timestamps, int currentTimestamp);
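// i.e. an FP16 sparse gradient keeps its AdaDelta state and the model parameters in FP32 (sketch; names are illustrative):
//     CPUSparseMatrix<half>& gradient = ...;   // FP16 gradient
//     CPUMatrix<float>& smoothed = ...;        // holds both E[g^2] and E[dx^2]
//     CPUMatrix<float>& parameters = ...;
//     gradient.AdaDelta(smoothed, parameters, learningRate, rho, epsilon, timestamps, currentTimestamp);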
// We use Matrix<char> as the backing store for QuantizedMatrix
// Let's explicitly instantiate the methods we need for that purpose

View file

@ -232,7 +232,9 @@ public:
public:
void NormalGrad(CPUMatrix<ElemType>& c, const ElemType momentum, ElemType unitGainFactor);
ElemType Adagrad(CPUMatrix<ElemType>& c, const bool needAveMultiplier);
void AdaDelta(CPUMatrix<ElemType>& c, CPUMatrix<ElemType>& functionValues, ElemType learningRate, ElemType rho, ElemType epsilon, int* timestamps, int currentTimestamp);
template<typename AccumType>
void AdaDelta(CPUMatrix<AccumType>& c, CPUMatrix<AccumType>& functionValues, AccumType learningRate, AccumType rho, AccumType epsilon, int* timestamps, int currentTimestamp);
public:
CPUSparseMatrix<ElemType>& InplaceTruncateTop(const ElemType threshold);

View file

@ -1,23 +1,11 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#ifdef _MSC_VER
#pragma warning(push)
#pragma warning(disable : 4100) // 'identifier': unreferenced formal parameter
#pragma warning(disable : 4127) // conditional expression is constant
#pragma warning(disable : 4201) // nonstandard extension used: nameless struct/union
#pragma warning(disable : 4458) // declaration of 'identifier' hides class member
#pragma warning(disable : 4515) // 'namespace': namespace uses itself
#endif
#include <cub/cub.cuh>
#ifdef _MSC_VER
#pragma warning(pop)
#endif
namespace Microsoft { namespace MSR { namespace CNTK {
size_t RoundUpToMultiple(size_t n, size_t blockSize)
@ -40,7 +28,7 @@ cudaError_t GetLastCudaError()
#endif
return cudaSuccess;
}
/*
template <int U, typename T>
__device__ __forceinline__ void LoadValues(const T* src, T dst[U])
{
@ -48,9 +36,17 @@ __device__ __forceinline__ void LoadValues(const T* src, T dst[U])
for (int i = 0; i < U; i++)
dst[i] = src[i];
}
*/
template <int U, typename T1, typename T2>
__device__ __forceinline__ void LoadValues(const T1* src, T2 dst[U])
{
#pragma unroll
for (int i = 0; i < U; i++)
dst[i] = (T2)src[i];
}
template <>
__device__ __forceinline__ void LoadValues<2, float>(const float* src, float dst[2])
__device__ __forceinline__ void LoadValues<2, float, float>(const float* src, float dst[2])
{
// src must be aligned at 8 bytes boundary.
assert(reinterpret_cast<uintptr_t>(src) % (sizeof(dst)) == 0);
@ -60,7 +56,7 @@ __device__ __forceinline__ void LoadValues<2, float>(const float* src, float dst
}
template <>
__device__ __forceinline__ void LoadValues<4, float>(const float* src, float dst[4])
__device__ __forceinline__ void LoadValues<4, float, float>(const float* src, float dst[4])
{
// src must be aligned at 16 bytes boundary.
assert(reinterpret_cast<uintptr_t>(src) % (sizeof(dst)) == 0);
@ -73,7 +69,7 @@ __device__ __forceinline__ void LoadValues<4, float>(const float* src, float dst
dst[2] = v.z;
dst[3] = v.w;
}
/*
template <int U, typename T>
__device__ __forceinline__ void StoreValues(const T src[U], T* dst)
{
@ -81,9 +77,17 @@ __device__ __forceinline__ void StoreValues(const T src[U], T* dst)
for (int i = 0; i < U; i++)
dst[i] = src[i];
}
*/
template <int U, typename T1, typename T2>
__device__ __forceinline__ void StoreValues(const T1 src[U], T2* dst)
{
#pragma unroll
for (int i = 0; i < U; i++)
dst[i] = (T2)src[i];
}
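// The two-type overloads above are what let the FP16 path keep storage in half while computing
// in float, e.g. (sketch):
//     const half* psrc; half* pdst; float acc[4];
//     LoadValues<4>(psrc, acc);  // per-element cast: acc[i] = (float)psrc[i]
//     // ... compute on acc in float ...
//     StoreValues<4>(acc, pdst); // per-element cast: pdst[i] = (half)acc[i]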
template <>
__device__ __forceinline__ void StoreValues<2, float>(const float src[2], float* dst)
__device__ __forceinline__ void StoreValues<2, float, float>(const float src[2], float* dst)
{
// dst must be aligned at 8 bytes boundary.
assert(reinterpret_cast<uintptr_t>(dst) % (sizeof(src)) == 0);
@ -94,7 +98,7 @@ __device__ __forceinline__ void StoreValues<2, float>(const float src[2], float*
}
template <>
__device__ __forceinline__ void StoreValues<4, float>(const float src[4], float* dst)
__device__ __forceinline__ void StoreValues<4, float, float>(const float src[4], float* dst)
{
// dst must be aligned at 16 bytes boundary.
assert(reinterpret_cast<uintptr_t>(dst) % (sizeof(src)) == 0);
@ -107,12 +111,16 @@ __device__ __forceinline__ void StoreValues<4, float>(const float src[4], float*
}
template <typename T>
__device__ __forceinline__ T Shuffle(T input, int srcLane)
__device__ __forceinline__ T Shuffle(T input, int srcLane, unsigned int mask)
{
#ifdef __CUDA_ARCH__
// shfl is supported only on Kepler+
static_assert(__CUDA_ARCH__ >= 300, "CNTK supports only Kepler GPU architecture or newer.");
#if CUDA_VERSION >= 9000
return cub::ShuffleIndex(input, srcLane, CUB_PTX_WARP_THREADS, mask); // Need cub > 1.7.0
#else
return cub::ShuffleIndex(input, srcLane);
#endif
#else
assert(false);
return input; // keep compiler happy
@ -136,6 +144,15 @@ namespace Operations
assert(::isfinite(a) && a > 0);
return rsqrt(a);
}
__device__ half RSqrt(half a)
{
#if __CUDA_ARCH__ >= 600
return hrsqrt(a);
#else
return __float2half(rsqrtf(__half2float(a)));
#endif
}
}
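// Note on the CUDA 9 change above: the *_sync warp intrinsics require an explicit mask of
// participating lanes. The reduction kernels below build it once from the per-lane predicate
// and pass it through Shuffle(), e.g.:
//     unsigned int mask = __ballot_sync(0xffffffff, n); // bit set for every lane with n != 0
//     int n2 = Shuffle(n, srcLane, mask);               // cub::ShuffleIndex with that mask on CUDA 9+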
// This function is used to select the correct unroll factor.
@ -151,6 +168,17 @@ void Call(size_t vectorSize, Targs... args)
Func<1>::template Call<T>(args...);
}
template <template <int> class Func, typename T1, typename T2, typename ...Targs>
void Call2(size_t vectorSize, Targs... args)
{
if ((vectorSize % 4) == 0)
Func<4>::template Call<T1, T2>(args...);
else if ((vectorSize % 2) == 0)
Func<2>::template Call<T1, T2>(args...);
else
Func<1>::template Call<T1, T2>(args...);
}
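// Usage sketch (argument list abbreviated): the leading size argument only picks the unroll
// factor U; the remaining arguments are forwarded to Func<U>::Call<T1, T2>, e.g.
//     Call2<ComputeBatchMeanAndInvStdDev, half, float>(vectorSize, /* forwarded: */ vectorSize, batchSize, x, ...);
//     // vectorSize % 4 == 0 -> U == 4, else vectorSize % 2 == 0 -> U == 2, else U == 1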
//--------------------------------------------------------------------
// Mean and variance computation
//--------------------------------------------------------------------
@ -186,15 +214,16 @@ void Call(size_t vectorSize, Targs... args)
// 0 < blendFactor < 1 - blend new running mean/var with averaged mean/var of the current minibatch, e.g.,
// new xMean = (1 - blendFactor) * actual batch mean + blendFactor * new runMean
// blendFactor == 0 - use actual batch mean/var
template <int BlockDimX, int BlockDimY, int U, typename ElemType>
template <int BlockDimX, int BlockDimY, int U, typename ElemType, typename StatType>
__global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize,
const ElemType* x, // (in) input data
double expAvgFactor, // TODO why not ElemType? same for the other parameters, functions?
double blendFactor,
ElemType* runMean, ElemType* runVariance, // (in/out) running mean/variance, gets updated with current minibatch
StatType* runMean, StatType* runVariance, // (in/out) running mean/variance, gets updated with current minibatch
double epsilon,
ElemType* xMean, ElemType* xInvStdDev) // (out) this minibatch's mean and inverse stddev
StatType* xMean, StatType* xInvStdDev) // (out) this minibatch's mean and inverse stddev
{
typedef typename TypeSelector<ElemType>::comp_t comp_t;
static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32).");
assert((vectorSize % U) == 0);
@ -219,9 +248,9 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize,
// first estimate mean over all data for this thread
int n = 0;
ElemType mean[U]; // this thread's part of the mean vector (stored as a normalized mean also during accumulation)
ElemType m2[U]; // likewise for variance
ElemType im2[U]; // and inverse stddev
comp_t mean[U]; // this thread's part of the mean vector (stored as a normalized mean also during accumulation)
comp_t m2[U]; // likewise for variance
comp_t im2[U]; // and inverse stddev
#pragma unroll
for (int k = 0; k < U; k++)
{
@ -235,13 +264,13 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize,
for (; icolSrc < batchSize; icolSrc += BlockDimY)
{
n++;
ElemType curVal[U];
comp_t curVal[U];
LoadValues<U>(psrc, curVal);
// No need for separate unrolling, SASS looks good.
#pragma unroll
for (int k = 0; k < U; k++)
{
ElemType d = curVal[k] - mean[k];
comp_t d = curVal[k] - mean[k];
// REVIEW alexeyk: we enabled fast CUDA math in CNTK so division below will be approximate, is this a problem?
// Using precise math slows down the code by about 40%.
mean[k] += d / n; // mean_n = [mean_{n-1} * (n-1) + curVal] / n = mean_{n-1} *n/n - mean_{n-1} / n + curVal / n
@ -253,6 +282,12 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize,
// now reduce minibatch mean/variance across threads
const int tid = threadIdx.y * BlockDimX + threadIdx.x;
const int laneId = tid & 0x1f;
unsigned int mask;
#if CUDA_VERSION >= 9000
mask = __ballot_sync(0xffffffff, n);
#endif
// First, reduce within warp using shuffle.
if (n > 0)
{
@ -260,16 +295,16 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize,
for (int i = 1; i < CUB_PTX_WARP_THREADS / BlockDimX; i *= 2)
{
int srcLane = laneId + BlockDimX * i;
int n2 = Shuffle(n, srcLane);
int n2 = Shuffle(n, srcLane, mask);
int nsum = n + n2;
ElemType d[U];
comp_t d[U];
#pragma unroll
for (int k = 0; k < U; k++)
{
d[k] = Shuffle(mean[k], srcLane) - mean[k];
ElemType dScaled = d[k] * n2 / nsum;
d[k] = Shuffle(mean[k], srcLane, mask) - mean[k];
comp_t dScaled = d[k] * n2 / nsum;
mean[k] += dScaled;
m2[k] += Shuffle(m2[k], srcLane) + d[k] * n * dScaled;
m2[k] += Shuffle(m2[k], srcLane, mask) + d[k] * n * dScaled;
}
n = nsum;
}
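// The shuffle loop above merges two partial streams A and B of the online (Welford-style)
// statistics with the pairwise update
//     delta = mean_B - mean_A
//     mean  = mean_A + delta * n_B / (n_A + n_B)
//     M2    = M2_A + M2_B + delta^2 * n_A * n_B / (n_A + n_B)
//     n     = n_A + n_B
// where M2 accumulates the sum of squared deviations, so that variance = M2 / (n - 1).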
@ -278,8 +313,8 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize,
// Storage for each warp in a thread block. First warp ("accumulator") holds
// final results so it does not need shared memory.
const int cwarp = BlockDimX * BlockDimY / CUB_PTX_WARP_THREADS;
__shared__ ElemType meanRes[BlockDimX * U][cwarp - 1];
__shared__ ElemType m2Res[BlockDimX * U][cwarp - 1];
__shared__ comp_t meanRes[BlockDimX * U][cwarp - 1];
__shared__ comp_t m2Res[BlockDimX * U][cwarp - 1];
__shared__ int nRes[cwarp - 1];
// Each warp (except warp0) will write accumulated results to shared memory.
@ -309,12 +344,12 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize,
{
int n2 = nRes[i];
int nsum = n + n2;
ElemType d[U];
comp_t d[U];
#pragma unroll
for (int k = 0; k < U; k++)
{
d[k] = meanRes[threadIdx.x * U + k][i] - mean[k];
ElemType dScaled = d[k] * n2 / nsum;
comp_t dScaled = d[k] * n2 / nsum;
mean[k] += dScaled;
m2[k] += m2Res[threadIdx.x * U + k][i] + d[k] * n * dScaled;
}
@ -322,8 +357,8 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize,
}
size_t idxDstBase = (blockIdx.x * BlockDimX + threadIdx.x) * U;
ElemType run[U];
ElemType x[U];
comp_t run[U];
comp_t x[U];
// Compute running mean and batch mean.
LoadValues<U>(runMean + idxDstBase, run);
@ -344,14 +379,14 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize,
for (int k = 0; k < U; k++)
{
// Compute batch inverse standard deviation and variance
ElemType runVariance = batchSize == 1 ? 0 : m2[k] / (batchSize - 1);
comp_t runVariance = batchSize == 1 ? 0 : m2[k] / (batchSize - 1);
// Average
run[k] = expAvgFactor * runVariance + (1.0 - expAvgFactor) * run[k];
// Blend
im2[k] = Operations::RSqrt(static_cast<ElemType>(m2[k] / batchSize + epsilon));
im2[k] = Operations::RSqrt(static_cast<comp_t>(m2[k] / batchSize + epsilon));
if (blendFactor != 0)
{
ElemType runInvStdDev = Operations::RSqrt(static_cast<ElemType>(run[k] + epsilon));
comp_t runInvStdDev = Operations::RSqrt(static_cast<comp_t>(run[k] + epsilon));
im2[k] = blendFactor * runInvStdDev + (1.0 - blendFactor) * im2[k];
}
}
@ -363,7 +398,7 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize,
else if (threadIdx.y == 0)
{
size_t idxDstBase = (blockIdx.x * BlockDimX + threadIdx.x) * U;
ElemType run[U];
comp_t run[U];
// Copy mean
LoadValues<U>(runMean + idxDstBase, run);
@ -373,7 +408,7 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize,
LoadValues<U>(runVariance + idxDstBase, run);
#pragma unroll
for (int k = 0; k < U; k++)
run[k] = Operations::RSqrt(static_cast<ElemType>(run[k] + epsilon));
run[k] = Operations::RSqrt(static_cast<comp_t>(run[k] + epsilon));
StoreValues<U>(run, xInvStdDev + idxDstBase);
}
}
@ -381,12 +416,13 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize,
// This kernel is very similar to kComputeBatchMeanAndInvStdDev except it reduces not just over N (minibatch)
// but also W and H dimensions.
// REVIEW alexeyk: is it possible to combine this and previous kernel into a single kernel without hurting performance/readability much?
template <int BlockDimX, int BlockDimY, int U, typename ElemType>
template <int BlockDimX, int BlockDimY, int U, typename ElemType, typename StatType>
__global__ void kComputeSpatialBatchMeanAndInvStdDev(int vectorSize, int spatialSize, int batchSize, const ElemType* x,
double expAvgFactor, double blendFactor,
ElemType* runMean, ElemType* runVariance,
double epsilon, ElemType* xMean, ElemType* xInvStdDev)
StatType* runMean, StatType* runVariance,
double epsilon, StatType* xMean, StatType* xInvStdDev)
{
typedef typename TypeSelector<ElemType>::comp_t comp_t;
static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32).");
assert(blockDim.x == BlockDimX);
@ -410,8 +446,8 @@ __global__ void kComputeSpatialBatchMeanAndInvStdDev(int vectorSize, int spatial
int irowSrcLim = (blockIdx.x + 1) * spatialSize;
int n = 0;
ElemType mean[U];
ElemType m2[U];
comp_t mean[U];
comp_t m2[U];
#pragma unroll
for (int k = 0; k < U; k++)
{
@ -429,13 +465,13 @@ __global__ void kComputeSpatialBatchMeanAndInvStdDev(int vectorSize, int spatial
for (int irowSrc = irowSrcBase; irowSrc < irowSrcLim; irowSrc += BlockDimX * U, psrc += BlockDimX * U)
{
n++;
ElemType curVal[U];
comp_t curVal[U];
LoadValues<U>(psrc, curVal);
// No need for separate unrolling, SASS looks good.
#pragma unroll
for (int k = 0; k < U; k++)
{
ElemType d = curVal[k] - mean[k];
comp_t d = curVal[k] - mean[k];
// REVIEW alexeyk: we enabled fast CUDA math in CNTK so division below will be approximate, is this a problem?
// Using precise math slows down the code by about 40%.
mean[k] += d / n;
@ -448,22 +484,26 @@ __global__ void kComputeSpatialBatchMeanAndInvStdDev(int vectorSize, int spatial
const int tid = threadIdx.y * BlockDimX + threadIdx.x;
const int laneId = tid & 0x1f;
// First, reduce within warp using shuffle.
unsigned int mask;
#if CUDA_VERSION >= 9000
mask = __ballot_sync(0xffffffff, n);
#endif
if (n > 0)
{
#pragma unroll
for (int i = 1; i < CUB_PTX_WARP_THREADS; i *= 2)
{
int srcLane = laneId + i;
int n2 = Shuffle(n, srcLane);
int n2 = Shuffle(n, srcLane, mask);
int nsum = n + n2;
ElemType d[U];
comp_t d[U];
#pragma unroll
for (int k = 0; k < U; k++)
{
d[k] = Shuffle(mean[k], srcLane) - mean[k];
ElemType dScaled = d[k] * n2 / nsum;
d[k] = Shuffle(mean[k], srcLane, mask) - mean[k];
comp_t dScaled = d[k] * n2 / nsum;
mean[k] += dScaled;
m2[k] += Shuffle(m2[k], srcLane) + d[k] * n * dScaled;
m2[k] += Shuffle(m2[k], srcLane, mask) + d[k] * n * dScaled;
}
n = nsum;
}
@ -472,8 +512,8 @@ __global__ void kComputeSpatialBatchMeanAndInvStdDev(int vectorSize, int spatial
// Storage for each warp in a thread block. First warp ("accumulator") holds
// final results so it does not need shared memory.
const int cwarp = BlockDimX * BlockDimY / CUB_PTX_WARP_THREADS;
__shared__ ElemType meanRes[U][cwarp - 1];
__shared__ ElemType m2Res[U][cwarp - 1];
__shared__ comp_t meanRes[U][cwarp - 1];
__shared__ comp_t m2Res[U][cwarp - 1];
__shared__ int nRes[cwarp - 1];
// Each warp (except warp0) will write accumulated results to shared memory.
@ -499,12 +539,12 @@ __global__ void kComputeSpatialBatchMeanAndInvStdDev(int vectorSize, int spatial
{
int n2 = nRes[i];
int nsum = n + n2;
ElemType d[U];
comp_t d[U];
#pragma unroll
for (int k = 0; k < U; k++)
{
d[k] = meanRes[k][i] - mean[k];
ElemType dScaled = d[k] * n2 / nsum;
comp_t dScaled = d[k] * n2 / nsum;
mean[k] += dScaled;
m2[k] += m2Res[k][i] + d[k] * n * dScaled;
}
@ -515,8 +555,8 @@ __global__ void kComputeSpatialBatchMeanAndInvStdDev(int vectorSize, int spatial
#pragma unroll
for (int k = 1; k < U; k++)
{
ElemType d = mean[k] - mean[0];
ElemType dScaled = d * n / (n + k * n);
comp_t d = mean[k] - mean[0];
comp_t dScaled = d * n / (n + k * n);
mean[0] += dScaled;
m2[0] += m2[k] + d * k * n * dScaled;
}
@ -525,12 +565,12 @@ __global__ void kComputeSpatialBatchMeanAndInvStdDev(int vectorSize, int spatial
runMean[blockIdx.x] = expAvgFactor * mean[0] + (1.0 - expAvgFactor) * runMean[blockIdx.x];
xMean[blockIdx.x] = blendFactor * runMean[blockIdx.x] + (1.0 - blendFactor) * mean[0];
ElemType runV = batchSize * spatialSize == 1 ? 0 : m2[0] / (batchSize * spatialSize - 1);
comp_t runV = batchSize * spatialSize == 1 ? 0 : m2[0] / (batchSize * spatialSize - 1);
runVariance[blockIdx.x] = expAvgFactor * runV + (1.0 - expAvgFactor) * runVariance[blockIdx.x];
xInvStdDev[blockIdx.x] = Operations::RSqrt(static_cast<ElemType>(m2[0] / (batchSize * spatialSize) + epsilon));
xInvStdDev[blockIdx.x] = Operations::RSqrt(static_cast<comp_t>(m2[0] / (batchSize * spatialSize) + (comp_t)epsilon));
if (blendFactor != 0)
{
ElemType runInvStdDev = Operations::RSqrt(static_cast<ElemType>(runVariance[blockIdx.x] + epsilon));
comp_t runInvStdDev = Operations::RSqrt(static_cast<comp_t>((comp_t)runVariance[blockIdx.x] + (comp_t)epsilon));
xInvStdDev[blockIdx.x] = blendFactor * runInvStdDev + (1.0 - blendFactor) * xInvStdDev[blockIdx.x];
}
}
@ -538,7 +578,7 @@ __global__ void kComputeSpatialBatchMeanAndInvStdDev(int vectorSize, int spatial
else if (threadIdx.y == 0 && threadIdx.x == 0)
{
xMean[blockIdx.x] = runMean[blockIdx.x];
xInvStdDev[blockIdx.x] = Operations::RSqrt(static_cast<ElemType>(runVariance[blockIdx.x] + epsilon));
xInvStdDev[blockIdx.x] = Operations::RSqrt(static_cast<comp_t>((comp_t)runVariance[blockIdx.x] + (comp_t)epsilon));
}
}
@ -547,14 +587,14 @@ __global__ void kComputeSpatialBatchMeanAndInvStdDev(int vectorSize, int spatial
template <int U>
struct ComputeBatchMeanAndInvStdDev
{
template <typename ElemType>
template <typename ElemType, typename StatType>
static void Call(size_t vectorSize, size_t batchSize,
const ElemType* x, // (in) input data
double expAvgFactor,
double blendFactor,
ElemType* runMean, ElemType* runVariance, // (in/out) running mean/variance, gets updated with current minibatch
StatType* runMean, StatType* runVariance, // (in/out) running mean/variance, gets updated with current minibatch
double epsilon,
ElemType* xMean, ElemType* xInvStdDev, // (out) actual interpolated mean/stddev that are used to normalize. Returned since needed in backprop.
StatType* xMean, StatType* xInvStdDev, // (out) actual interpolated mean/stddev that are used to normalize. Returned since needed in backprop.
cudaStream_t stream)
{
assert((vectorSize % U) == 0);
@ -565,7 +605,7 @@ struct ComputeBatchMeanAndInvStdDev
auto bdim = dim3(BlockDimX, BlockDimY);
// Create grid with only one block in y(batch)-dimension as kernel uses striding.
auto gdim = dim3(static_cast<unsigned int>(RoundUpToMultiple(vectorSize, BlockDimX * U)));
kComputeBatchMeanAndInvStdDev<BlockDimX, BlockDimY, U><<<gdim, bdim, 0, stream>>>(
kComputeBatchMeanAndInvStdDev<BlockDimX, BlockDimY, U, ElemType, StatType><<<gdim, bdim, 0, stream>>>(
static_cast<int>(vectorSize), static_cast<int>(batchSize),
x, expAvgFactor, blendFactor, runMean, runVariance, epsilon, xMean, xInvStdDev);
}
@ -574,10 +614,10 @@ struct ComputeBatchMeanAndInvStdDev
template <int U>
struct ComputeSpatialBatchMeanAndInvStdDev
{
template <typename ElemType>
template <typename ElemType, typename StatType>
static void Call(size_t vectorSize, size_t spatialSize, size_t batchSize, const ElemType* x,
double expAvgFactor, double blendFactor, ElemType* runMean, ElemType* runVariance,
double epsilon, ElemType* xMean, ElemType* xInvStdDev, cudaStream_t stream)
double expAvgFactor, double blendFactor, StatType* runMean, StatType* runVariance,
double epsilon, StatType* xMean, StatType* xInvStdDev, cudaStream_t stream)
{
assert((vectorSize % spatialSize) == 0);
assert((spatialSize % U) == 0);
@ -589,7 +629,7 @@ struct ComputeSpatialBatchMeanAndInvStdDev
// Create grid with only one block in y(batch)-dimension as kernel uses striding.
// Each thread block processes a single whole feature map independently (i.e. reduces over W, H and N dimensions).
auto gdim = dim3(static_cast<unsigned int>(vectorSize / spatialSize));
kComputeSpatialBatchMeanAndInvStdDev<BlockDimX, BlockDimY, U><<<gdim, bdim, 0, stream>>>(
kComputeSpatialBatchMeanAndInvStdDev<BlockDimX, BlockDimY, U, ElemType, StatType><<<gdim, bdim, 0, stream>>>(
static_cast<int>(vectorSize), static_cast<int>(spatialSize), static_cast<int>(batchSize),
x, expAvgFactor, blendFactor, runMean, runVariance, epsilon, xMean, xInvStdDev);
}
@ -603,14 +643,15 @@ struct ComputeSpatialBatchMeanAndInvStdDev
// or Cx1x1 in convolutional case.
//--------------------------------------------------------------------
template <int BlockDimX, int BlockDimY, bool Spatial, bool NormalizeRunningStats, int U, typename ElemType>
template <int BlockDimX, int BlockDimY, bool Spatial, bool NormalizeRunningStats, int U, typename ElemType, typename StatType>
__global__ void kNormalizeBatchTraining(int vectorSize, int spatialSize, int batchSize,
double epsilon,
const ElemType* x, ElemType* y,
const ElemType* bnScale, const ElemType* bnBias,
const ElemType* runningMean, const ElemType* runningVariance,
const ElemType* batchMean, ElemType* batchInvStdDev)
const StatType* bnScale, const StatType* bnBias,
const StatType* runningMean, const StatType* runningVariance,
const StatType* batchMean, StatType* batchInvStdDev)
{
typedef typename TypeSelector<ElemType>::comp_t comp_t;
static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32).");
assert(blockDim.x == BlockDimX);
@ -627,10 +668,10 @@ __global__ void kNormalizeBatchTraining(int vectorSize, int spatialSize, int bat
return;
assert(irowBase + U <= vectorSize);
__shared__ ElemType meanS[BlockDimX * U];
__shared__ ElemType invStdDevS[BlockDimX * U];
__shared__ ElemType scaleS[BlockDimX * U];
__shared__ ElemType biasS[BlockDimX * U];
__shared__ comp_t meanS[BlockDimX * U];
__shared__ comp_t invStdDevS[BlockDimX * U];
__shared__ comp_t scaleS[BlockDimX * U];
__shared__ comp_t biasS[BlockDimX * U];
int offs = threadIdx.x * U;
// REVIEW alexeyk: optimize smem usage, reduce transaction count (is it worth it?).
@ -644,8 +685,8 @@ __global__ void kNormalizeBatchTraining(int vectorSize, int spatialSize, int bat
int imap = (irowBase + k) / spatialSize;
meanS[offs + k] = NormalizeRunningStats ? runningMean[imap] : batchMean[imap];
invStdDevS[offs + k] = NormalizeRunningStats
? Operations::RSqrt(static_cast<ElemType>(runningVariance[imap] + epsilon))
: batchInvStdDev[imap];
? Operations::RSqrt(static_cast<comp_t>((comp_t)runningVariance[imap] + (comp_t)epsilon))
: (comp_t)batchInvStdDev[imap];
scaleS[offs + k] = bnScale[imap];
biasS[offs + k] = bnBias[imap];
}
@ -657,18 +698,18 @@ __global__ void kNormalizeBatchTraining(int vectorSize, int spatialSize, int bat
for (int k = 0; k < U; k++)
{
invStdDevS[offs + k] = NormalizeRunningStats
? Operations::RSqrt(static_cast<ElemType>(runningVariance[irowBase + k] + epsilon))
: batchInvStdDev[irowBase + k];
? Operations::RSqrt(static_cast<comp_t>((comp_t)runningVariance[irowBase + k] + (comp_t)epsilon))
: (comp_t)batchInvStdDev[irowBase + k];
}
LoadValues<U>(bnScale + irowBase, scaleS + offs);
LoadValues<U>(bnBias + irowBase, biasS + offs);
}
}
__syncthreads();
ElemType mean[U];
ElemType invStdDev[U];
ElemType scale[U];
ElemType bias[U];
comp_t mean[U];
comp_t invStdDev[U];
comp_t scale[U];
comp_t bias[U];
LoadValues<U>(meanS + offs, mean);
LoadValues<U>(invStdDevS + offs, invStdDev);
LoadValues<U>(scaleS + offs, scale);
@ -681,7 +722,7 @@ __global__ void kNormalizeBatchTraining(int vectorSize, int spatialSize, int bat
size_t stride = static_cast<size_t>(gridDim.y * BlockDimY) * vectorSize;
for (; icol < batchSize; icol += gridDim.y * BlockDimY, psrc += stride, pdst += stride)
{
ElemType val[U];
comp_t val[U];
LoadValues<U>(psrc, val);
#pragma unroll
for (int k = 0; k < U; k++)
@ -695,13 +736,13 @@ __global__ void kNormalizeBatchTraining(int vectorSize, int spatialSize, int bat
template <int U>
struct NormalizeBatchTraining
{
template <typename ElemType>
template <typename ElemType, typename StatType>
static void Call(size_t vectorSize, size_t spatialSize, size_t batchSize, bool spatial,
bool normalizeRunningStats, double epsilon,
const ElemType* x, ElemType* y, // (in, out) data to normalize -> normalized data
const ElemType* bnScale, const ElemType* bnBias, // (in) scale/bias to denormalize with
const ElemType* runningMean, const ElemType* runningVariance, // (in) running mean/variance
const ElemType* batchMean, ElemType* batchInvStdDev, // (in) batch mean/stddev to normalize with
const StatType* bnScale, const StatType* bnBias, // (in) scale/bias to denormalize with
const StatType* runningMean, const StatType* runningVariance, // (in) running mean/variance
const StatType* batchMean, StatType* batchInvStdDev, // (in) batch mean/stddev to normalize with
cudaStream_t stream)
{
assert((vectorSize % U) == 0);
@ -715,14 +756,14 @@ struct NormalizeBatchTraining
if (spatial)
{
if (normalizeRunningStats)
kNormalizeBatchTraining<BlockDimX, BlockDimY, true, true, U><<<gdim, bdim, 0, stream>>>(
kNormalizeBatchTraining<BlockDimX, BlockDimY, true, true, U, ElemType, StatType><<<gdim, bdim, 0, stream>>>(
(int)vectorSize, (int)spatialSize, (int)batchSize,
epsilon,
x, y, bnScale, bnBias,
runningMean, runningVariance,
batchMean, batchInvStdDev);
else
kNormalizeBatchTraining<BlockDimX, BlockDimY, true, false, U><<<gdim, bdim, 0, stream>>>(
kNormalizeBatchTraining<BlockDimX, BlockDimY, true, false, U, ElemType, StatType><<<gdim, bdim, 0, stream>>>(
(int)vectorSize, (int)spatialSize, (int)batchSize,
epsilon,
x, y, bnScale, bnBias,
@ -732,14 +773,14 @@ struct NormalizeBatchTraining
else
{
if (normalizeRunningStats)
kNormalizeBatchTraining<BlockDimX, BlockDimY, false, true, U><<<gdim, bdim, 0, stream>>>(
kNormalizeBatchTraining<BlockDimX, BlockDimY, false, true, U, ElemType, StatType><<<gdim, bdim, 0, stream>>>(
(int)vectorSize, (int)spatialSize, (int)batchSize,
epsilon,
x, y, bnScale, bnBias,
runningMean, runningVariance,
batchMean, batchInvStdDev);
else
kNormalizeBatchTraining<BlockDimX, BlockDimY, false, false, U><<<gdim, bdim, 0, stream>>>(
kNormalizeBatchTraining<BlockDimX, BlockDimY, false, false, U, ElemType, StatType><<<gdim, bdim, 0, stream>>>(
(int)vectorSize, (int)spatialSize, (int)batchSize,
epsilon,
x, y, bnScale, bnBias,
@ -757,10 +798,11 @@ struct NormalizeBatchTraining
// All tensor dimensions and assumptions are the same as in case of forward propagation.
//--------------------------------------------------------------------
template <int BlockDimX, int BlockDimY, int U, typename ElemType>
__global__ void kComputeScaleAndBiasGradients(int vectorSize, int batchSize, const ElemType* x, const ElemType* dy, ElemType* dScale, ElemType* dBias,
const ElemType* savedMean, const ElemType* savedInvStdDev)
template <int BlockDimX, int BlockDimY, int U, typename ElemType, typename StatType>
__global__ void kComputeScaleAndBiasGradients(int vectorSize, int batchSize, const ElemType* x, const ElemType* dy, StatType* dScale, StatType* dBias,
const StatType* savedMean, const StatType* savedInvStdDev)
{
typedef typename TypeSelector<ElemType>::comp_t comp_t;
static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32).");
static_assert(((BlockDimY - 1) & BlockDimY) == 0, "BlockDimY must be a power of 2.");
@ -777,10 +819,10 @@ __global__ void kComputeScaleAndBiasGradients(int vectorSize, int batchSize, con
return;
assert(irowSrcBase + U <= vectorSize);
ElemType mean[U];
ElemType invStdDev[U];
__shared__ ElemType meanS[BlockDimX * U];
__shared__ ElemType invStdDevS[BlockDimX * U];
comp_t mean[U];
comp_t invStdDev[U];
__shared__ comp_t meanS[BlockDimX * U];
__shared__ comp_t invStdDevS[BlockDimX * U];
// Read mean and inv std dev.
if (threadIdx.y == 0)
{
@ -796,8 +838,8 @@ __global__ void kComputeScaleAndBiasGradients(int vectorSize, int batchSize, con
LoadValues<U>(&invStdDevS[threadIdx.x * U], invStdDev);
}
ElemType ds[U];
ElemType db[U];
comp_t ds[U];
comp_t db[U];
#pragma unroll
for (int k = 0; k < U; k++)
{
@ -813,21 +855,21 @@ __global__ void kComputeScaleAndBiasGradients(int vectorSize, int batchSize, con
// Stride over all vectors in the batch.
for (; icolSrc < batchSize; icolSrc += BlockDimY, px += stride, pdy += stride)
{
ElemType curX[U];
ElemType curdY[U];
comp_t curX[U];
comp_t curdY[U];
LoadValues<U>(px, curX);
LoadValues<U>(pdy, curdY);
#pragma unroll
for (int k = 0; k < U; k++)
{
ds[k] += pdy[k] * (curX[k] - mean[k]) * invStdDev[k];
db[k] += pdy[k];
ds[k] += (comp_t)pdy[k] * (curX[k] - mean[k]) * invStdDev[k];
db[k] += (comp_t)pdy[k];
}
}
// Final reduction.
__shared__ ElemType dsS[BlockDimY][BlockDimX * U];
__shared__ ElemType dbS[BlockDimY][BlockDimX * U];
__shared__ comp_t dsS[BlockDimY][BlockDimX * U];
__shared__ comp_t dbS[BlockDimY][BlockDimX * U];
StoreValues<U>(ds, &dsS[threadIdx.y][threadIdx.x * U]);
StoreValues<U>(db, &dbS[threadIdx.y][threadIdx.x * U]);
__syncthreads();
@ -861,10 +903,11 @@ __global__ void kComputeScaleAndBiasGradients(int vectorSize, int batchSize, con
}
}
template <int BlockDimX, int BlockDimY, int U, typename ElemType>
template <int BlockDimX, int BlockDimY, int U, typename ElemType, typename StatType>
__global__ void kComputeSpatialScaleAndBiasGradients(int vectorSize, int spatialSize, int batchSize, const ElemType* x, const ElemType* dy,
ElemType* dScale, ElemType* dBias, const ElemType* savedMean, const ElemType* savedInvStdDev)
StatType* dScale, StatType* dBias, const StatType* savedMean, const StatType* savedInvStdDev)
{
typedef typename TypeSelector<ElemType>::comp_t comp_t;
static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32).");
assert(blockDim.x == BlockDimX);
@ -881,10 +924,10 @@ __global__ void kComputeSpatialScaleAndBiasGradients(int vectorSize, int spatial
assert(irowBase + U <= vectorSize);
int irowLim = (blockIdx.x + 1) * spatialSize;
ElemType mean;
ElemType invStdDev;
__shared__ ElemType meanS;
__shared__ ElemType invStdDevS;
comp_t mean;
comp_t invStdDev;
__shared__ comp_t meanS;
__shared__ comp_t invStdDevS;
const int tid = threadIdx.y * BlockDimX + threadIdx.x;
// Read mean and inv std dev.
if (tid == 0)
@ -899,8 +942,8 @@ __global__ void kComputeSpatialScaleAndBiasGradients(int vectorSize, int spatial
invStdDev = invStdDevS;
}
ElemType ds[U];
ElemType db[U];
comp_t ds[U];
comp_t db[U];
#pragma unroll
for (int k = 0; k < U; k++)
{
@ -921,25 +964,25 @@ __global__ void kComputeSpatialScaleAndBiasGradients(int vectorSize, int spatial
// Stride over all values in feature map (W and H dimensions).
for (int irow = irowBase; irow < irowLim; irow += BlockDimX * U, px += BlockDimX * U, pdy += BlockDimX * U)
{
ElemType curX[U];
ElemType curdY[U];
comp_t curX[U];
comp_t curdY[U];
LoadValues<U>(px, curX);
LoadValues<U>(pdy, curdY);
#pragma unroll
for (int k = 0; k < U; k++)
{
ds[k] += pdy[k] * (curX[k] - mean) * invStdDev;
db[k] += pdy[k];
ds[k] += (comp_t)pdy[k] * (curX[k] - mean) * invStdDev;
db[k] += (comp_t)pdy[k];
}
}
}
__syncthreads();
using BlockReduce = cub::BlockReduce<ElemType, BlockDimX, cub::BLOCK_REDUCE_WARP_REDUCTIONS, BlockDimY>;
using BlockReduce = cub::BlockReduce<comp_t, BlockDimX, cub::BLOCK_REDUCE_WARP_REDUCTIONS, BlockDimY>;
// Note: must use separate temp storages for each reduction.
__shared__ typename BlockReduce::TempStorage tmp1;
ElemType dsRes = BlockReduce(tmp1).Sum(ds);
comp_t dsRes = BlockReduce(tmp1).Sum(ds);
__shared__ typename BlockReduce::TempStorage tmp2;
ElemType dbRes = BlockReduce(tmp2).Sum(db);
comp_t dbRes = BlockReduce(tmp2).Sum(db);
if (tid == 0)
{
dScale[blockIdx.x] = dsRes;
@ -950,9 +993,9 @@ __global__ void kComputeSpatialScaleAndBiasGradients(int vectorSize, int spatial
template <int U>
struct ComputeScaleAndBiasGradients
{
template <typename ElemType>
template <typename ElemType, typename StatType>
static void Call(size_t vectorSize, size_t batchSize, const ElemType* x, const ElemType* dy,
ElemType* dScale, ElemType* dBias, const ElemType* savedMean, const ElemType* savedInvStdDev, cudaStream_t stream)
StatType* dScale, StatType* dBias, const StatType* savedMean, const StatType* savedInvStdDev, cudaStream_t stream)
{
assert((vectorSize % U) == 0);
assert(batchSize >= 1);
@ -961,7 +1004,7 @@ struct ComputeScaleAndBiasGradients
auto bdim = dim3(BlockDimX, BlockDimY);
// Create a grid that has uses striding in y-dimension to cover whole minibatch.
auto gdim = dim3(static_cast<unsigned int>(RoundUpToMultiple(vectorSize, BlockDimX * U)));
kComputeScaleAndBiasGradients<BlockDimX, BlockDimY, U><<<gdim, bdim, 0, stream>>>(
kComputeScaleAndBiasGradients<BlockDimX, BlockDimY, U, ElemType, StatType><<<gdim, bdim, 0, stream>>>(
static_cast<int>(vectorSize), static_cast<int>(batchSize), x, dy, dScale, dBias, savedMean, savedInvStdDev);
}
};
@ -969,9 +1012,9 @@ struct ComputeScaleAndBiasGradients
template <int U>
struct ComputeSpatialScaleAndBiasGradients
{
template <typename ElemType>
template <typename ElemType, typename StatType>
static void Call(size_t vectorSize, size_t spatialSize, size_t batchSize, const ElemType* x, const ElemType* dy,
ElemType* dScale, ElemType* dBias, const ElemType* savedMean, const ElemType* savedInvStdDev, cudaStream_t stream)
StatType* dScale, StatType* dBias, const StatType* savedMean, const StatType* savedInvStdDev, cudaStream_t stream)
{
assert((spatialSize % U) == 0);
assert((vectorSize % spatialSize) == 0);
@ -982,17 +1025,18 @@ struct ComputeSpatialScaleAndBiasGradients
auto bdim = dim3(BlockDimX, BlockDimY);
// Create a grid that has uses striding in y-dimension to cover whole minibatch.
auto gdim = dim3(static_cast<unsigned int>(vectorSize / spatialSize));
kComputeSpatialScaleAndBiasGradients<BlockDimX, BlockDimY, U><<<gdim, bdim, 0, stream>>>(
kComputeSpatialScaleAndBiasGradients<BlockDimX, BlockDimY, U, ElemType, StatType><<<gdim, bdim, 0, stream>>>(
static_cast<int>(vectorSize), static_cast<int>(spatialSize), static_cast<int>(batchSize), x, dy, dScale, dBias, savedMean, savedInvStdDev);
}
};
// mbStatsWeight is the weight with which current MB's stats were used (0 means not at all, locked model).
template <int BlockDimX, int BlockDimY, bool Spatial, int U, typename ElemType>
template <int BlockDimX, int BlockDimY, bool Spatial, int U, typename ElemType, typename StatType>
__global__ void kBackpropagateBatchNormGradients(int vectorSize, int spatialSize, int batchSize, const ElemType* x, const ElemType* dy, ElemType* dx,
const ElemType* bnScale, ElemType mbStatsWeight, const ElemType* dScale, const ElemType* dBias,
const ElemType* savedMean, const ElemType* savedInvStdDev)
const StatType* bnScale, StatType mbStatsWeight, const StatType* dScale, const StatType* dBias,
const StatType* savedMean, const StatType* savedInvStdDev)
{
typedef typename TypeSelector<ElemType>::comp_t comp_t;
static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32).");
assert(blockDim.x == BlockDimX);
@ -1008,11 +1052,11 @@ __global__ void kBackpropagateBatchNormGradients(int vectorSize, int spatialSize
if (irowBase >= vectorSize)
return;
assert(irowBase + U <= vectorSize);
ElemType scale[U];
ElemType ds[U];
ElemType db[U];
ElemType mean[U];
ElemType invStdDev[U];
comp_t scale[U];
comp_t ds[U];
comp_t db[U];
comp_t mean[U];
comp_t invStdDev[U];
// REVIEW alexeyk: here we're wasting some bandwidth but this might be ok as it's a one-timer.
if (Spatial)
{
@ -1044,9 +1088,9 @@ __global__ void kBackpropagateBatchNormGradients(int vectorSize, int spatialSize
size_t stride = static_cast<size_t>(gridDim.y * BlockDimY) * vectorSize;
for (; icol < batchSize; icol += gridDim.y * BlockDimY, px += stride, pdy += stride, pdx += stride)
{
ElemType xCur[U];
ElemType dyCur[U];
ElemType dxCur[U];
comp_t xCur[U];
comp_t dyCur[U];
comp_t dxCur[U];
LoadValues<U>(px, xCur);
LoadValues<U>(pdy, dyCur);
LoadValues<U>(pdx, dxCur);
@ -1060,12 +1104,12 @@ __global__ void kBackpropagateBatchNormGradients(int vectorSize, int spatialSize
// dBias = Reduce(dy)
// dScale = Reduce(dy * xHat)
// Simplifying this a bit more, we get the formula below.
ElemType val[U];
comp_t val[U];
int m = Spatial ? batchSize * spatialSize : batchSize;
#pragma unroll
for (int k = 0; k < U; k++)
{
ElemType xNorm = (xCur[k] - mean[k]) * invStdDev[k]; // xHat
comp_t xNorm = (xCur[k] - mean[k]) * invStdDev[k]; // xHat
// scale * invStdDev * (
// dL/dyi
// - mbStatsWeight * (xHat * dL/dScale + dL/dBias) / m
@ -1073,7 +1117,7 @@ __global__ void kBackpropagateBatchNormGradients(int vectorSize, int spatialSize
val[k] = dxCur[k] // (adding to gradient)
+ (scale[k] * invStdDev[k]) * (
dyCur[k]
- mbStatsWeight * (xNorm * ds[k] + db[k]) / m);
- (comp_t)mbStatsWeight * (xNorm * ds[k] + db[k]) / m);
}
StoreValues<U>(val, pdx);
}
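In symbols, the per-element update in the loop above is the standard batch-normalization backward pass. Writing gamma for bnScale, sigma^{-1} for invStdDev, w for mbStatsWeight and m for the reduction size (batchSize, times spatialSize in spatial mode):

\[
\hat{x}_i = (x_i - \mu)\,\sigma^{-1}, \qquad
d\gamma = \sum_{i=1}^{m} \frac{\partial L}{\partial y_i}\,\hat{x}_i, \qquad
d\beta = \sum_{i=1}^{m} \frac{\partial L}{\partial y_i},
\]
\[
\frac{\partial L}{\partial x_i} \mathrel{+}= \gamma\,\sigma^{-1}\left(\frac{\partial L}{\partial y_i} - \frac{w}{m}\bigl(\hat{x}_i\,d\gamma + d\beta\bigr)\right).
\]

For w = 1 the minibatch statistics are fully trusted and this is the usual formula; for w = 0 (locked statistics) the correction term vanishes and only the pass-through term gamma * sigma^{-1} * dL/dy remains.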
@ -1082,10 +1126,10 @@ __global__ void kBackpropagateBatchNormGradients(int vectorSize, int spatialSize
template <int U>
struct BackpropagateBatchNormGradients
{
template <typename ElemType>
template <typename ElemType, typename StatType>
static void Call(size_t vectorSize, size_t spatialSize, size_t batchSize, bool spatial, const ElemType* x, const ElemType* dy, ElemType* dx,
const ElemType* bnScale, ElemType mbStatsWeight, const ElemType* dScale,
const ElemType* dBias, const ElemType* savedMean, const ElemType* savedInvStdDev, cudaStream_t stream)
const StatType* bnScale, StatType mbStatsWeight, const StatType* dScale,
const StatType* dBias, const StatType* savedMean, const StatType* savedInvStdDev, cudaStream_t stream)
{
assert((vectorSize % U) == 0);
assert(batchSize >= 1);
@ -1096,7 +1140,7 @@ struct BackpropagateBatchNormGradients
static_cast<unsigned int>(RoundUpToMultiple(batchSize, BlockDimY)));
if (spatial)
{
kBackpropagateBatchNormGradients<BlockDimX, BlockDimY, true/*spatial*/, U><<<gdim, bdim, 0, stream>>>(
kBackpropagateBatchNormGradients<BlockDimX, BlockDimY, true/*spatial*/, U, ElemType, StatType><<<gdim, bdim, 0, stream>>>(
static_cast<int>(vectorSize), static_cast<int>(spatialSize), static_cast<int>(batchSize), x, dy, dx, bnScale, mbStatsWeight, dScale, dBias, savedMean, savedInvStdDev);
}
else

View file

@ -51,6 +51,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
MATH_API void SetMathLibTraceLevel(int traceLevel);
MATH_API int GetMathLibTraceLevel();
inline bool IsGpu(DEVICEID_TYPE deviceId)
{
return deviceId > CPUDEVICE;
}
class MATH_API TracingGPUMemoryAllocator
{
private:

View file

@ -1,5 +1,6 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
@ -8,6 +9,7 @@
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <math_constants.h>
#include "half.hpp"
namespace Microsoft { namespace MSR { namespace CNTK {
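The kernels below, like the batch-normalization kernels above, route their arithmetic through TypeSelector<ElemType>::comp_t. That alias is defined elsewhere in the tree and does not appear in this diff; assuming its only job is to accumulate half inputs in float while leaving float and double untouched, a minimal sketch of the idea (illustrative names only) would be:

    // Sketch only; the real definition lives in the CNTK headers, not in this commit.
    #include <cuda_fp16.h>

    template <typename ElemType>
    struct TypeSelectorSketch
    {
        typedef ElemType comp_t;   // float and double compute in their own precision
    };

    template <>
    struct TypeSelectorSketch<__half>
    {
        typedef float comp_t;      // half inputs accumulate in float
    };

    // Kernel-side usage then mirrors the pattern in this diff:
    //     typedef typename TypeSelectorSketch<ElemType>::comp_t comp_t;
    //     comp_t sum = 0;
    //     sum += (comp_t)kernel[i] * (comp_t)src[j];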
@ -28,6 +30,7 @@ __global__ void kConvolutionForward(int batchSize, const ElemType* __restrict__
const ElemType* __restrict__ src, int srcVecSize,
ElemType* dst, int dstVecSize)
{
typedef typename TypeSelector<ElemType>::comp_t comp_t;
int row = blockIdx.x * blockDim.x + threadIdx.x;
if (row >= dstVecSize)
return;
@ -41,7 +44,7 @@ __global__ void kConvolutionForward(int batchSize, const ElemType* __restrict__
int ivBase = mpRowIwht[row];
assert(0 <= colBase && colBase < srcVecSize);
ElemType sum = 0;
comp_t sum = 0;
int i0 = mpRowRun[row];
int skip = runs[i0++];
int size = runs[i0++];
@ -52,7 +55,7 @@ __global__ void kConvolutionForward(int batchSize, const ElemType* __restrict__
continue;
int dcol = runs[i0 + i];
assert(0 <= colBase + dcol && colBase + dcol < srcVecSize);
sum += kernel[ivBase + skip + i] * src[colBase + dcol];
sum += (comp_t)kernel[ivBase + skip + i] * (comp_t)src[colBase + dcol];
}
dst[row] = sum;
@ -68,6 +71,7 @@ __global__ void kConvolutionBackwardData(int batchSize, const ElemType* __restri
const ElemType* __restrict__ srcGrad, int srcVecSize,
ElemType* grad, int dstVecSize)
{
typedef typename TypeSelector<ElemType>::comp_t comp_t;
int row = blockIdx.x * blockDim.x + threadIdx.x;
if (row >= srcVecSize)
return;
@ -81,7 +85,7 @@ __global__ void kConvolutionBackwardData(int batchSize, const ElemType* __restri
int ivBase = mpRowIwht[row];
assert(0 <= colBase && colBase < dstVecSize);
ElemType g = srcGrad[row];
comp_t g = srcGrad[row];
int i0 = mpRowRun[row];
int skip = runs[i0++];
int size = runs[i0++];
@ -92,7 +96,7 @@ __global__ void kConvolutionBackwardData(int batchSize, const ElemType* __restri
continue;
int dcol = runs[i0 + i];
assert(0 <= colBase + dcol && colBase + dcol < dstVecSize);
atomicAdd(&grad[colBase + dcol], g * kernel[ivBase + skip + i]);
atomicAdd(&grad[colBase + dcol], (ElemType)((comp_t)g * (comp_t)kernel[ivBase + skip + i]));
}
srcGrad += blockDim.y * srcVecSize;
@ -108,6 +112,7 @@ __global__ void kConvolutionBackwardKernel(int batchSize, int inVecSize, int out
const ElemType* __restrict__ srcGrad,
ElemType* kernelGrad)
{
typedef typename TypeSelector<ElemType>::comp_t comp_t;
int row = blockIdx.x * blockDim.x + threadIdx.x;
if (row >= outVecSize)
return;
@ -121,7 +126,7 @@ __global__ void kConvolutionBackwardKernel(int batchSize, int inVecSize, int out
int ivBase = mpRowIwht[row];
assert(0 <= colBase && colBase < inVecSize);
ElemType g = srcGrad[row];
comp_t g = srcGrad[row];
int i0 = mpRowRun[row];
int skip = runs[i0++];
int size = runs[i0++];
@ -132,7 +137,7 @@ __global__ void kConvolutionBackwardKernel(int batchSize, int inVecSize, int out
continue;
int dcol = runs[i0 + i];
assert(0 <= colBase + dcol && colBase + dcol < inVecSize);
atomicAdd(&kernelGrad[ivBase + skip + i], g * in[colBase + dcol]);
atomicAdd(&kernelGrad[ivBase + skip + i], (ElemType)((comp_t)g * (comp_t)in[colBase + dcol]));
}
in += blockDim.y * inVecSize;
@ -233,7 +238,7 @@ __device__ double round_(double a)
// corresponding to the ROI and which pixels in that subset should go into the
// output location, then takes the max value over that window.
// src: Images [W x H x C x N]
// roiData: ROIs [4 x numROIs x N],
// dst: Pooled ROIs [PW x PH x C x numROIs x N]
// argmax: max positions [PW x PH x C x numROIs x N]
// spatialScale ratio of input feature map to the original image.
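To make the geometry above concrete, the host-side sketch below walks one hypothetical ROI through the same coordinate mapping: the ROI arrives in original-image pixels, is scaled by spatialScale into feature-map cells, and each of the pooledWidth x pooledHeight output bins then covers a winW x winH window of those cells. The kernel additionally offsets by the ROI origin and clamps to the feature map; the exact rounding below is only illustrative.

    // Illustrative values only; rounding and clamping details follow the kernel loosely.
    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    int main()
    {
        const double spatialScale = 1.0 / 16.0;   // feature map is 1/16 of the image
        const int pooledW = 7, pooledH = 7;

        // One ROI in original-image pixels: (x, y, w, h).
        const double roi[4] = {128, 64, 256, 192};

        // Map the ROI into feature-map coordinates.
        int roiStartW = (int)std::round(roi[0] * spatialScale);              // 8
        int roiStartH = (int)std::round(roi[1] * spatialScale);              // 4
        int roiWidth  = std::max((int)std::round(roi[2] * spatialScale), 1); // 16
        int roiHeight = std::max((int)std::round(roi[3] * spatialScale), 1); // 12

        double winW = (double)roiWidth / pooledW;   // ~2.29 cells per output column
        double winH = (double)roiHeight / pooledH;  // ~1.71 cells per output row

        // Feature-map window feeding output bin (ph, pw) = (3, 2), relative to the ROI origin.
        int ph = 3, pw = 2;
        int hstart = (int)(ph * winH), hend = (int)std::ceil((ph + 1) * winH);
        int wstart = (int)(pw * winW), wend = (int)std::ceil((pw + 1) * winW);
        printf("bin (%d,%d) pools rows [%d,%d) x cols [%d,%d) of the ROI (offset by %d,%d)\n",
               ph, pw, hstart, hend, wstart, wend, roiStartH, roiStartW);
        return 0;
    }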
@ -245,6 +250,7 @@ __global__ void kMaxROIPoolingForward(const int totalIterations,
const int pooledWidth, const int pooledHeight, const ElemType* src,
const ElemType* roiData, ElemType* dst, ElemType* argmax, double spatialScale)
{
typedef typename TypeSelector<ElemType>::comp_t comp_t;
// index loops over all totalRois*c*pooledHeight*pooledWidth output locations.
for (int index = blockIdx.x * blockDim.x + threadIdx.x;
index < (totalIterations); index += blockDim.x * gridDim.x)
@ -269,8 +275,8 @@ __global__ void kMaxROIPoolingForward(const int totalIterations,
int roiWidth = max(roiEndW - roiStartW + 1, (int)1);
int roiHeight = max(roiEndH - roiStartH + 1, (int)1);
ElemType winH = (ElemType)roiHeight / (ElemType)pooledHeight;
ElemType winW = (ElemType)roiWidth / (ElemType)pooledWidth;
comp_t winH = (comp_t)roiHeight / (comp_t)pooledHeight;
comp_t winW = (comp_t)roiWidth / (comp_t)pooledWidth;
// compute window for this output location.
int hstart = (int)(ph * winH);
@ -286,7 +292,7 @@ __global__ void kMaxROIPoolingForward(const int totalIterations,
bool isempty = (hend <= hstart) || (wend <= wstart);
// Define an empty pooling region to be zero
ElemType maxval = isempty ? (ElemType)0 : -CUDART_INF_F;
comp_t maxval = isempty ? (comp_t)0 : (comp_t)-CUDART_INF_F;
int maxidx = -1;
int imgIdx = n / numROIs;
@ -319,6 +325,7 @@ __global__ void kMaxROIPoolingBackward(const int totalIterations,
const int pooledWidth, const int pooledHeight, const ElemType* pooledGrad,
const ElemType* roiData, ElemType* grad, const ElemType* argmax, double spatialScale)
{
typedef typename TypeSelector<ElemType>::comp_t comp_t;
// index loops over all input locations (locations in the original input tensor).
for (int index = blockIdx.x * blockDim.x + threadIdx.x;
index < (totalIterations); index += blockDim.x * gridDim.x)
@ -334,7 +341,7 @@ __global__ void kMaxROIPoolingBackward(const int totalIterations,
int roiMin = n * numROIs;
int roiMax = (n + 1) * numROIs;
ElemType gradient = 0;
comp_t gradient = 0;
for (int roiN = roiMin; roiN < roiMax; roiN++)
{
// each ROI is 4 elements: (x, y, w, h)
@ -355,8 +362,8 @@ __global__ void kMaxROIPoolingBackward(const int totalIterations,
if (!inROI)
continue;
ElemType winH = (ElemType)roiHeight / (ElemType)pooledHeight;
ElemType winW = (ElemType)roiWidth / (ElemType)pooledWidth;
comp_t winH = (comp_t)roiHeight / (comp_t)pooledHeight;
comp_t winW = (comp_t)roiWidth / (comp_t)pooledWidth;
// what pooled nodes in the output for this ROI could have pooled this input location?
// we use int here since the computation can yield a negative result
@ -381,13 +388,13 @@ __global__ void kMaxROIPoolingBackward(const int totalIterations,
{
if ((int)offsetArgmax[ph * pooledWidth + pw] == (h * width + w))
{
gradient += offsetPoolGrad[ph * pooledWidth + pw];
gradient += (comp_t)offsetPoolGrad[ph * pooledWidth + pw];
}
}
}
}
atomicAdd(&grad[index], gradient);
atomicAdd(&grad[index], (ElemType)gradient);
}
}
@ -443,6 +450,7 @@ __global__ void kAveragePoolingForward(int batchSize, const int* mpRowCol, const
const ElemType* __restrict__ src, int srcVecSize,
ElemType* dst, int dstVecSize)
{
typedef typename TypeSelector<ElemType>::comp_t comp_t;
int row = blockIdx.x * blockDim.x + threadIdx.x;
if (row >= dstVecSize)
return;
@ -457,14 +465,14 @@ __global__ void kAveragePoolingForward(int batchSize, const int* mpRowCol, const
int i0 = mpRowIndices[row];
int size = indices[i0++];
ElemType sum = 0;
comp_t sum = 0;
for (int i = 0; i < size; i++)
{
int dcol = indices[i0 + i];
assert(0 <= colBase + dcol && colBase + dcol < srcVecSize);
sum += src[colBase + dcol];
sum += (comp_t)src[colBase + dcol];
}
dst[row] = sum / size;
dst[row] = sum / (comp_t)size;
src += blockDim.y * srcVecSize;
dst += blockDim.y * dstVecSize;

View file

@ -6,7 +6,7 @@
#include "stdafx.h"
#include "ConvolutionEngine.h"
#include "CuDnnFactories.h"
#include "Mkl2017DnnCommon.h"
#include "MklDnnCommon.h"
namespace Microsoft { namespace MSR { namespace CNTK {
@ -131,7 +131,7 @@ void ConvolutionEngine<ElemType>::MaxUnpooling(const Mat& out, const Mat& poolIn
//------------------------------------------------------------------
// Reference convolution engine implementation.
// This engine supports arbitrary convolution geometry but does not provide efficient implementation.
// Its main purpose is to serve as a baseline for optimized engines (e.g. cuDNN) that
// usually implement only a subset of a general convolution geometry.
//------------------------------------------------------------------
template <class ElemType>
@ -169,11 +169,11 @@ protected:
if (m_mpRowIwht == nullptr)
{
auto flags = IsGpu(m_deviceId) ? matrixFlagNormal : matrixFlagDontOwnBuffer;
m_mpRowIwht = std::make_unique<Matrix<int>>(m_geometry->MpRowIwht().size(), 1,
const_cast<int*>(m_geometry->MpRowIwht().data()), m_deviceId, flags);
m_mpRowRun = std::make_unique<Matrix<int>>(m_geometry->MpRowRun().size(), 1,
const_cast<int*>(m_geometry->MpRowRun().data()), m_deviceId, flags);
m_runs = std::make_unique<Matrix<int>>(m_geometry->Runs().size(), 1,
const_cast<int*>(m_geometry->Runs().data()), m_deviceId, flags);
}
}
@ -239,12 +239,6 @@ protected:
out.MaxUnpooling(m_mpRowCol, *m_mpRowIndices, *m_indices, poolIn, in);
}
protected:
static bool IsGpu(DEVICEID_TYPE deviceId)
{
return deviceId >= 0;
}
protected:
using IntMatPtr = std::unique_ptr<Matrix<int>>;
// IMP NOTE: Make sure that in the declaration below m_isConvGeometryComputed is declared
@ -273,7 +267,7 @@ public:
public:
LegacyConvolutionEngine(ConvolveGeometryPtr geometry, DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples, PoolKind poolKind, bool poolIncludePad)
: Base(geometry, deviceId, imageLayout, maxTempMemSizeInSamples, poolKind, poolIncludePad),
m_inT(m_geometry->InputShape(), ImageLayoutKind::CHW), m_outT(m_geometry->OutputShape(), ImageLayoutKind::CHW),
m_kernelT(m_geometry->KernelShape(), ImageLayoutKind::CHW), m_strideT(m_geometry->Stride(), ImageLayoutKind::CHW)
{
@ -559,7 +553,7 @@ private:
//------------------------------------------------------------------
// GEMM convolution engine implementation.
// This engine supports arbitrary convolution configuration with full
// sharing and implemented using unroll + GEMM technique
// (High performance convolutional neural networks for document processing; Chellapilla, Puri, Simard)
// Uses reference engine for pooling operations.
//------------------------------------------------------------------
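To illustrate the unroll + GEMM idea cited above: the receptive field of every output location is copied into one row of an unrolled matrix, after which the whole convolution collapses into a single matrix product. A self-contained toy sketch for one channel, one output map, stride 1 and no padding (not CNTK code):

    // 2-D convolution as unroll (im2col) + GEMM, row-major storage throughout.
    #include <vector>
    #include <cstdio>

    int main()
    {
        const int H = 4, W = 4, KH = 3, KW = 3;
        const int OH = H - KH + 1, OW = W - KW + 1;          // 2 x 2 output

        std::vector<float> in(H * W), kernel(KH * KW, 1.0f); // 3x3 box filter
        for (int i = 0; i < H * W; i++) in[i] = (float)i;

        // 1. Unroll: each output location becomes one row of [OH*OW x KH*KW].
        std::vector<float> unrolled(OH * OW * KH * KW);
        for (int oh = 0; oh < OH; oh++)
            for (int ow = 0; ow < OW; ow++)
                for (int kh = 0; kh < KH; kh++)
                    for (int kw = 0; kw < KW; kw++)
                        unrolled[(oh * OW + ow) * KH * KW + kh * KW + kw] = in[(oh + kh) * W + (ow + kw)];

        // 2. GEMM: [OH*OW x KH*KW] * [KH*KW x 1] -> [OH*OW x 1].
        std::vector<float> out(OH * OW, 0.0f);
        for (int r = 0; r < OH * OW; r++)
            for (int c = 0; c < KH * KW; c++)
                out[r] += unrolled[r * KH * KW + c] * kernel[c];

        for (int r = 0; r < OH * OW; r++) printf("%g ", out[r]);
        printf("\n"); // prints: 45 54 81 90 for this input
        return 0;
    }

The engine below applies the same idea per sub-batch with multiple input and output channels, reserving workspace for the unrolled inputs and, when subBatchSize > 1, for an intermediate output that is transposed into the final layout.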
@ -579,8 +573,6 @@ public:
protected:
using typename Base::IntMatPtr;
using Base::IsGpu;
using Base::m_geometry;
using Base::m_deviceId;
using Base::m_imageLayout;
@ -600,7 +592,7 @@ protected:
LogicError("GEMM convolution engine currently supports only CPU device.");
}
// A note on notation used in the documentation for the next 3 functions:
// for simplicity we use cuDNN-style notation for 2D convolutions (though this engine supports arbitrary convolution configuration)
// where N - is the number of samples in a batch, C, H, W are number of channels, height and width of the input respectively.
// For the output we use K as the number of output feature maps and H', W' as height and width of the output.
@ -632,7 +624,7 @@ protected:
size_t mapOutSize = m_geometry->OutputShape().GetNumElements() / mapCount;
size_t unrollRows = mapOutSize * subBatchSize;
size_t unrollCols = m_geometry->KernelShape().GetNumElements();
// Reserve space for unrolled inputs and, if needed, intermediate outputs.
// Intermediate outputs will be transposed to final outputs after GEMM operation.
// Transpose is not required if subBatchSize == 1.
workspace.Resize(unrollRows, unrollCols + (subBatchSize > 1 ? mapCount : 0));
@ -682,7 +674,7 @@ protected:
}
}
}
// The backward data method works by representing this operation as a "reverse" convolution
// in case kernel's last dimension is equal to input dimension. Gradients matrix (grad) becomes
// an output of such reverse convolution.
@ -734,7 +726,7 @@ protected:
workspace.Resize(1, kernCols + unrollRows * (unrollCols + (subBatchSize > 1 ? mapInCount : 0)));
auto kern = kernel.ColumnSlice(0, kernel.GetNumCols());
size_t kernTCols = kernT.GetNumElements();
// cudnn layout uses row-major kernel weight matrix.
kern.Reshape(kernTCols, kernCols/kernTCols);
// Now transpose and reshape to [KXY x C].
@ -862,7 +854,7 @@ protected:
// cudnn layout uses row-major kernel weight matrix.
auto kernGrad = kernelGrad.ColumnSlice(0, kernelGrad.GetNumCols());
kernGrad.Reshape(unrollRows, kernGrad.GetNumElements() / unrollRows);
// 3. Multiply.
Mat::MultiplyAndAdd(unrolledInputSlice, true, srcGradSlice, false, kernGrad);
}
@ -955,6 +947,11 @@ protected:
bool Supported(const ConvolveGeometry* geometry, bool forward)
{
//TODO: test code for linking with mkldnn.dll, will extend to support dilated convolution with MKL-DNN later
mkldnn_primitive_attr_t attr;
mkldnn_primitive_attr_create(&attr);
mkldnn_primitive_attr_destroy(attr);
//MKL2017 does not support asymmetric padding yet
if (geometry->IsAsymmetricPadding()) return false;
@ -1125,7 +1122,7 @@ public:
template <class ElemType>
std::unique_ptr<ConvolutionEngine<ElemType>> ConvolutionEngine<ElemType>::Create(ConvolveGeometryPtr geometry, DEVICEID_TYPE deviceId,
ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples, PoolKind poolKind,
ConvolutionEngineKind enabledEngines, std::wstring logPrefix,
bool forceDeterministicAlgorithms, bool poolIncludePad,
bool inputHasFreeDimension)
{
@ -1134,7 +1131,7 @@ std::unique_ptr<ConvolutionEngine<ElemType>> ConvolutionEngine<ElemType>::Create
auto isEnabled = [=](ConvolutionEngineKind eng) { return ((int)enabledEngines & (int)eng) != 0; };
// Note: in some cases do not throw exception even if parameters do not match as Create
// can be called from places like MEL with default parameters and never be used.
// The check will be done later in engine's EnsureCompatible call if the engine is actually used.
auto engStr = (std::string)(*geometry);
// Only legacy engine supports HWC layout.
@ -1156,7 +1153,7 @@ std::unique_ptr<ConvolutionEngine<ElemType>> ConvolutionEngine<ElemType>::Create
if (GetMathLibTraceLevel() > 0)
fprintf(stderr, "%lsusing cuDNN convolution engine for geometry: %s.\n", logPrefix.c_str(), engStr.c_str());
return CuDnnConvolutionEngineFactory<ElemType>::Create(geometry, deviceId, imageLayout, maxTempMemSizeInSamples, poolKind,
forceDeterministicAlgorithms, poolIncludePad, inputHasFreeDimension);
}
@ -1177,7 +1174,41 @@ std::unique_ptr<ConvolutionEngine<ElemType>> ConvolutionEngine<ElemType>::Create
return std::make_unique<ReferenceConvolutionEngine<ElemType>>(geometry, deviceId, imageLayout, maxTempMemSizeInSamples, poolKind, poolIncludePad);
}
// only GPU supports fp16 convolution
template <>
std::unique_ptr<ConvolutionEngine<half>> ConvolutionEngine<half>::Create(ConvolveGeometryPtr geometry, DEVICEID_TYPE deviceId,
ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples, PoolKind poolKind,
ConvolutionEngineKind enabledEngines, std::wstring logPrefix,
bool forceDeterministicAlgorithms, bool poolIncludePad,
bool inputHasFreeDimension)
{
if (!logPrefix.empty())
logPrefix += L": ";
auto isEnabled = [=](ConvolutionEngineKind eng) { return ((int)enabledEngines & (int)eng) != 0; };
// Note: in some cases do not throw exception even if parameters do not match as Create
// can be called from places like MEL with default parameters and never be used.
// The check will be done later in engine's EnsureCompatible call if the engine is actually used.
auto engStr = (std::string)(*geometry);
// Check if we can use cuDNN engine. Do not need to validate tensors as ConvolveGeometry has already done that.
if (isEnabled(ConvolutionEngineKind::CuDnn) &&
CuDnnConvolutionEngineFactory<half>::IsSupported(deviceId, geometry, poolKind))
{
if (GetMathLibTraceLevel() > 0)
fprintf(stderr, "%lsusing cuDNN convolution engine for geometry: %s.\n", logPrefix.c_str(), engStr.c_str());
return CuDnnConvolutionEngineFactory<half>::Create(geometry, deviceId, imageLayout, maxTempMemSizeInSamples, poolKind,
forceDeterministicAlgorithms, poolIncludePad, inputHasFreeDimension);
}
RuntimeError("FP16 convolution is only supported via cuDNN.");
return nullptr;
}
template class ConvolutionEngine<float>;
template class ConvolutionEngine<double>;
template class ConvolutionEngine<half>;
}}}

View file

@ -11,20 +11,21 @@
namespace Microsoft { namespace MSR { namespace CNTK {
template <class ElemType>
class CuDnnBatchNormEngine : public BatchNormEngine<ElemType>
template <class InoutType, class StatType>
class CuDnnBatchNormEngine : public BatchNormEngine<InoutType, StatType>
{
public:
using Base = BatchNormEngine<ElemType>;
using typename Base::Mat;
using Base = BatchNormEngine<InoutType, StatType>;
using typename Base::InoutMat;
using typename Base::StatMat;
public:
CuDnnBatchNormEngine(DEVICEID_TYPE deviceId, const TensorShape& inOutT,
bool spatial, ImageLayoutKind imageLayout)
: Base(deviceId, inOutT, spatial, imageLayout),
m_cudnn(CuDnn::Instance()),
m_inOutCuDnnT(GetInOutTensor(inOutT), CuDnnTensor::GetDataType<ElemType>()),
m_scaleBiasCuDnnT(GetScaleBiasTensor(inOutT, spatial), CuDnnTensor::GetDataType<ElemType>()),
m_inOutCuDnnT(GetInOutTensor(inOutT), CuDnnTensor::GetDataType<InoutType>()),
m_scaleBiasCuDnnT(GetScaleBiasTensor(inOutT, spatial), CuDnnTensor::GetDataType<StatType>()),
m_cudnnEpsilon(CUDNN_BN_MIN_EPSILON)
{
}
@ -43,8 +44,8 @@ protected:
InvalidArgument("cuDNN batch normalization supports tensors of max 4 dimensions.");
}
void ForwardCore(const Mat& in, const Mat& scale, const Mat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance,
Mat& out, double epsilon, Mat& savedMean, Mat& savedInvStdDev) override
void ForwardCore(const InoutMat& in, const StatMat& scale, const StatMat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, StatMat& runMean, StatMat& runVariance,
InoutMat& out, double epsilon, StatMat& savedMean, StatMat& savedInvStdDev) override
{
// TODO batchSize == 1
@ -53,7 +54,8 @@ protected:
InvalidArgument("cuDNN batch normalization engine currently supports blendTimeConstant of 0 or 1 only.");
m_inOutCuDnnT.UpdateBatchSize(in.GetNumCols());
cudnnBatchNormMode_t mode = m_spatial ? CUDNN_BATCHNORM_SPATIAL : CUDNN_BATCHNORM_PER_ACTIVATION;
cudnnBatchNormMode_t mode = m_spatial ? CUDNN_BATCHNORM_SPATIAL_PERSISTENT : CUDNN_BATCHNORM_PER_ACTIVATION;
if (inferenceOnly) mode = m_spatial ? CUDNN_BATCHNORM_SPATIAL : CUDNN_BATCHNORM_PER_ACTIVATION;
// cuDNN will fail with BAD_PARAM if epsilon < CUDNN_BN_MIN_EPSILON.
m_cudnnEpsilon = max(epsilon, CUDNN_BN_MIN_EPSILON);
if (inferenceOnly)
@ -75,23 +77,26 @@ protected:
}
}
void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, const Mat& savedMean, const Mat& savedInvStdDev,
Mat& scaleGrad, Mat& biasGrad, bool accumulateDataGrad) override
void BackwardCore(const InoutMat& in, const InoutMat& srcGrad, InoutMat& grad, const StatMat& scale, double blendFactor, const StatMat& savedMean, const StatMat& savedInvStdDev,
StatMat& scaleGrad, StatMat& biasGrad, bool accumulateDataGrad) override
{
UNUSED(blendFactor); // BUGBUG: It should be used.
m_inOutCuDnnT.UpdateBatchSize(srcGrad.GetNumCols());
cudnnBatchNormMode_t mode = m_spatial ? CUDNN_BATCHNORM_SPATIAL : CUDNN_BATCHNORM_PER_ACTIVATION;
cudnnBatchNormMode_t mode = m_spatial ? CUDNN_BATCHNORM_SPATIAL_PERSISTENT : CUDNN_BATCHNORM_PER_ACTIVATION;
// REVIEW alexeyk: change betaParamDiff to 1 and update CNTK BN engine.
CUDNN_CALL(cudnnBatchNormalizationBackward(*m_cudnn, mode, &C::One, accumulateDataGrad ? &C::One : &C::Zero, &C::One, &C::Zero, m_inOutCuDnnT, ptr(in), m_inOutCuDnnT, ptr(srcGrad), m_inOutCuDnnT, ptr(grad),
m_scaleBiasCuDnnT, ptr(scale), ptr(scaleGrad), ptr(biasGrad), m_cudnnEpsilon, ptr(savedMean), ptr(savedInvStdDev)));
}
private:
static ElemType* ptr(Mat& src)
template<typename ElemType>
static ElemType* ptr(Matrix<ElemType>& src)
{
return src.Data();
}
static const ElemType* ptr(const Mat& src)
template<typename ElemType>
static const ElemType* ptr(const Matrix<ElemType>& src)
{
return src.Data();
}
@ -123,7 +128,7 @@ private:
}
private:
using C = Consts<ElemType>;
using C = Consts<StatType>;
CuDnn::ptr_t m_cudnn;
CuDnnTensor m_inOutCuDnnT;
@ -131,18 +136,20 @@ private:
double m_cudnnEpsilon;
};
template class CuDnnBatchNormEngine<float>;
template class CuDnnBatchNormEngine<double>;
template class CuDnnBatchNormEngine<float, float>;
template class CuDnnBatchNormEngine<double, double>;
template class CuDnnBatchNormEngine<half, float>;
template <typename ElemType>
std::unique_ptr<BatchNormEngine<ElemType>> CuDnnBatchNormEngineFactory<ElemType>::Create(DEVICEID_TYPE deviceId, const TensorShape& inOutT,
template <typename InoutType, typename StatType>
std::unique_ptr<BatchNormEngine<InoutType, StatType>> CuDnnBatchNormEngineFactory<InoutType, StatType>::Create(DEVICEID_TYPE deviceId, const TensorShape& inOutT,
bool spatial, ImageLayoutKind imageLayout)
{
return std::make_unique<CuDnnBatchNormEngine<ElemType>>(deviceId, inOutT, spatial, imageLayout);
return std::make_unique<CuDnnBatchNormEngine<InoutType, StatType>>(deviceId, inOutT, spatial, imageLayout);
}
template class CuDnnBatchNormEngineFactory<float>;
template class CuDnnBatchNormEngineFactory<double>;
template class CuDnnBatchNormEngineFactory<float, float>;
template class CuDnnBatchNormEngineFactory<double, double>;
template class CuDnnBatchNormEngineFactory<half, float>;
CudaTimer::~CudaTimer()
{

View file

@ -6,6 +6,7 @@
#include "stdafx.h"
#include "GPUMatrix.h"
#include "CuDnnCommon.h"
#include "half.hpp"
namespace Microsoft { namespace MSR { namespace CNTK {
#ifndef CPUONLY
@ -23,6 +24,10 @@ const float Consts<float>::Zero = 0;
template <>
const double Consts<double>::Zero = 0;
const float Consts<half>::Zero = 0;
const float Consts<half>::One = 1;
CuDnnTensor::CuDnnTensor()
: m_tensor(nullptr)
{
@ -31,7 +36,7 @@ CuDnnTensor::CuDnnTensor()
CuDnnTensor::CuDnnTensor(const TensorShape& src, cudnnDataType_t dataType)
: m_tensor(nullptr)
{
Set(src, dataType);
}
CuDnnTensor::~CuDnnTensor()
@ -85,12 +90,15 @@ cudnnDataType_t CuDnnTensor::GetDataType()
return CUDNN_DATA_FLOAT;
else if (typeid(ElemType) == typeid(double))
return CUDNN_DATA_DOUBLE;
else if (typeid(ElemType) == typeid(half))
return CUDNN_DATA_HALF;
else
InvalidArgument("cuDNN engine currently supports only single and double precision data types.");
}
template cudnnDataType_t CuDnnTensor::GetDataType<float>();
template cudnnDataType_t CuDnnTensor::GetDataType<double>();
template cudnnDataType_t CuDnnTensor::GetDataType<half>();
CuDnn::ptr_t CuDnn::Instance()
{

View file

@ -12,6 +12,7 @@
#error CNTK requires the NVIDIA cuDNN library 5.0 or higher to build, cf. https://docs.microsoft.com/en-us/cognitive-toolkit/Setup-CNTK-on-Windows#cudnn or https://docs.microsoft.com/en-us/cognitive-toolkit/Setup-CNTK-on-Linux#cudnn for installation instructions.
#endif
#include <memory>
#include "half.hpp"
namespace Microsoft { namespace MSR { namespace CNTK {
@ -51,4 +52,11 @@ struct Consts
static const ElemType One;
};
template <>
struct Consts<half>
{
static const float Zero;
static const float One;
};
} } }

View file

@ -1,4 +1,3 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
@ -11,6 +10,10 @@
#include <typeinfo>
#include <typeindex>
#include "CuDnnCommon.h"
#include "half.hpp"
// We want tensor cores enabled so that the get(v7)/find calls return tensor-core algorithms. If an algorithm without tensor cores is faster, the only way to force that faster algorithm is to turn tensor cores off; since re-tuning can happen quite often in CNTK, that has to be done carefully, and it also requires moving to the get_v7 API, which cannot be tested until fp16 runs.
// For now, keep it simple and enable tensor cores all the time for fp16.
template <>
const char* CudaErrString<cudnnStatus_t>(cudnnStatus_t x)
@ -26,11 +29,6 @@ const char* CudaErrString<cudnnStatus_t>(cudnnStatus_t x)
namespace Microsoft { namespace MSR { namespace CNTK {
static bool IsGpu(DEVICEID_TYPE deviceId)
{
return deviceId >= 0;
}
class CuDnnKernel
{
public:
@ -54,6 +52,9 @@ public:
// Set map count(aka K) dimension.
dims[0] = (int)mapCount;
dims[1] = (int)filt[filt_size - 1];
int numElems = 1;
for(int i=0; i<(int)dim_size;i++) numElems *= dims[i];
m_isOdd = (numElems%2==1);
CUDNN_CALL(cudnnSetFilterNdDescriptor(m_kernel, dataType, FILTER_FORMAT, (int)dim_size, dims.data()));
}
@ -71,10 +72,16 @@ public:
return m_kernel;
}
bool isOdd()
{
return m_isOdd;
}
DISABLE_COPY_AND_MOVE(CuDnnKernel);
private:
cudnnFilterDescriptor_t m_kernel;
bool m_isOdd;
};
class CuDnnConv
@ -102,7 +109,10 @@ public:
}
CUDNN_CALL(cudnnSetConvolutionNdDescriptor(m_conv, (int)dim_size, pad.data(),
stride.data(), dilation.data(),
CUDNN_CROSS_CORRELATION, dataType));
CUDNN_CROSS_CORRELATION, dataType == CUDNN_DATA_HALF ? CUDNN_DATA_FLOAT : dataType));
// allow tensor core for fp16 by default
if(dataType == CUDNN_DATA_HALF)
CUDNN_CALL(cudnnSetConvolutionMathType(m_conv, CUDNN_TENSOR_OP_MATH));
}
~CuDnnConv()
@ -286,7 +296,7 @@ protected:
calgo = 1; // set count of algorithms
return result;
};
// find workspace size needed to auto-tune all algorithms, as well as the size needed for deterministic algorithm
auto workspaceSizeFinder = [&, this]() -> cudnnStatus_t
{
size_t tmpSize;
@ -306,6 +316,8 @@ protected:
return err;
};
FindBestAlgo(batchSize, m_fwdAlgo, workspaceSizeFinder, deterministicFinder, finder, staticFinder, workspace);
if(m_dataType == CUDNN_DATA_HALF) CUDNN_CALL(cudnnSetConvolutionMathType(*m_conv, m_fwdAlgo.AlgoMathType));
else CUDNN_CALL(cudnnSetConvolutionMathType(*m_conv, CUDNN_DEFAULT_MATH));
// Perform forward convolution operation.
CUDNN_CALL(cudnnConvolutionForward(*m_cudnn, &C::One, m_inT, ptr(in), *m_kernelT, ptr(kernel), *m_conv, m_fwdAlgo.selectedAlgo, ptr(workspace), workspace.BufferSize(), &C::Zero, m_outT, ptr(out)));
}
@ -369,6 +381,8 @@ protected:
};
FindBestAlgo(batchSize, m_backDataAlgo, workspaceSizeFinder, deterministicFinder, finder, staticFinder, workspace);
// Compute gradients with respect to the output tensor (data).
if(m_dataType == CUDNN_DATA_HALF) CUDNN_CALL(cudnnSetConvolutionMathType(*m_conv, m_backDataAlgo.AlgoMathType));
else CUDNN_CALL(cudnnSetConvolutionMathType(*m_conv, CUDNN_DEFAULT_MATH));
CUDNN_CALL(cudnnConvolutionBackwardData(*m_cudnn, &C::One, *m_kernelT, ptr(kernel), m_outT, ptr(srcGrad), *m_conv, m_backDataAlgo.selectedAlgo, ptr(workspace), workspace.BufferSize(), accumulateGradient ? &C::One : &C::Zero, m_inT, ptr(grad)));
}
@ -396,6 +410,15 @@ protected:
{
if(!noMem)
return cudnnGetConvolutionBackwardFilterAlgorithm(*m_cudnn, m_inT, m_outT, *m_conv, *m_kernelT, CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, workspace.BufferSize(), &algo);
// special case for half/odd filter
if(m_kernelT->isOdd() && m_dataType == CUDNN_DATA_HALF)
{
size_t tmpSize = 0;
algo = (cudnnConvolutionBwdFilterAlgo_t) 1;
auto err = cudnnGetConvolutionBackwardFilterWorkspaceSize(*m_cudnn, m_inT, m_outT, *m_conv, *m_kernelT, algo, &tmpSize);
workspace.Resize((tmpSize + sizeof(ElemType) - 1) / sizeof(ElemType), 1);
return err;
}
return cudnnGetConvolutionBackwardFilterAlgorithm(*m_cudnn, m_inT, m_outT, *m_conv, *m_kernelT, CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE, 0, &algo);
};
// find deterministic algorithm
@ -431,6 +454,8 @@ protected:
};
FindBestAlgo(batchSize, m_backFiltAlgo, workspaceSizeFinder, deterministicFinder, finder, staticFinder, workspace);
// Compute gradients with respect to the output tensor (data).
if(m_dataType == CUDNN_DATA_HALF) CUDNN_CALL(cudnnSetConvolutionMathType(*m_conv, m_backFiltAlgo.AlgoMathType));
else CUDNN_CALL(cudnnSetConvolutionMathType(*m_conv, CUDNN_DEFAULT_MATH));
CUDNN_CALL(cudnnConvolutionBackwardFilter(*m_cudnn, &C::One, m_inT, ptr(in), m_outT, ptr(srcGrad), *m_conv, m_backFiltAlgo.selectedAlgo, ptr(workspace), workspace.BufferSize(), accumulateGradient ? &C::One : &C::Zero, *m_kernelT, ptr(kernelGrad)));
}
@ -512,7 +537,7 @@ private:
assert(calgo == 1); // only one deterministic algorithm will be returned
algo.RecordAlgoBatchSizeWorkspaceSize(true, (*algoPerf).algo, batchSize, (*algoPerf).memory);
algo.autotuningState = AutotuningState::Running; // no further need for tuning since this is deterministic, directly enter running state
}
else
{
// This branch handles two cases: a) When first MB comes through, and b) When input has free dimensions.
@ -556,6 +581,7 @@ private:
assert(calgo > 0);
auto res = algoPerf; // first returned algorithm is the fastest
algo.RecordAlgoBatchSizeWorkspaceSize(true, (*res).algo, batchSize, (*res).memory);
algo.AlgoMathType = (*res).mathType;
algo.autotuningState = AutotuningState::Running;
if (algo.MaxAlgoWorkspaceSize < curSize) // need to shrink the workspace
workspace.Resize((curSize + sizeof(ElemType) - 1) / sizeof(ElemType), 1, 0, false);
@ -573,6 +599,7 @@ private:
assert(calgo > 0);
auto res = algoPerf; // first returned algorithm is the fastest
algo.RecordAlgoBatchSizeWorkspaceSize(true, (*res).algo, batchSize, (*res).memory);
algo.AlgoMathType = (*res).mathType;
algo.autotuningState = AutotuningState::Running;
}
catch (...)
@ -613,7 +640,7 @@ private:
{
typedef T typeT;
ConvAlgoInfo()
: LastBatchAlgoMBSize(0), MaxAlgoMBSize(0), maxMBSizeSeen(0), autotuningState(AutotuningState::Init), MaxAlgoWorkspaceSize(0), LastBatchAlgoWorkspaceSize(0)
: LastBatchAlgoMBSize(0), MaxAlgoMBSize(0), maxMBSizeSeen(0), autotuningState(AutotuningState::Init), MaxAlgoWorkspaceSize(0), LastBatchAlgoWorkspaceSize(0), AlgoMathType(CUDNN_TENSOR_OP_MATH)
{
}
// Variables to stores states
@ -631,6 +658,8 @@ private:
decltype(T::algo) selectedAlgo; // currently selected algorithm
decltype(T::algo) maxAlgo; // algorithm that was selected when the current workspace is allocated
cudnnMathType_t AlgoMathType;
bool NeedAutotuning(size_t batchSize, size_t workspaceSize)
{
// NVIDIA:
@ -687,7 +716,7 @@ std::unique_ptr<ConvolutionEngine<ElemType>> CuDnnConvolutionEngineFactory<ElemT
bool forceDeterministicAlgorithms, bool poolIncludePad,
bool inputHasFreeDimension)
{
return std::make_unique<CuDnnConvolutionEngine<ElemType>>(geometry, deviceId, imageLayout, maxTempMemSizeInSamples, poolKind,
forceDeterministicAlgorithms, poolIncludePad, inputHasFreeDimension);
}
@ -741,5 +770,6 @@ bool CuDnnConvolutionEngineFactory<ElemType>::IsSupported(DEVICEID_TYPE deviceId
template class CuDnnConvolutionEngineFactory<float>;
template class CuDnnConvolutionEngineFactory<double>;
template class CuDnnConvolutionEngineFactory<half>;
} } }

View file

@ -21,11 +21,11 @@ public:
static bool IsSupported(DEVICEID_TYPE deviceId, ConvolveGeometryPtr geometry, PoolKind poolKind);
};
template <class ElemType>
template <class InoutType, class StatType>
class CuDnnBatchNormEngineFactory
{
public:
static std::unique_ptr<BatchNormEngine<ElemType>> Create(DEVICEID_TYPE deviceId, const TensorShape& inOutT,
static std::unique_ptr<BatchNormEngine<InoutType, StatType>> Create(DEVICEID_TYPE deviceId, const TensorShape& inOutT,
bool spatial, ImageLayoutKind imageLayout);
};

Some files were not shown because too many files have changed in this diff.