CNTK support for CUDA 9
CNTK now supports CUDA 9/cuDNN 7. This requires an update of the build environment to Ubuntu 16/GCC 5 for Linux, and Visual Studio 2017/VCTools 14.11 for Windows. With CUDA 9, CNTK also adds a preview of 16-bit floating point (a.k.a. FP16) computation. Please check out the FP16 example for ResNet50 at /Examples/Image/Classification/ResNet/Python/TrainResNet_ImageNet_Distributed.py

Notes on the FP16 preview:
* The FP16 implementation on CPU is not optimized and is not intended for direct CPU inference. Convert the model to 32-bit floating point before running it on CPU.
* The loss/criterion for FP16 training needs to be 32-bit so it can accumulate without overflow; use the cast function as in the example above (a minimal sketch also follows at the end of this description).
* Readers do not produce FP16 output; unless data is fed through numpy, a cast from FP32 to FP16 is needed. Please check the example above.
* FP16 gradient aggregation is currently only implemented on GPU using NCCL2. Distributed FP16 training over MPI is not supported.
* FP16 math is a subset of the current FP32 implementation. Some models may raise a "Feature Not Implemented" exception when using FP16.
* FP16 is currently not supported in BrainScript. Please use Python for FP16.

To set up the build and runtime environment on Windows:
* Install [Visual Studio 2017](https://www.visualstudio.com/downloads/) with the following workloads and components. From the command line (using the Community edition installer as an example): vs_community.exe --add Microsoft.VisualStudio.Workload.NativeDesktop --add Microsoft.VisualStudio.Workload.ManagedDesktop --add Microsoft.VisualStudio.Workload.Universal --add Microsoft.Component.PythonTools --add Microsoft.VisualStudio.Component.VC.Tools.14.11
* Install [NVidia CUDA 9](https://developer.nvidia.com/cuda-90-download-archive?target_os=Windows&target_arch=x86_64)
* From PowerShell, run: /Tools/devInstall/Windows/DevInstall.ps1
* Start the VCTools 14.11 command line, then run: cmd /k "%VS2017INSTALLDIR%\VC\Auxiliary\Build\vcvarsall.bat" x64 --vcvars_ver=14.11
* Open /CNTK.sln from the VCTools 14.11 command line. Note that opening CNTK.sln from anything other than the VCTools 14.11 command line causes a CUDA 9 [build error](https://developercommunity.visualstudio.com/content/problem/163758/vs-2017-155-doesnt-support-cuda-9.html).

To set up the build and runtime environment on Linux using docker, please build an Ubuntu 16.04 docker image using the Dockerfiles under /Tools/docker. For other Linux systems, please refer to the Dockerfiles to set up the dependent libraries for CNTK.
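For reference, here is a minimal Python sketch of the FP16 pattern described in the notes above: cast FP32 inputs to FP16 for the graph, build the model under `default_options(dtype=np.float16)`, and cast the criterion back to FP32 for accumulation. The tiny dense model, feature/class sizes, and variable names are illustrative assumptions only; they are not part of this commit, which uses ResNet in the linked example.

```python
# Illustrative sketch only (hypothetical toy model, not the ResNet50 example from this commit).
# Requires a GPU build of CNTK; the FP16 CPU path is not optimized (see notes above).
import numpy as np
import cntk as C

num_features, num_classes = 784, 10

# Inputs stay FP32 (readers / numpy feeds are FP32) and are cast to FP16 for the graph.
input_var = C.input_variable((num_features,), name='features')
label_var = C.input_variable((num_classes,), name='labels')
graph_input = C.cast(input_var, dtype=np.float16)
graph_label = C.cast(label_var, dtype=np.float16)

with C.default_options(dtype=np.float16):
    # Parameters and math created inside this scope are FP16.
    z = C.layers.Dense(num_classes)(graph_input)

# Cast loss/metric back to FP32 so accumulation does not overflow.
ce = C.cast(C.cross_entropy_with_softmax(z, graph_label), dtype=np.float32)
pe = C.cast(C.classification_error(z, graph_label), dtype=np.float32)
```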
This commit is contained in:
Parent
3765da9ae7
Commit
3cf3af5df6
|
@ -65,6 +65,7 @@ Makefile text
|
|||
*.asax text
|
||||
|
||||
*.h text
|
||||
*.hpp text
|
||||
*.cpp text
|
||||
*.cc text
|
||||
*.cu text
|
||||
|
|
|
@ -3,19 +3,10 @@
|
|||
<Import Project="$(SolutionDir)\CNTK.Common.props" />
|
||||
<PropertyGroup>
|
||||
<CudaVersion />
|
||||
<CudaVersion Condition="Exists('$(CUDA_PATH_V8_0)') And '$(CudaVersion)' == ''">8.0</CudaVersion>
|
||||
<CudaVersion Condition="Exists('$(CUDA_PATH_V7_5)') And '$(CudaVersion)' == ''">7.5</CudaVersion>
|
||||
|
||||
<NvmlInclude />
|
||||
<NvmlInclude Condition="'$(CudaVersion)' == '7.5'">"c:\Program Files\NVIDIA Corporation\GDK\gdk_win7_amd64_release\nvml\include"</NvmlInclude>
|
||||
<NvmlInclude Condition="'$(CudaVersion)' == '8.0'" />
|
||||
|
||||
<NvmlLibPath />
|
||||
<NvmlLibPath Condition="'$(CudaVersion)' == '7.5'">"c:\Program Files\NVIDIA Corporation\GDK\gdk_win7_amd64_release\nvml\lib"</NvmlLibPath>
|
||||
<NvmlLibPath Condition="'$(CudaVersion)' == '8.0'" />
|
||||
<CudaVersion Condition="Exists('$(CUDA_PATH_V9_0)') And '$(CudaVersion)' == ''">9.0</CudaVersion>
|
||||
|
||||
<NvmlDll>%ProgramW6432%\NVIDIA Corporation\NVSMI\nvml.dll</NvmlDll>
|
||||
<NvmlDll Condition="Exists('c:\local\bindrop\NVSMI\nvml.dll')">c:\local\bindrop\NVSMI\nvml.dll</NvmlDll>
|
||||
<NvmlDll Condition="Exists('c:\local\nvsmi9\NVSMI\nvml.dll')">c:\local\nvsmi9\NVSMI\nvml.dll</NvmlDll>
|
||||
|
||||
<HasOpenCv>false</HasOpenCv>
|
||||
<HasOpenCv Condition="Exists('$(OPENCV_PATH)') Or Exists('$(OPENCV_PATH_V31)')">true</HasOpenCv>
|
||||
|
@ -65,16 +56,20 @@
|
|||
|
||||
<PropertyGroup Condition="!$(IsUWP)">
|
||||
<MathLibrary>MKL</MathLibrary>
|
||||
<MathIncludePath>$(MKLML_PATH)\include</MathIncludePath>
|
||||
<MathIncludePath>$(MKL_PATH)\include</MathIncludePath>
|
||||
<MathDefine>USE_MKL</MathDefine>
|
||||
<!-- Only non-UWP configurations consume PerformanceProfiler -->
|
||||
<ReaderLibs>Cntk.PerformanceProfiler-$(CntkComponentVersion).lib;$(ReaderLibs)</ReaderLibs>
|
||||
<MathLibraryName>MKL-ML Library</MathLibraryName>
|
||||
<MathLibraryPath>$(MKLML_PATH)\lib</MathLibraryPath>
|
||||
<MathLibraryName>MKL Library</MathLibraryName>
|
||||
<MathLibraryPath>$(MKL_PATH)\lib</MathLibraryPath>
|
||||
<MathLinkLibrary>mklml.lib</MathLinkLibrary>
|
||||
<MathDelayLoad>mklml.dll</MathDelayLoad>
|
||||
<MathPostBuildCopyPattern>$(MathLibraryPath)\*.dll</MathPostBuildCopyPattern>
|
||||
<UnitTestDlls>$(OutDir)mklml.lib;$(OutDir)libiomp5md.dll;</UnitTestDlls>
|
||||
<HasMklDnn>false</HasMklDnn>
|
||||
<HasMklDnn Condition="Exists('$(MKL_PATH)\include\mkldnn.h')">true</HasMklDnn>
|
||||
<MathDefine Condition="$(HasMklDnn)">$(MathDefine);USE_MKLDNN</MathDefine>
|
||||
<MathLinkLibrary Condition="$(HasMklDnn)">$(MathLinkLibrary);mkldnn.lib</MathLinkLibrary>
|
||||
<MathDelayLoad Condition="$(HasMklDnn)">$(MathDelayLoad);mkldnn.dll</MathDelayLoad>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="$(UseZip)">
|
||||
<ZipInclude>$(ZLIB_PATH)\include;$(ZLIB_PATH)\lib\libzip\include;</ZipInclude>
|
||||
|
@ -109,10 +104,10 @@
|
|||
<ProtobufLib Condition="$(DebugBuild)">libprotobufd.lib</ProtobufLib>
|
||||
</PropertyGroup>
|
||||
|
||||
<PropertyGroup Condition="'$(CudaVersion)' == '8.0'">
|
||||
<CudaPath>$(CUDA_PATH_V8_0)</CudaPath>
|
||||
<CudaRuntimeDll>cudart64_80.dll</CudaRuntimeDll>
|
||||
<CudaDlls>cublas64_80.dll;cusparse64_80.dll;curand64_80.dll;$(CudaRuntimeDll)</CudaDlls>
|
||||
<PropertyGroup Condition="'$(CudaVersion)' == '9.0'">
|
||||
<CudaPath>$(CUDA_PATH_V9_0)</CudaPath>
|
||||
<CudaRuntimeDll>cudart64_90.dll</CudaRuntimeDll>
|
||||
<CudaDlls>cublas64_90.dll;cusparse64_90.dll;curand64_90.dll;$(CudaRuntimeDll)</CudaDlls>
|
||||
|
||||
<!-- Use NvidiaCompute to define nvcc target architectures (will generate code to support them all, i.e. fat-binary, in release mode)
|
||||
In debug mode we only include cubin/PTX for 30 and rely on PTX / JIT to generate the required native cubin format
|
||||
|
@ -121,19 +116,7 @@
|
|||
<NvidiaCompute Condition="$(DebugBuild) And '$(NvidiaCompute)'==''">compute_30,sm_30</NvidiaCompute>
|
||||
|
||||
<NvidiaCompute Condition="$(ReleaseBuild)">$(CNTK_CUDA_CODEGEN_RELEASE)</NvidiaCompute>
|
||||
<NvidiaCompute Condition="$(ReleaseBuild) And '$(NvidiaCompute)'==''">compute_30,sm_30;compute_35,sm_35;compute_50,sm_50;compute_60,sm_60;compute_61,sm_61</NvidiaCompute>
|
||||
</PropertyGroup>
|
||||
|
||||
<PropertyGroup Condition="'$(CudaVersion)' == '7.5'">
|
||||
<CudaPath>$(CUDA_PATH_V7_5)</CudaPath>
|
||||
<CudaRuntimeDll>cudart64_75.dll</CudaRuntimeDll>
|
||||
<CudaDlls>cublas64_75.dll;cusparse64_75.dll;curand64_75.dll;$(CudaRuntimeDll)</CudaDlls>
|
||||
|
||||
<NvidiaCompute Condition="$(DebugBuild)">$(CNTK_CUDA_CODEGEN_DEBUG)</NvidiaCompute>
|
||||
<NvidiaCompute Condition="$(DebugBuild) And '$(NvidiaCompute)'==''">compute_30,sm_30</NvidiaCompute>
|
||||
|
||||
<NvidiaCompute Condition="$(ReleaseBuild)">$(CNTK_CUDA_CODEGEN_RELEASE)</NvidiaCompute>
|
||||
<NvidiaCompute Condition="$(ReleaseBuild) And '$(NvidiaCompute)'==''">compute_30,sm_30;compute_35,sm_35;compute_50,sm_50</NvidiaCompute>
|
||||
<NvidiaCompute Condition="$(ReleaseBuild) And '$(NvidiaCompute)'==''">compute_30,sm_30;compute_35,sm_35;compute_50,sm_50;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70</NvidiaCompute>
|
||||
</PropertyGroup>
|
||||
|
||||
<PropertyGroup>
|
||||
|
@ -144,11 +127,14 @@
|
|||
<CudaMsbuildPath Condition="'$(CudaMsbuildPath)' == ''">$(VCTargetsPath)\BuildCustomizations</CudaMsbuildPath>
|
||||
</PropertyGroup>
|
||||
|
||||
<PropertyGroup>
|
||||
<PlatformToolset>v141</PlatformToolset>
|
||||
</PropertyGroup>
|
||||
|
||||
<!-- TODO warn if ConfigurationType not (yet) defined -->
|
||||
|
||||
<PropertyGroup Condition="'$(ConfigurationType)' == 'StaticLibrary'">
|
||||
<UseDebugLibraries>$(DebugBuild)</UseDebugLibraries>
|
||||
<PlatformToolset>v140</PlatformToolset>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
<WholeProgramOptimization>$(ReleaseBuild)</WholeProgramOptimization>
|
||||
<LinkIncremental>$(DebugBuild)</LinkIncremental>
|
||||
|
@ -159,6 +145,7 @@
|
|||
<PreprocessorDefinitions>CNTK_COMPONENT_VERSION="$(CntkComponentVersion)"</PreprocessorDefinitions>
|
||||
<!-- UWP does not use MPI -->
|
||||
<PreprocessorDefinitions Condition="!$(IsUWP)">%(PreprocessorDefinitions);HAS_MPI=1</PreprocessorDefinitions>
|
||||
<PreprocessorDefinitions Condition="'$(CudaVersion)' == '9.0'">%(PreprocessorDefinitions);CUDA_NO_HALF;__CUDA_NO_HALF_OPERATORS__</PreprocessorDefinitions>
|
||||
</ClCompile>
|
||||
</ItemDefinitionGroup>
|
||||
|
||||
|
|
CNTK.sln (780 changed lines): diff not shown because of its size.
|
@ -1,5 +1,5 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|x64">
|
||||
<Configuration>Debug</Configuration>
|
||||
|
@ -39,13 +39,11 @@
|
|||
<PropertyGroup Condition="$(DebugBuild)" Label="Configuration">
|
||||
<ConfigurationType>DynamicLibrary</ConfigurationType>
|
||||
<UseDebugLibraries>true</UseDebugLibraries>
|
||||
<PlatformToolset>v140</PlatformToolset>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="$(ReleaseBuild)" Label="Configuration">
|
||||
<ConfigurationType>DynamicLibrary</ConfigurationType>
|
||||
<UseDebugLibraries>false</UseDebugLibraries>
|
||||
<PlatformToolset>v140</PlatformToolset>
|
||||
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|x64">
|
||||
<Configuration>Debug</Configuration>
|
||||
|
@ -39,13 +39,11 @@
|
|||
<PropertyGroup Condition="$(DebugBuild)" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>true</UseDebugLibraries>
|
||||
<PlatformToolset>v140</PlatformToolset>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="$(ReleaseBuild)" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>false</UseDebugLibraries>
|
||||
<PlatformToolset>v140</PlatformToolset>
|
||||
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|x64">
|
||||
<Configuration>Debug</Configuration>
|
||||
|
@ -39,13 +39,11 @@
|
|||
<PropertyGroup Condition="$(DebugBuild)" Label="Configuration">
|
||||
<ConfigurationType>DynamicLibrary</ConfigurationType>
|
||||
<UseDebugLibraries>true</UseDebugLibraries>
|
||||
<PlatformToolset>v140</PlatformToolset>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="$(ReleaseBuild)" Label="Configuration">
|
||||
<ConfigurationType>DynamicLibrary</ConfigurationType>
|
||||
<UseDebugLibraries>false</UseDebugLibraries>
|
||||
<PlatformToolset>v140</PlatformToolset>
|
||||
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|x64">
|
||||
<Configuration>Debug</Configuration>
|
||||
|
@ -42,13 +42,11 @@
|
|||
<PropertyGroup Condition="$(DebugBuild)" Label="Configuration">
|
||||
<ConfigurationType>DynamicLibrary</ConfigurationType>
|
||||
<UseDebugLibraries>true</UseDebugLibraries>
|
||||
<PlatformToolset>v140</PlatformToolset>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="$(ReleaseBuild)" Label="Configuration">
|
||||
<ConfigurationType>DynamicLibrary</ConfigurationType>
|
||||
<UseDebugLibraries>false</UseDebugLibraries>
|
||||
<PlatformToolset>v140</PlatformToolset>
|
||||
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
|
|
|
@ -56,7 +56,7 @@ def create_image_mb_source(map_file, mean_file, train, total_number_of_samples):
|
|||
|
||||
# Train and evaluate the network.
|
||||
def train_and_evaluate(reader_train, reader_test, network_name, epoch_size, max_epochs, profiler_dir=None,
|
||||
model_dir=None, log_dir=None, tensorboard_logdir=None, gen_heartbeat=False):
|
||||
model_dir=None, log_dir=None, tensorboard_logdir=None, gen_heartbeat=False, fp16=False):
|
||||
|
||||
set_computation_network_trace_level(0)
|
||||
|
||||
|
@ -64,19 +64,32 @@ def train_and_evaluate(reader_train, reader_test, network_name, epoch_size, max_
|
|||
input_var = C.input_variable((num_channels, image_height, image_width), name='features')
|
||||
label_var = C.input_variable((num_classes))
|
||||
|
||||
dtype = np.float16 if fp16 else np.float32
|
||||
if fp16:
|
||||
graph_input = C.cast(input_var, dtype=np.float16)
|
||||
graph_label = C.cast(label_var, dtype=np.float16)
|
||||
else:
|
||||
graph_input = input_var
|
||||
graph_label = label_var
|
||||
|
||||
with C.default_options(dtype=dtype):
|
||||
# create model, and configure learning parameters
|
||||
if network_name == 'resnet20':
|
||||
z = create_cifar10_model(input_var, 3, num_classes)
|
||||
z = create_cifar10_model(graph_input, 3, num_classes)
|
||||
lr_per_mb = [1.0]*80 + [0.1]*40 + [0.01]
|
||||
elif network_name == 'resnet110':
|
||||
z = create_cifar10_model(input_var, 18, num_classes)
|
||||
z = create_cifar10_model(graph_input, 18, num_classes)
|
||||
lr_per_mb = [0.1]*1 + [1.0]*80 + [0.1]*40 + [0.01]
|
||||
else:
|
||||
raise RuntimeError("Unknown model name!")
|
||||
|
||||
# loss and metric
|
||||
ce = cross_entropy_with_softmax(z, label_var)
|
||||
pe = classification_error(z, label_var)
|
||||
ce = cross_entropy_with_softmax(z, graph_label)
|
||||
pe = classification_error(z, graph_label)
|
||||
|
||||
if fp16:
|
||||
ce = C.cast(ce, dtype=np.float32)
|
||||
pe = C.cast(pe, dtype=np.float32)
|
||||
|
||||
# shared training parameters
|
||||
minibatch_size = 128
|
||||
|
@ -168,6 +181,7 @@ if __name__=='__main__':
|
|||
parser.add_argument('-outputdir', '--outputdir', help='Output directory for checkpoints and models', required=False, default=None)
|
||||
parser.add_argument('-logdir', '--logdir', help='Log file', required=False, default=None)
|
||||
parser.add_argument('-genheartbeat', '--genheartbeat', help="Turn on heart-beat for philly", action='store_true', default=False)
|
||||
parser.add_argument('-fp16', '--fp16', help="use float16", action='store_true', default=False)
|
||||
|
||||
args = vars(parser.parse_args())
|
||||
epochs = args['epochs']
|
||||
|
@ -184,4 +198,4 @@ if __name__=='__main__':
|
|||
reader_test = create_image_mb_source(os.path.join(data_path, 'test_map.txt'), os.path.join(data_path, 'CIFAR-10_mean.xml'), False, total_number_of_samples=C.io.FULL_DATA_SWEEP)
|
||||
|
||||
train_and_evaluate(reader_train, reader_test, network_name, epoch_size, epochs, args['profiler_dir'], model_dir,
|
||||
args['logdir'], args['tensorboard_logdir'], gen_heartbeat=args['genheartbeat'])
|
||||
args['logdir'], args['tensorboard_logdir'], gen_heartbeat=args['genheartbeat'], fp16=args['fp16'])
|
||||
|
|
|
@ -37,22 +37,35 @@ num_classes = 10
|
|||
model_name = "ResNet_CIFAR10_DataAug.model"
|
||||
|
||||
# Create network
|
||||
def create_resnet_network(network_name):
|
||||
def create_resnet_network(network_name, fp16):
|
||||
# Input variables denoting the features and label data
|
||||
input_var = C.input_variable((num_channels, image_height, image_width))
|
||||
label_var = C.input_variable((num_classes))
|
||||
|
||||
dtype = np.float16 if fp16 else np.float32
|
||||
if fp16:
|
||||
graph_input = C.cast(input_var, dtype=np.float16)
|
||||
graph_label = C.cast(label_var, dtype=np.float16)
|
||||
else:
|
||||
graph_input = input_var
|
||||
graph_label = label_var
|
||||
|
||||
with C.default_options(dtype=dtype):
|
||||
# create model, and configure learning parameters
|
||||
if network_name == 'resnet20':
|
||||
z = create_cifar10_model(input_var, 3, num_classes)
|
||||
z = create_cifar10_model(graph_input, 3, num_classes)
|
||||
elif network_name == 'resnet110':
|
||||
z = create_cifar10_model(input_var, 18, num_classes)
|
||||
z = create_cifar10_model(graph_input, 18, num_classes)
|
||||
else:
|
||||
raise RuntimeError("Unknown model name!")
|
||||
|
||||
# loss and metric
|
||||
ce = cross_entropy_with_softmax(z, label_var)
|
||||
pe = classification_error(z, label_var)
|
||||
ce = cross_entropy_with_softmax(z, graph_label)
|
||||
pe = classification_error(z, graph_label)
|
||||
|
||||
if fp16:
|
||||
ce = C.cast(ce, dtype=np.float32)
|
||||
pe = C.cast(pe, dtype=np.float32)
|
||||
|
||||
return {
|
||||
'name' : network_name,
|
||||
|
@ -120,7 +133,7 @@ def train_and_test(network, trainer, train_source, test_source, minibatch_size,
|
|||
|
||||
# Train and evaluate the network.
|
||||
def resnet_cifar10(train_data, test_data, mean_data, network_name, epoch_size, num_quantization_bits=32, block_size=None, warm_up=0,
|
||||
max_epochs=160, restore=True, log_to_file=None, num_mbs_per_log=None, gen_heartbeat=False, scale_up=False, profiling=False):
|
||||
max_epochs=160, restore=True, log_to_file=None, num_mbs_per_log=None, gen_heartbeat=False, scale_up=False, profiling=False, fp16=False):
|
||||
|
||||
set_computation_network_trace_level(0)
|
||||
|
||||
|
@ -138,7 +151,7 @@ def resnet_cifar10(train_data, test_data, mean_data, network_name, epoch_size, n
|
|||
gen_heartbeat=gen_heartbeat,
|
||||
num_epochs=max_epochs)
|
||||
|
||||
network = create_resnet_network(network_name)
|
||||
network = create_resnet_network(network_name, fp16)
|
||||
trainer = create_trainer(network, minibatch_size, epoch_size, num_quantization_bits, block_size, warm_up, progress_printer)
|
||||
train_source = create_image_mb_source(train_data, mean_data, train=True, total_number_of_samples=max_epochs * epoch_size)
|
||||
test_source = create_image_mb_source(test_data, mean_data, train=False, total_number_of_samples=C.io.FULL_DATA_SWEEP)
|
||||
|
@ -162,6 +175,7 @@ if __name__=='__main__':
|
|||
parser.add_argument('-r', '--restart', help='Indicating whether to restart from scratch (instead of restart from checkpoint file by default)', action='store_true')
|
||||
parser.add_argument('-device', '--device', type=int, help="Force to run the script on a specified device", required=False, default=None)
|
||||
parser.add_argument('-profile', '--profile', help="Turn on profiling", action='store_true', default=False)
|
||||
parser.add_argument('-fp16', '--fp16', help="use float16", action='store_true', default=False)
|
||||
|
||||
args = vars(parser.parse_args())
|
||||
|
||||
|
@ -201,7 +215,8 @@ if __name__=='__main__':
|
|||
restore=not args['restart'],
|
||||
scale_up=scale_up,
|
||||
log_to_file=args['logdir'],
|
||||
profiling=args['profile'])
|
||||
profiling=args['profile'],
|
||||
fp16=args['fp16'])
|
||||
|
||||
# Must call MPI finalize when process exit without exceptions
|
||||
Communicator.finalize()
|
||||
|
|
|
@ -63,32 +63,46 @@ def create_image_mb_source(map_file, mean_file, train, total_number_of_samples):
|
|||
multithreaded_deserializer=True)
|
||||
|
||||
# Create network
|
||||
def create_resnet_network(network_name):
|
||||
def create_resnet_network(network_name, fp16):
|
||||
# Input variables denoting the features and label data
|
||||
input_var = C.input_variable((num_channels, image_height, image_width))
|
||||
label_var = C.input_variable((num_classes))
|
||||
|
||||
dtype = np.float16 if fp16 else np.float32
|
||||
if fp16:
|
||||
graph_input = C.cast(input_var, dtype=np.float16)
|
||||
graph_label = C.cast(label_var, dtype=np.float16)
|
||||
else:
|
||||
graph_input = input_var
|
||||
graph_label = label_var
|
||||
|
||||
with C.default_options(dtype=dtype):
|
||||
stride1x1 = (1, 1)
|
||||
stride3x3 = (2, 2)
|
||||
|
||||
# create model, and configure learning parameters
|
||||
if network_name == 'resnet18':
|
||||
z = create_imagenet_model_basic(input_var, [2, 1, 1, 2], num_classes)
|
||||
z = create_imagenet_model_basic(graph_input, [2, 1, 1, 2], num_classes)
|
||||
elif network_name == 'resnet34':
|
||||
z = create_imagenet_model_basic(input_var, [3, 3, 5, 2], num_classes)
|
||||
z = create_imagenet_model_basic(graph_input, [3, 3, 5, 2], num_classes)
|
||||
elif network_name == 'resnet50':
|
||||
z = create_imagenet_model_bottleneck(input_var, [2, 3, 5, 2], num_classes, stride1x1, stride3x3)
|
||||
z = create_imagenet_model_bottleneck(graph_input, [2, 3, 5, 2], num_classes, stride1x1, stride3x3)
|
||||
elif network_name == 'resnet101':
|
||||
z = create_imagenet_model_bottleneck(input_var, [2, 3, 22, 2], num_classes, stride1x1, stride3x3)
|
||||
z = create_imagenet_model_bottleneck(graph_input, [2, 3, 22, 2], num_classes, stride1x1, stride3x3)
|
||||
elif network_name == 'resnet152':
|
||||
z = create_imagenet_model_bottleneck(input_var, [2, 7, 35, 2], num_classes, stride1x1, stride3x3)
|
||||
z = create_imagenet_model_bottleneck(graph_input, [2, 7, 35, 2], num_classes, stride1x1, stride3x3)
|
||||
else:
|
||||
raise RuntimeError("Unknown model name!")
|
||||
|
||||
# loss and metric
|
||||
ce = cross_entropy_with_softmax(z, label_var)
|
||||
errs = classification_error(z, label_var, topN=1)
|
||||
top5Errs = classification_error(z, label_var, topN=5)
|
||||
ce = cross_entropy_with_softmax(z, graph_label)
|
||||
errs = classification_error(z, graph_label, topN=1)
|
||||
top5Errs = classification_error(z, graph_label, topN=5)
|
||||
|
||||
if fp16:
|
||||
ce = C.cast(ce, dtype=np.float32)
|
||||
errs = C.cast(errs, dtype=np.float32)
|
||||
top5Errs = C.cast(top5Errs, dtype=np.float32)
|
||||
|
||||
return {
|
||||
'name' : network_name,
|
||||
|
@ -156,7 +170,7 @@ def train_and_test(network, trainer, train_source, test_source, minibatch_size,
|
|||
|
||||
# Train and evaluate the network.
|
||||
def resnet_imagenet(train_data, test_data, mean_data, network_name, epoch_size, num_quantization_bits=32, block_size=None, warm_up=0,
|
||||
max_epochs=90, restore=True, log_to_file=None, num_mbs_per_log=100, gen_heartbeat=False, scale_up=False, profiling=False):
|
||||
max_epochs=90, restore=True, log_to_file=None, num_mbs_per_log=100, gen_heartbeat=False, scale_up=False, profiling=False, fp16=False):
|
||||
|
||||
set_computation_network_trace_level(0)
|
||||
|
||||
|
@ -174,7 +188,7 @@ def resnet_imagenet(train_data, test_data, mean_data, network_name, epoch_size,
|
|||
gen_heartbeat=gen_heartbeat,
|
||||
num_epochs=max_epochs)
|
||||
|
||||
network = create_resnet_network(network_name)
|
||||
network = create_resnet_network(network_name, fp16)
|
||||
trainer = create_trainer(network, minibatch_size, epoch_size, num_quantization_bits, block_size, warm_up, progress_printer)
|
||||
train_source = create_image_mb_source(train_data, mean_data, train=True, total_number_of_samples=max_epochs * epoch_size)
|
||||
test_source = create_image_mb_source(test_data, mean_data, train=False, total_number_of_samples=C.io.FULL_DATA_SWEEP)
|
||||
|
@ -198,6 +212,7 @@ if __name__=='__main__':
|
|||
parser.add_argument('-r', '--restart', help='Indicating whether to restart from scratch (instead of restart from checkpoint file by default)', action='store_true', default=False)
|
||||
parser.add_argument('-device', '--device', type=int, help="Force to run the script on a specified device", required=False, default=None)
|
||||
parser.add_argument('-profile', '--profile', help="Turn on profiling", action='store_true', default=False)
|
||||
parser.add_argument('-fp16', '--fp16', help="use float16", action='store_true', default=False)
|
||||
|
||||
args = vars(parser.parse_args())
|
||||
|
||||
|
@ -237,7 +252,8 @@ if __name__=='__main__':
|
|||
restore=not args['restart'],
|
||||
scale_up=scale_up,
|
||||
log_to_file=args['logdir'],
|
||||
profiling=args['profile'])
|
||||
profiling=args['profile'],
|
||||
fp16=args['fp16'])
|
||||
|
||||
# Must call MPI finalize when process exit without exceptions
|
||||
Communicator.finalize()
|
||||
|
|
|
@ -11,8 +11,6 @@
|
|||
<OutputPath>.</OutputPath>
|
||||
<ProjectTypeGuids>{888888a0-9f3d-457c-b088-3a5042f75d52}</ProjectTypeGuids>
|
||||
<LaunchProvider>Standard Python launcher</LaunchProvider>
|
||||
<InterpreterId>{4aa09c20-bd30-41b0-b691-0beca389326c}</InterpreterId>
|
||||
<InterpreterVersion>3.5</InterpreterVersion>
|
||||
<EnableNativeCodeDebugging>False</EnableNativeCodeDebugging>
|
||||
<IsWindowsApplication>False</IsWindowsApplication>
|
||||
</PropertyGroup>
|
||||
|
@ -20,7 +18,6 @@
|
|||
<PropertyGroup Condition="'$(Configuration)' == 'Release'" />
|
||||
<PropertyGroup>
|
||||
<VisualStudioVersion Condition=" '$(VisualStudioVersion)' == '' ">10.0</VisualStudioVersion>
|
||||
<PtvsTargetsFile>$(MSBuildExtensionsPath32)\Microsoft\VisualStudio\v$(VisualStudioVersion)\Python Tools\Microsoft.PythonTools.targets</PtvsTargetsFile>
|
||||
</PropertyGroup>
|
||||
<ItemGroup>
|
||||
<Compile Include="..\bindings\python\cntk\ops\tests\block_test.py">
|
||||
|
@ -280,6 +277,9 @@
|
|||
<ItemGroup>
|
||||
<Folder Include="1stSteps\" />
|
||||
<Folder Include="common\" />
|
||||
<Folder Include="dToEndTests\" />
|
||||
<Folder Include="dToEndTests\cntkv2Python\" />
|
||||
<Folder Include="dToEndTests\cntkv2Python\Examples\" />
|
||||
<Folder Include="Image\" />
|
||||
<Folder Include="Image\Classification\" />
|
||||
<Folder Include="Image\Classification\ConvNet\" />
|
||||
|
@ -343,10 +343,6 @@
|
|||
<Folder Include="ToEndTests\cntkv2Python\" />
|
||||
<Folder Include="ToEndTests\cntkv2Python\Examples\" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<InterpreterReference Include="{4aa09c20-bd30-41b0-b691-0beca389326c}\3.5" />
|
||||
<InterpreterReference Include="{c2abc240-2da9-4409-8b09-0ee30a5b56e2}\3.4" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<Content Include="Text\CharacterLM\data\tinyshakespeare.txt" />
|
||||
<Content Include="Text\CharacterLM\README.md" />
|
||||
|
@ -355,6 +351,5 @@
|
|||
<Content Include="Text\WordLMWithSampledSoftmax\test\text.txt" />
|
||||
<Content Include="Text\WordLMWithSampledSoftmax\test\token2id.txt" />
|
||||
</ItemGroup>
|
||||
<Import Project="$(PtvsTargetsFile)" Condition="Exists($(PtvsTargetsFile))" />
|
||||
<Import Project="$(MSBuildToolsPath)\Microsoft.Common.targets" Condition="!Exists($(PtvsTargetsFile))" />
|
||||
<Import Project="$(MSBuildExtensionsPath32)\Microsoft\VisualStudio\v$(VisualStudioVersion)\Python Tools\Microsoft.PythonTools.targets" />
|
||||
</Project>
|
Makefile (14 changed lines)
|
@ -90,7 +90,7 @@ SOURCEDIR:= Source
|
|||
INCLUDEPATH:= $(addprefix $(SOURCEDIR)/, Common/Include CNTKv2LibraryDll CNTKv2LibraryDll/API CNTKv2LibraryDll/proto ../Examples/Extensibility/CPP Math CNTK ActionsLib ComputationNetworkLib SGDLib SequenceTrainingLib CNTK/BrainScript Readers/ReaderLib PerformanceProfilerDll)
|
||||
INCLUDEPATH+=$(PROTOBUF_PATH)/include
|
||||
# COMMON_FLAGS include settings that are passed both to NVCC and C++ compilers.
|
||||
COMMON_FLAGS:= -DHAS_MPI=$(HAS_MPI) -D_POSIX_SOURCE -D_XOPEN_SOURCE=600 -D__USE_XOPEN2K -std=c++11
|
||||
COMMON_FLAGS:= -DHAS_MPI=$(HAS_MPI) -D_POSIX_SOURCE -D_XOPEN_SOURCE=600 -D__USE_XOPEN2K -std=c++11 -DCUDA_NO_HALF -D__CUDA_NO_HALF_OPERATORS__
|
||||
CPPFLAGS:=
|
||||
CXXFLAGS:= $(SSE_FLAGS) -std=c++0x -fopenmp -fpermissive -fPIC -Werror -fcheck-new
|
||||
LIBPATH:=
|
||||
|
@ -168,10 +168,10 @@ endif
|
|||
|
||||
ifeq ("$(MATHLIB)","mkl")
|
||||
INCLUDEPATH += $(MKL_PATH)/include
|
||||
LIBS_LIST += m iomp5 pthread mklml_intel
|
||||
LIBS_LIST += m iomp5 pthread mklml_intel mkldnn
|
||||
MKL_LIB_PATH := $(MKL_PATH)/lib
|
||||
LIBPATH += $(MKL_LIB_PATH)
|
||||
COMMON_FLAGS += -DUSE_MKL
|
||||
COMMON_FLAGS += -DUSE_MKL -DUSE_MKLDNN
|
||||
endif
|
||||
|
||||
ifeq ($(CUDA_GDR),1)
|
||||
|
@ -212,6 +212,7 @@ GENCODE_SM50 := -gencode arch=compute_50,code=\"sm_50,compute_50\"
|
|||
GENCODE_SM52 := -gencode arch=compute_52,code=\"sm_52,compute_52\"
|
||||
GENCODE_SM60 := -gencode arch=compute_60,code=\"sm_60,compute_60\"
|
||||
GENCODE_SM61 := -gencode arch=compute_61,code=\"sm_61,compute_61\"
|
||||
GENCODE_SM70 := -gencode arch=compute_70,code=\"sm_70,compute_70\"
|
||||
|
||||
# Should we relocate *.gcno and *.gcda files using -fprofile-dir option?
|
||||
# Use GCOV_PREFIX and GCOV_PREFIX_STRIP if relocating:
|
||||
|
@ -241,7 +242,7 @@ ifeq ("$(BUILDTYPE)","release")
|
|||
ifdef CNTK_CUDA_CODEGEN_RELEASE
|
||||
GENCODE_FLAGS := $(CNTK_CUDA_CODEGEN_RELEASE)
|
||||
else
|
||||
GENCODE_FLAGS := $(GENCODE_SM30) $(GENCODE_SM35) $(GENCODE_SM50) $(GENCODE_SM60) $(GENCODE_SM61)
|
||||
GENCODE_FLAGS := $(GENCODE_SM30) $(GENCODE_SM35) $(GENCODE_SM50) $(GENCODE_SM60) $(GENCODE_SM61) $(GENCODE_SM70)
|
||||
endif
|
||||
|
||||
CXXFLAGS += -g -O4
|
||||
|
@ -365,6 +366,10 @@ MATH_SRC =\
|
|||
$(SOURCEDIR)/Math/CUDAPageLockedMemAllocator.cpp \
|
||||
$(SOURCEDIR)/Math/CPUMatrixFloat.cpp \
|
||||
$(SOURCEDIR)/Math/CPUMatrixDouble.cpp \
|
||||
$(SOURCEDIR)/Math/CPUMatrixHalf.cpp \
|
||||
$(SOURCEDIR)/Math/CPUMatrixTensorFloat.cpp \
|
||||
$(SOURCEDIR)/Math/CPUMatrixTensorDouble.cpp \
|
||||
$(SOURCEDIR)/Math/CPUMatrixTensorHalf.cpp \
|
||||
$(SOURCEDIR)/Math/CPURNGHandle.cpp \
|
||||
$(SOURCEDIR)/Math/CPUSparseMatrix.cpp \
|
||||
$(SOURCEDIR)/Math/ConvolutionEngine.cpp \
|
||||
|
@ -1322,6 +1327,7 @@ UNITTEST_MATH_SRC = \
|
|||
$(SOURCEDIR)/../Tests/UnitTests/MathTests/MatrixTests.cpp \
|
||||
$(SOURCEDIR)/../Tests/UnitTests/MathTests/MatrixLearnerTests.cpp \
|
||||
$(SOURCEDIR)/../Tests/UnitTests/MathTests/stdafx.cpp \
|
||||
$(SOURCEDIR)/../Tests/UnitTests/MathTests/HalfGPUTests.cpp \
|
||||
|
||||
UNITTEST_MATH_SRC += $(CNTK_COMMON_SRC)
|
||||
UNITTEST_MATH_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(UNITTEST_MATH_SRC))
|
||||
|
|
README.md (26 changed lines)
|
@ -2,6 +2,32 @@
|
|||
|
||||
## Latest news
|
||||
|
||||
***2018-01-22.*** CNTK support for CUDA 9
|
||||
|
||||
CNTK now supports CUDA 9/cuDNN 7. This requires an update of the build environment to Ubuntu 16/GCC 5 for Linux, and Visual Studio 2017/VCTools 14.11 for Windows. With CUDA 9, CNTK also added a preview for 16-bit floating point (a.k.a. FP16) computation.
|
||||
|
||||
Please check out the example of FP16 in ResNet50 [here](./Examples/Image/Classification/ResNet/Python/TrainResNet_ImageNet_Distributed.py)
|
||||
|
||||
Notes on FP16 preview:
|
||||
* FP16 implementation on CPU is not optimized, and it's not supposed to be used in CPU inference directly. User needs to convert the model to 32-bit floating point before running on CPU.
|
||||
* Loss/Criterion for FP16 training needs to be 32bit for accumulation without overflow, using cast function. Please check the example above.
|
||||
* Readers do not have FP16 output unless using numpy to feed data, cast from FP32 to FP16 is needed. Please check the example above.
|
||||
* FP16 gradient aggregation is currently only implemented on GPU using NCCL2. Distributed training with FP16 with MPI is not supported.
|
||||
* FP16 math is a subset of current FP32 implementation. Some model may get Feature Not Implemented exception using FP16.
|
||||
* FP16 is currently not supported in BrainScript. Please use Python for FP16.
|
||||
|
||||
To setup build and runtime environment on Windows:
|
||||
* Install [Visual Studio 2017](https://www.visualstudio.com/downloads/) with following workloads and components. From command line (use Community version installer as example):
|
||||
vs_community.exe --add Microsoft.VisualStudio.Workload.NativeDesktop --add Microsoft.VisualStudio.Workload.ManagedDesktop --add Microsoft.VisualStudio.Workload.Universal --add Microsoft.Component.PythonTools --add Microsoft.VisualStudio.Component.VC.Tools.14.11
|
||||
* Install [NVidia CUDA 9](https://developer.nvidia.com/cuda-90-download-archive?target_os=Windows&target_arch=x86_64)
|
||||
* From PowerShell, run:
|
||||
[DevInstall.ps1](./Tools/devInstall/Windows/DevInstall.ps1)
|
||||
* Start VCTools 14.11 command line, run:
|
||||
cmd /k "%VS2017INSTALLDIR%\VC\Auxiliary\Build\vcvarsall.bat" x64 --vcvars_ver=14.11
|
||||
* Open [CNTK.sln](./CNTK.sln) from the VCTools 14.11 command line. Note that opening CNTK.sln from anything other than the VCTools 14.11 command line causes a CUDA 9 [build error](https://developercommunity.visualstudio.com/content/problem/163758/vs-2017-155-doesnt-support-cuda-9.html).
|
||||
|
||||
To set up the build and runtime environment on Linux using docker, please build an Ubuntu 16.04 docker image using the Dockerfiles [here](./Tools/docker). For other Linux systems, please refer to the Dockerfiles to set up the dependent libraries for CNTK.
|
||||
|
||||
***2017-12-05.* CNTK 2.3.1**
|
||||
Release of Cognitive Toolkit v.2.3.1.
|
||||
|
||||
|
|
|
@ -1 +1 @@
|
|||
Subproject commit 70d0c096a11e3b370d1b529f098f9711a7966090
|
||||
Subproject commit fb72ecb8c4d4e1d2e59b200ad1092331a372edea
|
|
@ -1,5 +1,5 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|x64">
|
||||
<Configuration>Debug</Configuration>
|
||||
|
|
|
@ -291,7 +291,7 @@ public:
|
|||
|
||||
// map all to lowercase
|
||||
std::wstring lvalue = std::wstring(value.begin(), value.end());
|
||||
std::transform(lvalue.begin(), lvalue.end(), lvalue.begin(), ::tolower); // note: may crash for chars >127. Don't use those.
|
||||
std::transform(lvalue.begin(), lvalue.end(), lvalue.begin(), [](wchar_t c) { return (wchar_t)::tolower(c); }); // note: may crash for chars >127. Don't use those.
|
||||
|
||||
// add to the respective node group
|
||||
m_net->AddToNodeGroup(lvalue, compNode);
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|x64">
|
||||
<Configuration>Debug</Configuration>
|
||||
|
@ -33,13 +33,11 @@
|
|||
<PropertyGroup Condition="$(DebugBuild)" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>true</UseDebugLibraries>
|
||||
<PlatformToolset>v140</PlatformToolset>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="$(ReleaseBuild)" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>false</UseDebugLibraries>
|
||||
<PlatformToolset>v140</PlatformToolset>
|
||||
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
|
|
|
@ -31,6 +31,7 @@
|
|||
#endif
|
||||
|
||||
#include "CNTKLibraryInternals.h"
|
||||
#include "HalfConverter.hpp"
|
||||
|
||||
// undef max in the rest of the file to avoid conflicts with the max macro defined in windows.h.
|
||||
#pragma push_macro("max")
|
||||
|
@ -38,6 +39,43 @@
|
|||
|
||||
namespace CNTK
|
||||
{
|
||||
class float16
|
||||
{
|
||||
protected:
|
||||
unsigned short __x;
|
||||
|
||||
public:
|
||||
float16() = default;
|
||||
float16(const float16& other) { __x = other.__x; }
|
||||
|
||||
#ifndef SWIG
|
||||
// construction from built-in types
|
||||
float16(float f) { floatToFloat16(&f, &__x); }
|
||||
float16(double d) : float16((float)d) {}
|
||||
float16(int i) : float16((float)i) {}
|
||||
float16(size_t u) : float16((float)u) {}
|
||||
|
||||
// cast to built-in types
|
||||
operator float() const { float f; float16ToFloat(&__x, &f); return f; }
|
||||
|
||||
// compare functions
|
||||
inline bool operator==(const float16& rhs) const { return (__x == rhs.__x); }
|
||||
inline bool operator!=(const float16& rhs) const { return (__x != rhs.__x); }
|
||||
#endif
|
||||
|
||||
static float16 create(float f)
|
||||
{
|
||||
float16 v;
|
||||
floatToFloat16(&f, &v.__x);
|
||||
return v;
|
||||
}
|
||||
|
||||
static float16 create(double d)
|
||||
{
|
||||
return create((float)d);
|
||||
}
|
||||
};
|
||||
|
||||
///
|
||||
/// Enumeration type denoting data type of symbolic data entities or actual data.
|
||||
///
|
||||
|
@ -47,6 +85,7 @@ namespace CNTK
|
|||
Float = 1,
|
||||
Double = 2,
|
||||
UChar = 3, // So far only used internally in deserializers.
|
||||
Float16 = 4,
|
||||
|
||||
/* TODO:
|
||||
Bit,
|
||||
|
@ -74,6 +113,8 @@ namespace CNTK
|
|||
return DataType::Float;
|
||||
else if (std::is_same<ElementType, double>())
|
||||
return DataType::Double;
|
||||
else if (std::is_same<ElementType, float16>())
|
||||
return DataType::Float16;
|
||||
else
|
||||
NOT_IMPLEMENTED;
|
||||
}
|
||||
|
@ -84,6 +125,8 @@ namespace CNTK
|
|||
return "Float";
|
||||
else if (dataType == DataType::Double)
|
||||
return "Double";
|
||||
else if (dataType == DataType::Float16)
|
||||
return "Float16";
|
||||
else
|
||||
LogicError("Unknown DataType.");
|
||||
}
|
||||
|
@ -94,6 +137,8 @@ namespace CNTK
|
|||
return sizeof(float);
|
||||
else if (dataType == DataType::Double)
|
||||
return sizeof(double);
|
||||
else if (dataType == DataType::Float16)
|
||||
return sizeof(float16);
|
||||
else
|
||||
LogicError("Unknown DataType.");
|
||||
}
|
||||
|
@ -621,8 +666,7 @@ namespace CNTK
|
|||
/// Construct a NDArrayView with newly allocated sparse storage in SparseCSC format on the specified 'device' and initialize its contents
|
||||
/// with the specified Sparse CSC format data.
|
||||
///
|
||||
template <typename ElementType>
|
||||
CNTK_API NDArrayView(const NDShape& viewShape, const SparseIndexType* colStarts, const SparseIndexType* rowIndices, const ElementType* nonZeroValues, size_t numNonZeroValues, const DeviceDescriptor& device, bool readOnly = false);
|
||||
CNTK_API NDArrayView(::CNTK::DataType dataType, const NDShape& viewShape, const SparseIndexType* colStarts, const SparseIndexType* rowIndices, const void* nonZeroValues, size_t numNonZeroValues, const DeviceDescriptor& device, bool readOnly = false);
|
||||
|
||||
///
|
||||
/// Construct a NDArrayView over newly allocated storage in the specified format on the specified 'device'.
|
||||
|
@ -707,6 +751,9 @@ namespace CNTK
|
|||
case DataType::Double:
|
||||
SetValue(value);
|
||||
break;
|
||||
case DataType::Float16:
|
||||
SetValue(float16::create(value));
|
||||
break;
|
||||
default:
|
||||
LogicError("Unsupported DataType %s.", DataTypeName(m_dataType));
|
||||
break;
|
||||
|
@ -801,6 +848,11 @@ namespace CNTK
|
|||
///
|
||||
CNTK_API void SetValue(double value);
|
||||
|
||||
///
|
||||
/// Fill 'this' NDArrayView with the specified value. The underlying DataType of 'this' view should be DataType::Double.
|
||||
///
|
||||
CNTK_API void SetValue(float16 value);
|
||||
|
||||
///
|
||||
/// Creates a new NDArrayView with newly allocated storage on the specified device and copies 'this' view's contents into the newly allocated view.
|
||||
///
|
||||
|
@ -876,6 +928,25 @@ namespace CNTK
|
|||
// Disallow copy and move construction and assignment
|
||||
NDArrayView(const NDArrayView&) = delete; NDArrayView& operator=(const NDArrayView&) = delete; NDArrayView& operator=(NDArrayView&&) = delete; NDArrayView(NDArrayView&& other) = delete;
|
||||
|
||||
// template functions connecting V1ElemType and ElementType
|
||||
template <typename ElementType, typename V1ElemType>
|
||||
const ElementType* _DataBuffer() const;
|
||||
|
||||
template <typename ElementType, typename V1ElemType>
|
||||
std::tuple<const ElementType *, const SparseIndexType*, const SparseIndexType*, size_t> _SparseCSCDataBuffers() const;
|
||||
|
||||
template <typename ElementType, typename V1ElemType>
|
||||
std::tuple<const void*, const SparseIndexType*, const SparseIndexType*, size_t, size_t, size_t> _SparseBlockColumnDataBuffers() const;
|
||||
|
||||
template <typename ElementType, typename V1ElemType>
|
||||
static NDArrayViewPtr _RandomNormal(const NDShape& shape, double mean, double stdDev, unsigned long seed, const DeviceDescriptor& device);
|
||||
|
||||
template <typename ElementType, typename V1ElemType>
|
||||
static NDArrayViewPtr _RandomUniform(const NDShape& shape, double rangeStart, double rangeEnd, unsigned long seed, const DeviceDescriptor& device);
|
||||
|
||||
template<typename ElementType, typename V1ElemType>
|
||||
ElementType _AsScalar() const;
|
||||
|
||||
private:
|
||||
static const size_t AutoSelectRowColSplitPoint = SIZE_MAX;
|
||||
|
||||
|
@ -891,6 +962,10 @@ namespace CNTK
|
|||
template <typename ElementType>
|
||||
std::shared_ptr<Microsoft::MSR::CNTK::Matrix<ElementType>> GetWritableMatrix(size_t rowColSplitPoint = AutoSelectRowColSplitPoint);
|
||||
|
||||
std::shared_ptr<const Microsoft::MSR::CNTK::MatrixBase> GetMatrixBase(size_t rowColSplitPoint = AutoSelectRowColSplitPoint) const;
|
||||
|
||||
std::shared_ptr<Microsoft::MSR::CNTK::MatrixBase> GetWritableMatrixBase(size_t rowColSplitPoint = AutoSelectRowColSplitPoint);
|
||||
|
||||
template <typename ElementType>
|
||||
const Microsoft::MSR::CNTK::TensorView<ElementType>* GetTensorView() const;
|
||||
|
||||
|
@ -2836,6 +2911,10 @@ namespace CNTK
|
|||
{
|
||||
CopyVariableValueToVector<double>(outputVariable, sequences);
|
||||
}
|
||||
else if (dataType == DataType::Float16)
|
||||
{
|
||||
CopyVariableValueToVector<float16>(outputVariable, sequences);
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
|
@ -6385,6 +6464,16 @@ namespace CNTK
|
|||
///
|
||||
CNTK_API FunctionPtr Crop(const Variable& nodeInput, const Variable& nodeReferent, const Variable& ancestorInput, const Variable& ancestorReferent, const std::wstring& name = L"");
|
||||
|
||||
///
|
||||
/// Creates an instance of crop node with automatically computed crop offsets and specified ancestor nodes.
|
||||
/// This is used in cases when input nodes do not have common ancestor in the network.
|
||||
/// nodeInput: input node to be cropped.
|
||||
/// nodeReferent: input node which determines the spatial size of output.
|
||||
/// ancestorInput: ancestor of nodeInput.
|
||||
/// ancestorReferent: ancestor of nodeReferent which is treated as equal to ancestorInput for the purpose of computing crop offsets.
|
||||
///
|
||||
CNTK_API FunctionPtr Cast(const Variable& nodeInput, DataType outputType, const std::wstring& name = L"");
|
||||
|
||||
#endif // !CNTK_HEADERONLY_DEFINITIONS
|
||||
}
|
||||
|
||||
|
|
|
@ -41,6 +41,8 @@
|
|||
|
||||
// Forward declarations
|
||||
namespace Microsoft { namespace MSR { namespace CNTK {
|
||||
struct MatrixBase;
|
||||
|
||||
template <typename ElemType>
|
||||
class Matrix;
|
||||
|
||||
|
|
|
@ -0,0 +1,94 @@
|
|||
//This code is shared between V1 and V2
|
||||
|
||||
#pragma once
|
||||
|
||||
namespace CNTK {
|
||||
|
||||
// Host functions for converting between FP32 and FP16 formats
|
||||
// TODO: use f16c instructions if available
|
||||
inline void float16ToFloat(const unsigned short* src, float* res)
|
||||
{
|
||||
unsigned h = *src;
|
||||
unsigned sign = ((h >> 15) & 1);
|
||||
unsigned exponent = ((h >> 10) & 0x1f);
|
||||
unsigned mantissa = ((h & 0x3ff) << 13);
|
||||
|
||||
if (exponent == 0x1f) { /* NaN or Inf */
|
||||
mantissa = (mantissa ? (sign = 0, 0x7fffff) : 0);
|
||||
exponent = 0xff;
|
||||
}
|
||||
else if (!exponent) { /* Denorm or Zero */
|
||||
if (mantissa) {
|
||||
unsigned int msb;
|
||||
exponent = 0x71;
|
||||
do {
|
||||
msb = (mantissa & 0x400000);
|
||||
mantissa <<= 1; /* normalize */
|
||||
--exponent;
|
||||
} while (!msb);
|
||||
mantissa &= 0x7fffff; /* 1.mantissa is implicit */
|
||||
}
|
||||
}
|
||||
else {
|
||||
exponent += 0x70;
|
||||
}
|
||||
|
||||
*(unsigned*)res = ((sign << 31) | (exponent << 23) | mantissa);
|
||||
}
|
||||
|
||||
inline void floatToFloat16(float* src, unsigned short* dest)
|
||||
{
|
||||
unsigned x = *(unsigned*)src;
|
||||
unsigned u = (x & 0x7fffffff), remainder, shift, lsb, lsb_s1, lsb_m1;
|
||||
unsigned short sign;
|
||||
unsigned exponent, mantissa;
|
||||
|
||||
// Get rid of +NaN/-NaN case first.
|
||||
if (u > 0x7f800000) {
|
||||
*dest = 0x7fffU;
|
||||
return;
|
||||
}
|
||||
|
||||
sign = (unsigned short)((x >> 16) & 0x8000);
|
||||
|
||||
// Get rid of +Inf/-Inf, +0/-0.
|
||||
if (u > 0x477fefff) {
|
||||
*dest = sign | 0x7c00U;
|
||||
return;
|
||||
}
|
||||
if (u < 0x33000001) {
|
||||
*dest = (sign | 0x0000);
|
||||
return;
|
||||
}
|
||||
|
||||
exponent = ((u >> 23) & 0xff);
|
||||
mantissa = (u & 0x7fffff);
|
||||
|
||||
if (exponent > 0x70) {
|
||||
shift = 13;
|
||||
exponent -= 0x70;
|
||||
}
|
||||
else {
|
||||
shift = 0x7e - exponent;
|
||||
exponent = 0;
|
||||
mantissa |= 0x800000;
|
||||
}
|
||||
lsb = (1 << shift);
|
||||
lsb_s1 = (lsb >> 1);
|
||||
lsb_m1 = (lsb - 1);
|
||||
|
||||
// Round to nearest even.
|
||||
remainder = (mantissa & lsb_m1);
|
||||
mantissa >>= shift;
|
||||
if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) {
|
||||
++mantissa;
|
||||
if (!(mantissa & 0x3ff)) {
|
||||
++exponent;
|
||||
mantissa = 0;
|
||||
}
|
||||
}
|
||||
|
||||
*dest = (sign | (unsigned short)((exponent << 10) | mantissa));
|
||||
}
|
||||
|
||||
}
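As a side note, a quick cross-check of the FP32/FP16 converters above is possible with numpy, whose float16 type follows the same IEEE-754 half-precision format. The sketch below is a hypothetical verification aid, not part of this commit; it only exercises values that are exactly representable in half precision.

```python
# Hypothetical cross-check (not part of this commit): numpy's float16 uses the same IEEE-754
# half-precision layout that floatToFloat16/float16ToFloat implement above.
import numpy as np

def float_to_half_bits(f):
    # Round an FP32 value to FP16 and view its raw 16-bit pattern.
    return int(np.array([f], dtype=np.float32).astype(np.float16).view(np.uint16)[0])

def half_bits_to_float(bits):
    # Reinterpret a 16-bit pattern as FP16 and widen it back to FP32.
    return float(np.array([bits], dtype=np.uint16).view(np.float16).astype(np.float32)[0])

# These values are exactly representable in half precision, so the round trip is lossless.
for value in [0.0, 0.5, 1.0, -2.5, 65504.0]:   # 65504 is the largest finite half value
    assert half_bits_to_float(float_to_half_bits(value)) == value
```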
|
|
@ -1,5 +1,5 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" InitialTargets="CheckDependencies" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<Project DefaultTargets="Build" InitialTargets="CheckDependencies" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|x64">
|
||||
<Configuration>Debug</Configuration>
|
||||
|
@ -40,7 +40,6 @@
|
|||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
|
||||
<PropertyGroup Label="Configuration">
|
||||
<ConfigurationType>DynamicLibrary</ConfigurationType>
|
||||
<PlatformToolset>v140</PlatformToolset>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="$(DebugBuild)" Label="Configuration">
|
||||
|
@ -161,6 +160,7 @@
|
|||
<ClInclude Include="API\CNTKLibraryC.h" />
|
||||
<ClInclude Include="API\CNTKLibraryExperimental.h" />
|
||||
<ClInclude Include="API\CNTKLibraryInternals.h" />
|
||||
<ClInclude Include="API\HalfConverter.hpp" />
|
||||
<ClInclude Include="BackCompat.h" />
|
||||
<ClInclude Include="BlockFunction.h" />
|
||||
<ClInclude Include="CompositeFunction.h" />
|
||||
|
|
|
@ -177,6 +177,9 @@
|
|||
<ClInclude Include="proto\onnx\core\utils.h">
|
||||
<Filter>proto\onnx\core</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="API\HalfConverter.hpp">
|
||||
<Filter>API</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="proto\onnx\core\record.h">
|
||||
<Filter>proto\onnx\core</Filter>
|
||||
</ClInclude>
|
||||
|
|
|
@ -966,7 +966,7 @@ namespace CNTK
|
|||
#endif
|
||||
#ifdef _CUDA_PATH_
|
||||
int cudaVersion = 0;
|
||||
if (cudaDriverGetVersion(&cudaVersion) == cudaSuccess)
|
||||
if (cudaRuntimeGetVersion(&cudaVersion) == cudaSuccess)
|
||||
{
|
||||
int major = 0, minor = 0, patchLevel = 0;
|
||||
ExtractCUDAVersion(cudaVersion, major, minor, patchLevel);
|
||||
|
|
|
@ -407,6 +407,36 @@ namespace CNTK
|
|||
}
|
||||
}
|
||||
|
||||
template <typename ElementType>
|
||||
/*static*/ Microsoft::MSR::CNTK::ComputationNodeBasePtr CompositeFunction::CreateLearnableParameterFromVariable(const Variable& variable, Microsoft::MSR::CNTK::ComputationNetworkBuilder<ElementType>& builder, const NDShape& shape, const std::wstring& name)
|
||||
{
|
||||
switch (variable.GetDataType())
|
||||
{
|
||||
case DataType::Float:
|
||||
return builder.template TypedCreateLearnableParameter<float>(name, AsTensorShape(shape));
|
||||
case DataType::Double:
|
||||
return builder.template TypedCreateLearnableParameter<double>(name, AsTensorShape(shape));
|
||||
case DataType::Float16:
|
||||
return builder.template TypedCreateLearnableParameter<half>(name, AsTensorShape(shape));
|
||||
default:
|
||||
return builder.CreateLearnableParameter(name, AsTensorShape(shape));
|
||||
}
|
||||
}
|
||||
|
||||
/*static*/ void CompositeFunction::CastAssignNodeValue(ComputationNodeBasePtr node, DataType dataType, std::shared_ptr<const MatrixBase> matrix)
|
||||
{
|
||||
switch (dataType)
|
||||
{
|
||||
case DataType::Float:
|
||||
return (dynamic_cast<ComputationNode<float>*>(&*node))->Value().CastAssignValuesOf(*matrix);
|
||||
case DataType::Double:
|
||||
return (dynamic_cast<ComputationNode<double>*>(&*node))->Value().CastAssignValuesOf(*matrix);
|
||||
case DataType::Float16:
|
||||
return (dynamic_cast<ComputationNode<half>*>(&*node))->Value().CastAssignValuesOf(*matrix);
|
||||
default:
|
||||
LogicError("Unsupported data type");
|
||||
}
|
||||
}
|
||||
|
||||
// Recursively create a sub-network of ComputationNode instances corresponding to the graph of Functions
|
||||
// underlying the specified 'variable' and return the ComputationNode instance that corresponds to the
|
||||
|
@ -441,33 +471,77 @@ namespace CNTK
|
|||
// Lets add a null entry in the map for this variable, to break infinite recursion when processing recurrent graphs
|
||||
variableToNodeMap[variable] = nullptr;
|
||||
|
||||
std::shared_ptr<ComputationNode<ElementType>> computationNodePtr;
|
||||
std::shared_ptr<ComputationNodeBase> computationNodePtr;
|
||||
auto internalNodeName = CNTKInternalNodeNameFromUidAndName(variable.Uid(), variable.Name(), useMangledNamesForComputationNodes);
|
||||
if (variable.IsParameter() || variable.IsConstant())
|
||||
{
|
||||
if (variable.Shape().HasInferredDimension())
|
||||
InvalidArgument("Parameter or Constant '%S' with unresolved shape %S found when compiling the Function graph.", variable.AsString().c_str(), variable.Shape().AsString().c_str());
|
||||
|
||||
computationNodePtr = builder.CreateLearnableParameter(internalNodeName, AsTensorShape(variable.Shape()));
|
||||
computationNodePtr = CreateLearnableParameterFromVariable(variable, builder, variable.Shape(), internalNodeName);
|
||||
network->InitLearnableParameters(computationNodePtr, L"fixedValue", 0); // must call this to follow protocol; can overwrite later
|
||||
if (!variable.NeedsGradient() || (inputsToExcludeGradientsFor.find(variable) != inputsToExcludeGradientsFor.end()))
|
||||
computationNodePtr->SetLearningRateMultiplier(0.0);
|
||||
|
||||
NDArrayViewPtr value = variable.IsConstant() ? Constant(variable).Value() : Parameter(variable).Value();
|
||||
std::shared_ptr<const Matrix<ElementType>> valueMatrix = variable.IsConstant() ? value->GetMatrix<ElementType>() : value->GetWritableMatrix<ElementType>();
|
||||
std::shared_ptr<const MatrixBase> valueMatrix = variable.IsConstant() ? value->GetMatrixBase() : value->GetWritableMatrixBase();
|
||||
|
||||
if (variable.IsParameter() || (valueMatrix->GetDeviceId() == network->GetDeviceId()))
|
||||
computationNodePtr->Value() = valueMatrix->AsReference();
|
||||
{
|
||||
// shallow copy from parameter value to computation node value to link them together
|
||||
switch (variable.GetDataType())
|
||||
{
|
||||
case DataType::Float:
|
||||
std::dynamic_pointer_cast<ComputationNode<float>>(computationNodePtr)->Value() = std::dynamic_pointer_cast<const Matrix<float>>(valueMatrix)->AsReference();
|
||||
break;
|
||||
case DataType::Double:
|
||||
std::dynamic_pointer_cast<ComputationNode<double>>(computationNodePtr)->Value() = std::dynamic_pointer_cast<const Matrix<double>>(valueMatrix)->AsReference();
|
||||
break;
|
||||
case DataType::Float16:
|
||||
std::dynamic_pointer_cast<ComputationNode<half>>(computationNodePtr)->Value() = std::dynamic_pointer_cast<const Matrix<half>>(valueMatrix)->AsReference();
|
||||
break;
|
||||
default:
|
||||
LogicError("Unsupported data type");
|
||||
}
|
||||
}
|
||||
else // Constant: if initialized data lives on wrong device, make a copy to the right one (copy is OK since it's constant)
|
||||
{
|
||||
// TODO: the following two lines are a workaround for a bug in the Math library
|
||||
// (AssignValuesOf throws when source and destination matrices reside on different GPU devices).
|
||||
// Once this bug is fixed, change to
|
||||
// Matrix<ElementType> clonedMatrix(valueMatrix->GetNumRows(), valueMatrix->GetNumCols(), network->GetDeviceId(), valueMatrix->GetMatrixType(), valueMatrix->GetFormat());
|
||||
Matrix<ElementType> clonedMatrix(network->GetDeviceId());
|
||||
clonedMatrix.SwitchToMatrixType(valueMatrix->GetMatrixType(), valueMatrix->GetFormat(), false);
|
||||
clonedMatrix.AssignValuesOf(*valueMatrix);
|
||||
computationNodePtr->Value() = std::move(clonedMatrix);
|
||||
switch (variable.GetDataType())
|
||||
{
|
||||
case DataType::Float:
|
||||
{
|
||||
Matrix<float>& nodeValue = dynamic_cast<ComputationNode<float>*>(&*computationNodePtr)->Value();
|
||||
Matrix<float> clonedMatrix(nodeValue.GetNumRows(), nodeValue.GetNumCols(), valueMatrix->GetDeviceId(), nodeValue.GetMatrixType(), nodeValue.GetFormat());
|
||||
clonedMatrix.CastAssignValuesOf(*valueMatrix);
|
||||
clonedMatrix.TransferToDeviceIfNotThere(network->GetDeviceId(), true);
|
||||
nodeValue = std::move(clonedMatrix);
|
||||
break;
|
||||
}
|
||||
case DataType::Double:
|
||||
{
|
||||
Matrix<double>& nodeValue = dynamic_cast<ComputationNode<double>*>(&*computationNodePtr)->Value();
|
||||
Matrix<double> clonedMatrix(nodeValue.GetNumRows(), nodeValue.GetNumCols(), valueMatrix->GetDeviceId(), nodeValue.GetMatrixType(), nodeValue.GetFormat());
|
||||
clonedMatrix.CastAssignValuesOf(*valueMatrix);
|
||||
clonedMatrix.TransferToDeviceIfNotThere(network->GetDeviceId(), true);
|
||||
nodeValue = std::move(clonedMatrix);
|
||||
break;
|
||||
}
|
||||
case DataType::Float16:
|
||||
{
|
||||
Matrix<half>& nodeValue = dynamic_cast<ComputationNode<half>*>(&*computationNodePtr)->Value();
|
||||
Matrix<half> clonedMatrix(nodeValue.GetNumRows(), nodeValue.GetNumCols(), valueMatrix->GetDeviceId(), nodeValue.GetMatrixType(), nodeValue.GetFormat());
|
||||
clonedMatrix.CastAssignValuesOf(*valueMatrix);
|
||||
clonedMatrix.TransferToDeviceIfNotThere(network->GetDeviceId(), true);
|
||||
nodeValue = std::move(clonedMatrix);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
LogicError("Unsupported data type");
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (variable.IsInput())
|
||||
|
@ -515,7 +589,7 @@ namespace CNTK
|
|||
}
|
||||
else
|
||||
{
|
||||
computationNodePtr = builder.CreateLearnableParameter(internalNodeName, AsTensorShape(fullyDefinedArgumentVar.Shape()));
|
||||
computationNodePtr = CreateLearnableParameterFromVariable(variable, builder, fullyDefinedArgumentVar.Shape(), internalNodeName);
|
||||
network->InitLearnableParameters(computationNodePtr, L"fixedValue", 0); // must call this to follow protocol; can overwrite later
|
||||
if (!variable.NeedsGradient() || (inputsToExcludeGradientsFor.find(variable) != inputsToExcludeGradientsFor.end()))
|
||||
computationNodePtr->SetLearningRateMultiplier(0.0);
|
||||
|
@ -538,7 +612,7 @@ namespace CNTK
|
|||
// Can be null in case of loops with f.output == f.input.
|
||||
// Such loops cannot be handled, so we leave nullptr as computational node.
|
||||
if (outputVariableNode)
|
||||
computationNodePtr = outputVariableNode->template As<ComputationNode<ElementType>>()->shared_from_this();
|
||||
computationNodePtr = outputVariableNode->template As<ComputationNodeBase>()->shared_from_this();
|
||||
else
|
||||
computationNodePtr = nullptr;
|
||||
}
|
||||
|
@@ -592,7 +666,7 @@ namespace CNTK
    template<typename ElementType>
    /*static*/ ComputationNodeBasePtr CompositeFunction::CreateComputationNode(const Variable& variable,
                                                                               Function* function,
                                                                               const std::vector<std::shared_ptr<ComputationNode<ElementType>>>& inputNodes,
                                                                               const std::vector<std::shared_ptr<ComputationNodeBase>>& inputNodes,
                                                                               Microsoft::MSR::CNTK::ComputationNetworkPtr& network,
                                                                               std::unordered_map<Variable, ComputationNodeBasePtr>& variableToNodeMap,
                                                                               bool useMangledNamesForComputationNodes)

@@ -609,6 +683,40 @@ namespace CNTK
        for (auto inputNode : inputNodes)
            inputNodesBasePtrs.push_back(inputNode);

        // take the dataType from the first input, if not specified (i.e. placeholder) then use default
        // node like BatchNormalization may have inputs with different precision,
        // and that validation is done in specific node constructor
        DataType inputNodeType = AsDataType<ElementType>();
        if (inputNodes.size() > 0)
        {
            if (std::dynamic_pointer_cast<ComputationNode<float>, ComputationNodeBase>(inputNodes[0]))
                inputNodeType = DataType::Float;
            else if (std::dynamic_pointer_cast<ComputationNode<double>, ComputationNodeBase>(inputNodes[0]))
                inputNodeType = DataType::Double;
            else if (std::dynamic_pointer_cast<ComputationNode<half>, ComputationNodeBase>(inputNodes[0]))
                inputNodeType = DataType::Float16;
        }

#define ASSIGN_NEW_NODE(nodeClass, ...) \
    do { \
        if (inputNodeType == DataType::Float) \
            computationNodePtr = New<nodeClass<float>>(__VA_ARGS__); \
        else if (inputNodeType == DataType::Double) \
            computationNodePtr = New<nodeClass<double>>(__VA_ARGS__); \
        else if (inputNodeType == DataType::Float16) \
            computationNodePtr = New<nodeClass<half>>(__VA_ARGS__); \
    } while(0)

#define ASSIGN_NEW_NODE2(nodeClass, dtype, ...) \
    do { \
        if (inputNodeType == DataType::Float) \
            computationNodePtr = New<nodeClass<dtype, float>>(__VA_ARGS__); \
        else if (inputNodeType == DataType::Double) \
            computationNodePtr = New<nodeClass<dtype, double>>(__VA_ARGS__); \
        else if (inputNodeType == DataType::Float16) \
            computationNodePtr = New<nodeClass<dtype, half>>(__VA_ARGS__); \
    } while(0)

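// A minimal, self-contained sketch of the dispatch pattern the ASSIGN_NEW_NODE macro above relies on:
// the element type is recovered at runtime from the first input node (via dynamic_pointer_cast) and
// then used to pick which template instantiation to construct. NodeBase, TypedNode, MAKE_TYPED_NODE
// and the local `half` tag are stand-ins for illustration, not the CNTK classes; like ASSIGN_NEW_NODE,
// the macro reads `kind` and writes `newNode` from the enclosing scope.
#include <iostream>
#include <memory>
#include <string>
#include <vector>

struct NodeBase { virtual ~NodeBase() = default; };

template <typename ElemType>
struct TypedNode : NodeBase
{
    explicit TypedNode(std::string name) : m_name(std::move(name)) {}
    std::string m_name;
};

enum class ElemKind { Float, Double, Half };

struct half {};   // placeholder tag; fp16 is not a built-in C++ type

#define MAKE_TYPED_NODE(nodeClass, ...)                                   \
    do {                                                                  \
        if (kind == ElemKind::Float)                                      \
            newNode = std::make_shared<nodeClass<float>>(__VA_ARGS__);    \
        else if (kind == ElemKind::Double)                                \
            newNode = std::make_shared<nodeClass<double>>(__VA_ARGS__);   \
        else if (kind == ElemKind::Half)                                  \
            newNode = std::make_shared<nodeClass<half>>(__VA_ARGS__);     \
    } while (0)

int main()
{
    std::vector<std::shared_ptr<NodeBase>> inputs = { std::make_shared<TypedNode<half>>("input0") };

    // Recover the element type of the first input at runtime.
    ElemKind kind = ElemKind::Float;
    if (std::dynamic_pointer_cast<TypedNode<double>>(inputs[0]))
        kind = ElemKind::Double;
    else if (std::dynamic_pointer_cast<TypedNode<half>>(inputs[0]))
        kind = ElemKind::Half;

    std::shared_ptr<NodeBase> newNode;
    MAKE_TYPED_NODE(TypedNode, "plus");   // instantiates TypedNode<half> here

    std::cout << (std::dynamic_pointer_cast<TypedNode<half>>(newNode) ? "half node\n" : "other\n");
    return 0;
}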
auto outputs = function->RawOutputs();
|
||||
if (variable == outputs[0])
|
||||
{
|
||||
|
@ -621,76 +729,76 @@ namespace CNTK
|
|||
switch (op)
|
||||
{
|
||||
case PrimitiveOpType::Negate:
|
||||
computationNodePtr = New<NegateNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(NegateNode, network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
case PrimitiveOpType::Sigmoid:
|
||||
computationNodePtr = New<SigmoidNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(SigmoidNode, network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
case PrimitiveOpType::Atanh:
|
||||
computationNodePtr = New<AtanhNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(AtanhNode, network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
case PrimitiveOpType::Tanh:
|
||||
computationNodePtr = New<TanhNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(TanhNode, network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
case PrimitiveOpType::Acos:
|
||||
computationNodePtr = New<AcosNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(AcosNode, network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
case PrimitiveOpType::Cos:
|
||||
computationNodePtr = New<CosineNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(CosineNode, network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
case PrimitiveOpType::Asin:
|
||||
computationNodePtr = New<AsinNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(AsinNode, network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
case PrimitiveOpType::Sin:
|
||||
computationNodePtr = New<SinNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(SinNode, network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
case PrimitiveOpType::Cosh:
|
||||
computationNodePtr = New<CoshNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(CoshNode, network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
case PrimitiveOpType::Asinh:
|
||||
computationNodePtr = New<AsinhNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(AsinhNode, network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
case PrimitiveOpType::Sinh:
|
||||
computationNodePtr = New<SinhNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(SinhNode, network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
case PrimitiveOpType::ReLU:
|
||||
computationNodePtr = New<RectifiedLinearNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(RectifiedLinearNode, network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
case PrimitiveOpType::Exp:
|
||||
computationNodePtr = New<ExpNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(ExpNode, network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
case PrimitiveOpType::Log:
|
||||
computationNodePtr = New<LogNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(LogNode, network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
case PrimitiveOpType::Sqrt:
|
||||
computationNodePtr = New<SqrtNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(SqrtNode, network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
case PrimitiveOpType::ELU:
|
||||
computationNodePtr = New<ExponentialLinearUnitNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(ExponentialLinearUnitNode, network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
case PrimitiveOpType::Floor:
|
||||
computationNodePtr = New<FloorNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(FloorNode, network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
case PrimitiveOpType::Abs:
|
||||
computationNodePtr = New<AbsNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(AbsNode, network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
case PrimitiveOpType::Reciprocal:
|
||||
computationNodePtr = New<ReciprocalNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(ReciprocalNode, network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
case PrimitiveOpType::Softmax:
|
||||
computationNodePtr = New<SoftmaxNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(SoftmaxNode, network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
case PrimitiveOpType::Hardmax:
|
||||
computationNodePtr = New<HardmaxNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(HardmaxNode, network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
case PrimitiveOpType::TopK:
|
||||
{
|
||||
auto k = functionConfig[PrimitiveFunction::AttributeNameNumItems].Value<size_t>();
|
||||
computationNodePtr = New<TopKNode<ElementType>>(network->GetDeviceId(), internalNodeName, k);
|
||||
ASSIGN_NEW_NODE(TopKNode, network->GetDeviceId(), internalNodeName, k);
|
||||
break;
|
||||
}
|
||||
case PrimitiveOpType::StableSigmoid:
|
||||
computationNodePtr = New<StableSigmoidNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(StableSigmoidNode, network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
case PrimitiveOpType::TransposeAxes:
|
||||
{
|
||||
|
@ -699,7 +807,7 @@ namespace CNTK
|
|||
auto perm = AsVector<Axis>(functionConfig[PrimitiveFunction::AttributeNameAxisVec].Value<std::vector<DictionaryValue>>());
|
||||
for (auto& p : perm)
|
||||
p = NormalizeStaticAxis(p, perm.size());
|
||||
computationNodePtr = New<TransposeDimensionsNode<ElementType>>(network->GetDeviceId(), internalNodeName, AsCNTKInternalAxisIdx(perm));
|
||||
ASSIGN_NEW_NODE(TransposeDimensionsNode, network->GetDeviceId(), internalNodeName, AsCNTKInternalAxisIdx(perm));
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -707,7 +815,7 @@ namespace CNTK
|
|||
auto axis2 = functionConfig[PrimitiveFunction::AttributeNameAxis2].Value<Axis>();
|
||||
|
||||
// The axis ids passed to the internal CNTK TransposeDimensionsNode are 1 based instead of 0 based
|
||||
computationNodePtr = New<TransposeDimensionsNode<ElementType>>(network->GetDeviceId(), internalNodeName, AsCNTKInternalAxisIdx(axis1), AsCNTKInternalAxisIdx(axis2));
|
||||
ASSIGN_NEW_NODE(TransposeDimensionsNode, network->GetDeviceId(), internalNodeName, AsCNTKInternalAxisIdx(axis1), AsCNTKInternalAxisIdx(axis2));
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
@ -715,24 +823,24 @@ namespace CNTK
|
|||
{
|
||||
auto dynamicAxes = variable.DynamicAxes();
|
||||
auto internalCNTKWhereNodeDynamicAxisName = InternalDynamicAxisNameFromDynamicAxes(dynamicAxes);
|
||||
computationNodePtr = New<WhereNode<ElementType>>(network->GetDeviceId(), internalNodeName, internalCNTKWhereNodeDynamicAxisName);
|
||||
ASSIGN_NEW_NODE(WhereNode, network->GetDeviceId(), internalNodeName, internalCNTKWhereNodeDynamicAxisName);
|
||||
break;
|
||||
}
|
||||
case PrimitiveOpType::ToSequence:
|
||||
{
|
||||
auto dynamicAxes = variable.DynamicAxes();
|
||||
auto internalCNTKDynamicAxisName = InternalDynamicAxisNameFromDynamicAxes(dynamicAxes);
|
||||
computationNodePtr = New<ToSequenceNode<ElementType>>(network->GetDeviceId(), internalNodeName, internalCNTKDynamicAxisName);
|
||||
ASSIGN_NEW_NODE(ToSequenceNode, network->GetDeviceId(), internalNodeName, internalCNTKDynamicAxisName);
|
||||
break;
|
||||
}
|
||||
case PrimitiveOpType::ToSequenceLike:
|
||||
computationNodePtr = New<ToSequenceLikeNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(ToSequenceLikeNode, network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
case PrimitiveOpType::UnpackSequence:
|
||||
{
|
||||
auto paddingValue = functionConfig[PrimitiveFunction::AttributeNameSequenceUnpackPaddingValue].Value<double>();
|
||||
auto suppressMaskOutput = functionConfig[PrimitiveFunction::AttributeNameSequenceUnpackSuppressMaskOutput].Value<bool>();
|
||||
computationNodePtr = New<UnpackSequenceNode<ElementType>>(network->GetDeviceId(), internalNodeName, (ElementType)paddingValue, suppressMaskOutput);
|
||||
ASSIGN_NEW_NODE(UnpackSequenceNode, network->GetDeviceId(), internalNodeName, paddingValue, suppressMaskOutput);
|
||||
break;
|
||||
}
|
||||
case PrimitiveOpType::Slice:
|
||||
|
@ -768,28 +876,28 @@ namespace CNTK
|
|||
RuntimeError("Failed to create computation node: Slice operation with inconsistent attributes");
|
||||
}
|
||||
// Internal CNTK SliceNode takes 1 based axis indices instead of 0 based
|
||||
computationNodePtr = New<SliceNode<ElementType>>(network->GetDeviceId(), internalNodeName, beginIndex, endIndex, AsCNTKInternalAxisIdx(axis), strides);
|
||||
ASSIGN_NEW_NODE(SliceNode, network->GetDeviceId(), internalNodeName, beginIndex, endIndex, AsCNTKInternalAxisIdx(axis), strides);
|
||||
break;
|
||||
}
|
||||
case PrimitiveOpType::RandomSample:
|
||||
{
|
||||
auto numSamples = functionConfig[PrimitiveFunction::AttributeNameNumSamples].Value<size_t>();
|
||||
auto allowDuplicates = functionConfig[PrimitiveFunction::AttributeNameAllowDuplicates].Value<bool>();
|
||||
computationNodePtr = New<RandomSampleNode<ElementType>>(network->GetDeviceId(), internalNodeName, numSamples, allowDuplicates);
|
||||
ASSIGN_NEW_NODE(RandomSampleNode, network->GetDeviceId(), internalNodeName, numSamples, allowDuplicates);
|
||||
break;
|
||||
}
|
||||
case PrimitiveOpType::RandomSampleInclusionFrequency:
|
||||
{
|
||||
auto numSamples = functionConfig[PrimitiveFunction::AttributeNameNumSamples].Value<size_t>();
|
||||
auto allowDuplicates = functionConfig[PrimitiveFunction::AttributeNameAllowDuplicates].Value<bool>();
|
||||
computationNodePtr = New<RandomSampleInclusionFrequencyNode<ElementType>>(network->GetDeviceId(), internalNodeName, numSamples, allowDuplicates);
|
||||
ASSIGN_NEW_NODE(RandomSampleInclusionFrequencyNode, network->GetDeviceId(), internalNodeName, numSamples, allowDuplicates);
|
||||
break;
|
||||
}
|
||||
case PrimitiveOpType::Dropout:
|
||||
{
|
||||
auto dropoutRate = functionConfig[PrimitiveFunction::AttributeNameDropoutRate].Value<double>();
|
||||
computationNodePtr = New<DropoutNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
computationNodePtr->As<DropoutNode<ElementType>>()->SetDropoutRate(dropoutRate);
|
||||
ASSIGN_NEW_NODE(DropoutNode, network->GetDeviceId(), internalNodeName);
|
||||
SMART_NODE_INVOKE(DropoutNode, computationNodePtr, SetDropoutRate, dropoutRate);
|
||||
break;
|
||||
}
|
||||
case PrimitiveOpType::RandomDistribution:
|
||||
|
@ -805,11 +913,11 @@ namespace CNTK
|
|||
if (functionConfig.Contains(PrimitiveFunction::AttributeNameNewShape))
|
||||
{
|
||||
auto shape = functionConfig[PrimitiveFunction::AttributeNameNewShape].Value<NDShape>();
|
||||
computationNodePtr = New<RandomDistributionNode<ElementType>>(network->GetDeviceId(), internalNodeName, rvtype, randomDistributionArgs, AsTensorShape(shape));
|
||||
ASSIGN_NEW_NODE(RandomDistributionNode, network->GetDeviceId(), internalNodeName, rvtype, randomDistributionArgs, AsTensorShape(shape));
|
||||
}
|
||||
else
|
||||
computationNodePtr = New<RandomDistributionNode<ElementType>>(network->GetDeviceId(), internalNodeName, rvtype, randomDistributionArgs);
|
||||
computationNodePtr->As<RandomDistributionNode<ElementType>>()->SetRngState(seed, offset);
|
||||
ASSIGN_NEW_NODE(RandomDistributionNode, network->GetDeviceId(), internalNodeName, rvtype, randomDistributionArgs);
|
||||
SMART_NODE_INVOKE(RandomDistributionNode, computationNodePtr, SetRngState, seed, offset);
|
||||
break;
|
||||
}
|
||||
case PrimitiveOpType::Reshape:
|
||||
|
@ -829,7 +937,7 @@ namespace CNTK
|
|||
replacementShape[i] = 0;
|
||||
}
|
||||
|
||||
computationNodePtr = New<ReshapeNode<ElementType>>(network->GetDeviceId(), internalNodeName, AsTensorShape(replacementShape), AsCNTKInternalAxisIdx(beginAxis), AsCNTKInternalAxisIdx(endAxis));
|
||||
ASSIGN_NEW_NODE(ReshapeNode, network->GetDeviceId(), internalNodeName, AsTensorShape(replacementShape), AsCNTKInternalAxisIdx(beginAxis), AsCNTKInternalAxisIdx(endAxis));
|
||||
break;
|
||||
}
|
||||
case PrimitiveOpType::Squeeze:
|
||||
|
@ -853,7 +961,7 @@ namespace CNTK
|
|||
PoolingType poolingType = (PoolingType)(functionConfig[PrimitiveFunction::AttributeNamePoolingType].Value<size_t>());
|
||||
auto roiOutputShape = functionConfig[PrimitiveFunction::AttributeNameROIOutputShape].Value<NDShape>();
|
||||
auto spatialScale = functionConfig[PrimitiveFunction::AttributeNameSpatialScale].Value<double>();
|
||||
computationNodePtr = New<ROIPoolingNode<ElementType>>(network->GetDeviceId(), internalNodeName, AsCNTKPoolKind(poolingType), AsTensorShape(roiOutputShape), spatialScale);
|
||||
ASSIGN_NEW_NODE(ROIPoolingNode, network->GetDeviceId(), internalNodeName, AsCNTKPoolKind(poolingType), AsTensorShape(roiOutputShape), spatialScale);
|
||||
break;
|
||||
}
|
||||
case PrimitiveOpType::Pooling:
|
||||
|
@ -874,7 +982,7 @@ namespace CNTK
|
|||
{
|
||||
includePad = functionConfig[PrimitiveFunction::AttributeNameIncludePad].Value<bool>();
|
||||
}
|
||||
computationNodePtr = New<PoolingNode<ElementType>>(network->GetDeviceId(), internalNodeName, AsCNTKPoolKind(poolingType), AsTensorShape(poolingWindowsShape), AsTensorShape(strides), autoPadding, AsTensorShape(lowerPad), AsTensorShape(upperPad), ceilOutDim, includePad, ImageLayoutKind::CHW);
|
||||
ASSIGN_NEW_NODE(PoolingNode, network->GetDeviceId(), internalNodeName, AsCNTKPoolKind(poolingType), AsTensorShape(poolingWindowsShape), AsTensorShape(strides), autoPadding, AsTensorShape(lowerPad), AsTensorShape(upperPad), ceilOutDim, includePad, ImageLayoutKind::CHW);
|
||||
break;
|
||||
}
|
||||
case PrimitiveOpType::Unpooling:
|
||||
|
@ -885,77 +993,77 @@ namespace CNTK
|
|||
auto upperPad = functionConfig[PrimitiveFunction::AttributeNameUpperPad].Value<NDShape>();
|
||||
auto autoPadding = AsVector<bool>(functionConfig[PrimitiveFunction::AttributeNameAutoPadding].Value<std::vector<DictionaryValue>>());
|
||||
//We only get here after validation so it is safe to assume unpooling is max
|
||||
computationNodePtr = New<MaxUnpoolingNode<ElementType>>(network->GetDeviceId(), internalNodeName, AsTensorShape(unpoolingWindowShape), AsTensorShape(strides), autoPadding, AsTensorShape(lowerPad), AsTensorShape(upperPad), ImageLayoutKind::CHW);
|
||||
ASSIGN_NEW_NODE(MaxUnpoolingNode, network->GetDeviceId(), internalNodeName, AsTensorShape(unpoolingWindowShape), AsTensorShape(strides), autoPadding, AsTensorShape(lowerPad), AsTensorShape(upperPad), ImageLayoutKind::CHW);
|
||||
break;
|
||||
}
|
||||
case PrimitiveOpType::SumAll:
|
||||
computationNodePtr = New<SumElementsNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(SumElementsNode, network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
case PrimitiveOpType::OneHot:
|
||||
{
|
||||
auto numClass = functionConfig[PrimitiveFunction::AttributeNameNumClass].Value<size_t>();
|
||||
auto is_sparse = functionConfig[PrimitiveFunction::AttributeNameOneHotOutputSparse].Value<bool>();
|
||||
auto axis = functionConfig[PrimitiveFunction::AttributeNameOneHotAxis].Value<Axis>();
|
||||
computationNodePtr = New<OneHotNode<ElementType>>(network->GetDeviceId(), numClass, is_sparse, axis.StaticAxisIndex(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(OneHotNode, network->GetDeviceId(), numClass, is_sparse, axis.StaticAxisIndex(), internalNodeName);
|
||||
break;
|
||||
}
|
||||
case PrimitiveOpType::Gather:
|
||||
computationNodePtr = New<GatherNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(GatherNode, network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
case PrimitiveOpType::ToBatch:
|
||||
{
|
||||
computationNodePtr = New<ToBatchAxisNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(ToBatchAxisNode, network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
}
|
||||
case PrimitiveOpType::UnpackBatch:
|
||||
{
|
||||
computationNodePtr = New<UnpackBatchAxisNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(UnpackBatchAxisNode, network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
}
|
||||
case PrimitiveOpType::Plus:
|
||||
computationNodePtr = New<PlusNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(PlusNode, network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
case PrimitiveOpType::LogPlus:
|
||||
computationNodePtr = New<LogPlusNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(LogPlusNode, network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
case PrimitiveOpType::Pow:
|
||||
computationNodePtr = New<PowNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(PowNode, network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
case PrimitiveOpType::Minus:
|
||||
computationNodePtr = New<MinusNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(MinusNode, network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
case PrimitiveOpType::ElementTimes:
|
||||
computationNodePtr = New<ElementTimesNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(ElementTimesNode, network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
case PrimitiveOpType::Equal:
|
||||
computationNodePtr = New<EqualNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(EqualNode, network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
case PrimitiveOpType::NotEqual:
|
||||
computationNodePtr = New<NotEqualNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(NotEqualNode, network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
case PrimitiveOpType::Less:
|
||||
computationNodePtr = New<LessNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(LessNode, network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
case PrimitiveOpType::LessEqual:
|
||||
computationNodePtr = New<LessEqualNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(LessEqualNode, network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
case PrimitiveOpType::Greater:
|
||||
computationNodePtr = New<GreaterNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(GreaterNode, network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
case PrimitiveOpType::GreaterEqual:
|
||||
computationNodePtr = New<GreaterEqualNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(GreaterEqualNode, network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
case PrimitiveOpType::Times:
|
||||
{
|
||||
size_t outputRank = functionConfig[PrimitiveFunction::AttributeNameOutputRank].Value<size_t>();
|
||||
auto inferInputRankToMap = functionConfig[PrimitiveFunction::AttributeNameInferInputRankToMap].Value<int>();
|
||||
computationNodePtr = New<TimesNode<ElementType>>(network->GetDeviceId(), internalNodeName, outputRank, inferInputRankToMap);
|
||||
ASSIGN_NEW_NODE(TimesNode, network->GetDeviceId(), internalNodeName, outputRank, inferInputRankToMap);
|
||||
break;
|
||||
}
|
||||
case PrimitiveOpType::TransposeTimes:
|
||||
{
|
||||
size_t outputRank = functionConfig[PrimitiveFunction::AttributeNameOutputRank].Value<size_t>();
|
||||
computationNodePtr = New<TransposeTimesNode<ElementType>>(network->GetDeviceId(), internalNodeName, outputRank);
|
||||
ASSIGN_NEW_NODE(TransposeTimesNode, network->GetDeviceId(), internalNodeName, outputRank);
|
||||
break;
|
||||
}
|
||||
case PrimitiveOpType::Convolution:
|
||||
|
@ -975,7 +1083,7 @@ namespace CNTK
|
|||
if (functionConfig.Contains(PrimitiveFunction::AttributeNameOutputShape))
|
||||
outputShape = functionConfig[PrimitiveFunction::AttributeNameOutputShape].Value<NDShape>();
|
||||
auto maxTempMemSizeInSamples = functionConfig[PrimitiveFunction::AttributeNameMaxTempMemSizeInSamples].Value<size_t>();
|
||||
computationNodePtr = New<ConvolutionNode<ElementType>>(network->GetDeviceId(), internalNodeName,
|
||||
ASSIGN_NEW_NODE(ConvolutionNode, network->GetDeviceId(), internalNodeName,
|
||||
AsTensorShape(kernelShape), AsTensorShape(outputMapCount), AsTensorShape(strides),
|
||||
sharing, autoPadding, AsTensorShape(lowerPad), AsTensorShape(upperPad), transpose,
|
||||
outputShape.IsUnknown() ? TensorShape(0) : AsTensorShape(outputShape),
|
||||
|
@ -983,22 +1091,22 @@ namespace CNTK
|
|||
break;
|
||||
}
|
||||
case PrimitiveOpType::CosDistance:
|
||||
computationNodePtr = New<CosDistanceNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(CosDistanceNode, network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
case PrimitiveOpType::CosDistanceWithNegativeSamples:
|
||||
computationNodePtr = New<CosDistanceWithNegativeSamplesNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(CosDistanceWithNegativeSamplesNode, network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
case PrimitiveOpType::Logistic:
|
||||
computationNodePtr = New<LogisticNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(LogisticNode, network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
case PrimitiveOpType::SquaredError:
|
||||
computationNodePtr = New<SquareErrorNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(SquareErrorNode, network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
case PrimitiveOpType::CrossEntropyWithSoftmax:
|
||||
computationNodePtr = New<CrossEntropyWithSoftmaxNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(CrossEntropyWithSoftmaxNode, network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
case PrimitiveOpType::ClassificationError:
|
||||
computationNodePtr = New<ClassificationErrorNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(ClassificationErrorNode, network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
case PrimitiveOpType::EditDistanceError:
|
||||
{
|
||||
|
@ -1007,7 +1115,7 @@ namespace CNTK
|
|||
auto insPen = functionConfig[PrimitiveFunction::AttributeNameInsertionPenalty].Value<float>();
|
||||
auto squashInputs = functionConfig[PrimitiveFunction::AttributeNameSquashInputs].Value<bool>();
|
||||
auto tokensToIgnore = AsVector<size_t>(functionConfig[PrimitiveFunction::AttributeNameTokensToIgnore].Value<std::vector<DictionaryValue>>());
|
||||
computationNodePtr = New<EditDistanceErrorNode<ElementType>>(network->GetDeviceId(), internalNodeName, subPen, delPen, insPen, squashInputs, tokensToIgnore);
|
||||
ASSIGN_NEW_NODE(EditDistanceErrorNode, network->GetDeviceId(), internalNodeName, subPen, delPen, insPen, squashInputs, tokensToIgnore);
|
||||
break;
|
||||
}
|
||||
case PrimitiveOpType::LatticeSequenceWithSoftmax:
|
||||
|
@ -1033,14 +1141,14 @@ namespace CNTK
|
|||
{
|
||||
auto delayContraint = functionConfig[PrimitiveFunction::AttributeNameDelayConstraint].Value<int>();
|
||||
auto blankTokenId = functionConfig[PrimitiveFunction::AttributeNameBlankTokenId].Value<size_t>();
|
||||
computationNodePtr = New<ForwardBackwardNode<ElementType>>(network->GetDeviceId(), internalNodeName, blankTokenId, delayContraint);
|
||||
ASSIGN_NEW_NODE(ForwardBackwardNode, network->GetDeviceId(), internalNodeName, blankTokenId, delayContraint);
|
||||
break;
|
||||
}
|
||||
case PrimitiveOpType::LambdaRank:
|
||||
computationNodePtr = New<LambdaRankNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(LambdaRankNode, network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
case PrimitiveOpType::NDCG:
|
||||
computationNodePtr = New<NDCG1EvalNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(NDCG1EvalNode, network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
case PrimitiveOpType::PastValue:
|
||||
case PrimitiveOpType::FutureValue:
|
||||
|
@ -1050,9 +1158,9 @@ namespace CNTK
|
|||
|
||||
size_t offset = primitiveFunction->Attributes()[PrimitiveFunction::AttributeNameOffset].Value<size_t>();
|
||||
if (op == PrimitiveOpType::PastValue)
|
||||
computationNodePtr = New<PastValueNode<ElementType>>(network->GetDeviceId(), internalNodeName, AsTensorShape(inputOperandVar.Shape()), offset);
|
||||
ASSIGN_NEW_NODE(PastValueNode, network->GetDeviceId(), internalNodeName, AsTensorShape(inputOperandVar.Shape()), offset);
|
||||
else
|
||||
computationNodePtr = New<FutureValueNode<ElementType>>(network->GetDeviceId(), internalNodeName, AsTensorShape(inputOperandVar.Shape()), offset);
|
||||
ASSIGN_NEW_NODE(FutureValueNode, network->GetDeviceId(), internalNodeName, AsTensorShape(inputOperandVar.Shape()), offset);
|
||||
|
||||
break;
|
||||
}
|
||||
|
@ -1080,7 +1188,7 @@ namespace CNTK
|
|||
);
|
||||
|
||||
}
|
||||
computationNodePtr = New<ReduceElementsNode<ElementType>>(network->GetDeviceId(), internalNodeName, reductionOpName, AsCNTKInternalAxisIdx(reductionAxis), keepDimensions);
|
||||
ASSIGN_NEW_NODE(ReduceElementsNode, network->GetDeviceId(), internalNodeName, reductionOpName, AsCNTKInternalAxisIdx(reductionAxis), keepDimensions);
|
||||
break;
|
||||
}
|
||||
case PrimitiveOpType::BatchNormalization:
|
||||
|
@ -1097,7 +1205,7 @@ namespace CNTK
|
|||
disableRegularization = functionConfig[PrimitiveFunction::AttributeNameDisableRegularization].Value<bool>();
|
||||
}
|
||||
|
||||
computationNodePtr = New<BatchNormalizationNode<ElementType>>(network->GetDeviceId(), internalNodeName, spatial, normalizationTimeConstant, blendTimeConstant, epsilon, !useCuDNNEngine, disableRegularization, ImageLayoutKind::CHW);
|
||||
ASSIGN_NEW_NODE(BatchNormalizationNode, network->GetDeviceId(), internalNodeName, spatial, normalizationTimeConstant, blendTimeConstant, epsilon, !useCuDNNEngine, disableRegularization, ImageLayoutKind::CHW);
|
||||
break;
|
||||
}
|
||||
case PrimitiveOpType::Combine:
|
||||
|
@ -1106,24 +1214,24 @@ namespace CNTK
|
|||
computationNodePtr = variableToNodeMap[variable];
|
||||
break;
|
||||
case PrimitiveOpType::PackedIndex:
|
||||
computationNodePtr = New<PackedIndexNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(PackedIndexNode, network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
case PrimitiveOpType::GatherPacked:
|
||||
computationNodePtr = New<GatherPackedNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(GatherPackedNode, network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
case PrimitiveOpType::ScatterPacked:
|
||||
computationNodePtr = New<ScatterPackedNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(ScatterPackedNode, network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
case PrimitiveOpType::Clip:
|
||||
computationNodePtr = New<ClipNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(ClipNode, network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
case PrimitiveOpType::Select:
|
||||
computationNodePtr = New<IfNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(IfNode, network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
case PrimitiveOpType::Splice:
|
||||
{
|
||||
Axis spliceAxis = functionConfig[PrimitiveFunction::AttributeNameAxis].Value<Axis>();
|
||||
computationNodePtr = New<RowStackNode<ElementType>>(network->GetDeviceId(), internalNodeName, AsCNTKInternalAxisIdx(spliceAxis));
|
||||
ASSIGN_NEW_NODE(RowStackNode, network->GetDeviceId(), internalNodeName, AsCNTKInternalAxisIdx(spliceAxis));
|
||||
break;
|
||||
}
|
||||
case PrimitiveOpType::Pad:
|
||||
|
@ -1132,7 +1240,7 @@ namespace CNTK
|
|||
auto foot = AsVector<size_t>(functionConfig[PrimitiveFunction::AttributeNamePaddingFoot].Value<std::vector<DictionaryValue>>());
|
||||
auto mode = functionConfig[PrimitiveFunction::AttributeNamePaddingMode].Value<size_t>();
|
||||
auto constantValue = functionConfig[PrimitiveFunction::AttributeNamePaddingConstantValue].Value<double>();
|
||||
computationNodePtr = New<PaddingNode<ElementType>>(network->GetDeviceId(), internalNodeName, head, foot, (PaddingType)mode, (ElementType)constantValue);
|
||||
ASSIGN_NEW_NODE(PaddingNode, network->GetDeviceId(), internalNodeName, head, foot, (PaddingType)mode, constantValue);
|
||||
break;
|
||||
}
|
||||
case PrimitiveOpType::OptimizedRNNStack:
|
||||
|
@ -1142,31 +1250,31 @@ namespace CNTK
|
|||
auto hiddenSize = functionConfig[PrimitiveFunction::AttributeNameHiddenSize].Value<size_t>();
|
||||
auto recurrentOp = functionConfig[PrimitiveFunction::AttributeNameRecurrentOp].Value<std::wstring>();
|
||||
|
||||
computationNodePtr = New<OptimizedRNNStackNode<ElementType>>(network->GetDeviceId(), internalNodeName, bidirectional, numLayers, hiddenSize, recurrentOp);
|
||||
ASSIGN_NEW_NODE(OptimizedRNNStackNode, network->GetDeviceId(), internalNodeName, bidirectional, numLayers, hiddenSize, recurrentOp);
|
||||
break;
|
||||
}
|
||||
case PrimitiveOpType::ReconcileDynamicAxis:
|
||||
{
|
||||
computationNodePtr = New<ReconcileDynamicAxisNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(ReconcileDynamicAxisNode, network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
}
|
||||
case PrimitiveOpType::LogSoftmax:
|
||||
{
|
||||
//This can be implemented as x => x - ReduceLogSum(x). How to do this here?
|
||||
computationNodePtr = New<LogSoftmaxNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(LogSoftmaxNode, network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
}
|
||||
case PrimitiveOpType::Pass:
|
||||
computationNodePtr = New<PassNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(PassNode, network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
case PrimitiveOpType::LabelsToGraph:
|
||||
computationNodePtr = New<LabelsToGraphNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(LabelsToGraphNode, network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
case PrimitiveOpType::StopGradient:
|
||||
computationNodePtr = New<StopGradientNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(StopGradientNode, network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
case PrimitiveOpType::Assign:
|
||||
computationNodePtr = New<AssignNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(AssignNode, network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
case PrimitiveOpType::Crop:
|
||||
if (functionInputs.size() == 2)
|
||||
|
@ -1179,24 +1287,41 @@ namespace CNTK
|
|||
{
|
||||
CNTK::LogicError("Vector of crop offsets must have size 2.");
|
||||
}
|
||||
computationNodePtr = New<CropNode<ElementType>>(offsets[0], offsets[1], network->GetDeviceId(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(CropNode, offsets[0], offsets[1], network->GetDeviceId(), internalNodeName);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Crop with two inputs and automatic offset computation.
|
||||
computationNodePtr = New<CropNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(CropNode, network->GetDeviceId(), internalNodeName);
|
||||
}
|
||||
}
|
||||
else if (functionInputs.size() == 4)
|
||||
{
|
||||
// Crop with four inputs and automatic offset computation.
|
||||
computationNodePtr = New<CropNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
ASSIGN_NEW_NODE(CropNode, network->GetDeviceId(), internalNodeName);
|
||||
}
|
||||
else
|
||||
{
|
||||
CNTK::LogicError("Crop node must have 2 or 4 node inputs.");
|
||||
}
|
||||
break;
|
||||
case PrimitiveOpType::Cast:
|
||||
{
|
||||
DataType outputType = (DataType)functionConfig[PrimitiveFunction::AttributeNameNewDataType].Value<int>();
|
||||
switch (outputType)
|
||||
{
|
||||
case DataType::Float:
|
||||
ASSIGN_NEW_NODE2(CastNode, float, network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
case DataType::Double:
|
||||
ASSIGN_NEW_NODE2(CastNode, double, network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
case DataType::Float16:
|
||||
ASSIGN_NEW_NODE2(CastNode, half, network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
}
|
||||
break;
|
||||
}
|
||||
default:
|
||||
CNTK::LogicError("Specified op %S not yet supported", PrimitiveOpTypeName(op).c_str());
|
||||
break;
|
||||
|
@ -1224,7 +1349,7 @@ namespace CNTK
|
|||
}
|
||||
else
|
||||
{
|
||||
computationNodePtr = New<UserDefinedV2FunctionNode<ElementType>>(network->GetDeviceId(), internalNodeName, function->shared_from_this());
|
||||
ASSIGN_NEW_NODE(UserDefinedV2FunctionNode, network->GetDeviceId(), internalNodeName, function->shared_from_this());
|
||||
|
||||
// For user defined functions, we only attach unique inputs in the internal computation network since, the UDF
|
||||
// backward implementations directly compute aggregate gradient values for unique inputs
|
||||
|
@ -1244,7 +1369,7 @@ namespace CNTK
|
|||
while (outputs[i] != variable) i++;
|
||||
assert(i < outputs.size());
|
||||
|
||||
computationNodePtr = New<OutputMultiplexerNode<ElementType>>(network->GetDeviceId(), CNTKInternalNodeNameFromUidAndName(variable.Uid(), variable.Name(), useMangledNamesForComputationNodes), i);
|
||||
ASSIGN_NEW_NODE(OutputMultiplexerNode, network->GetDeviceId(), CNTKInternalNodeNameFromUidAndName(variable.Uid(), variable.Name(), useMangledNamesForComputationNodes), i);
|
||||
inputNodesBasePtrs = { variableToNodeMap[outputs[0]] };
|
||||
}
|
||||
|
||||
|
@ -1279,15 +1404,16 @@ namespace CNTK
|
|||
}
|
||||
|
||||
// Create the nodes corresponding to the inputs
|
||||
std::vector<std::shared_ptr<ComputationNode<ElementType>>> inputNodes;
|
||||
std::vector<std::shared_ptr<ComputationNodeBase>> inputNodes;
|
||||
for (auto& inputVar : functionInputs)
|
||||
{
|
||||
// If the inputVar is a constant and not the right DataType let's coerce it to the right type
|
||||
if (inputVar.IsConstant() && (nonConstInputDataType != DataType::Unknown) && (inputVar.GetDataType() != nonConstInputDataType))
|
||||
// except for FP16 that mismatch is needed (e.g. BatchNorm stats in FP16 need to be FP32)
|
||||
if (inputVar.IsConstant() && (nonConstInputDataType != DataType::Unknown) && (nonConstInputDataType != DataType::Float16) && (inputVar.GetDataType() != nonConstInputDataType))
|
||||
inputVar = Constant(inputVar).CloneAs(nonConstInputDataType);
|
||||
|
||||
auto baseNodePtr = GetNode(inputVar, network, builder, fullyDefinedArgumentsMap, variableToNodeMap, isVariableRootMap, inputsToExcludeGradientsFor, useMangledNamesForComputationNodes);
|
||||
inputNodes.push_back((baseNodePtr != nullptr) ? baseNodePtr->template As<ComputationNode<ElementType>>()->shared_from_this() : nullptr);
|
||||
inputNodes.push_back((baseNodePtr != nullptr) ? baseNodePtr : nullptr);
|
||||
}
|
||||
|
||||
BlockFunction* blockFunction = dynamic_cast<BlockFunction*>(function);
|
||||
|
@ -1303,7 +1429,7 @@ namespace CNTK
|
|||
return GetNode(variable.BlockFunctionVariableMapping(), network, builder, fullyDefinedArgumentsMap, variableToNodeMap, isVariableRootMap, inputsToExcludeGradientsFor, useMangledNamesForComputationNodes);
|
||||
}
|
||||
else
|
||||
computationNodePtr = CreateComputationNode(variable, function, inputNodes, network, variableToNodeMap, useMangledNamesForComputationNodes);
|
||||
computationNodePtr = CreateComputationNode<ElementType>(variable, function, inputNodes, network, variableToNodeMap, useMangledNamesForComputationNodes);
|
||||
|
||||
PrimitiveFunction* primitiveFunction = dynamic_cast<PrimitiveFunction*>(function);
|
||||
if (!primitiveFunction || (primitiveFunction->OpType() != PrimitiveOpType::Combine))
|
||||
|
@ -1732,6 +1858,9 @@ namespace CNTK
|
|||
case DataType::Double:
|
||||
PopulateComputationNodeValue<double>({ argument, argumentValue }, argumentComputationNode, layoutsPopulated);
|
||||
break;
|
||||
case DataType::Float16:
|
||||
PopulateComputationNodeValue<half>({ argument, argumentValue }, argumentComputationNode, layoutsPopulated);
|
||||
break;
|
||||
default:
|
||||
LogicError("Function '%S' Forward: Unsupported DataType %s.", AsString().c_str(), DataTypeName(argumentValue->GetDataType()));
|
||||
break;
|
||||
|
@ -1775,6 +1904,9 @@ namespace CNTK
|
|||
case DataType::Double:
|
||||
PopulateComputationNodeGradient<double>(gradientVarValuePair, outputComputationNode);
|
||||
break;
|
||||
case DataType::Float16:
|
||||
PopulateComputationNodeGradient<half>(gradientVarValuePair, outputComputationNode);
|
||||
break;
|
||||
default:
|
||||
LogicError("Function '%S' Backward: Unsupported DataType %s.", AsString().c_str(), DataTypeName(gradientValue->GetDataType()));
|
||||
break;
|
||||
|
@ -1816,6 +1948,15 @@ namespace CNTK
|
|||
nodeValue = Utils::GetValueObjectFromCNTKImplMatrixAndMBLayout<double>(var, computationNode, matrix, layout);
|
||||
break;
|
||||
}
|
||||
case DataType::Float16:
|
||||
{
|
||||
auto& matrix = getGradient ? computationNode->As<ComputationNode<half>>()->Gradient() : computationNode->As<ComputationNode<half>>()->Value();
|
||||
if (varValue == nullptr)
|
||||
nodeValue = MakeSharedObject<PackedValue>(varShape, var.DynamicAxes(), std::make_shared<Matrix<half>>(matrix.AsReference()), layout, /*readOnly =*/ false);
|
||||
else
|
||||
nodeValue = Utils::GetValueObjectFromCNTKImplMatrixAndMBLayout<half>(var, computationNode, matrix, layout);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
CNTK::LogicError("CompositeFunction::Forward/Backward: Unsupported DataType %s", DataTypeName(var.GetDataType()));
|
||||
break;
|
||||
|
@ -1973,6 +2114,8 @@ namespace CNTK
|
|||
GetComputationNetwork<float>(computeDevice, outputsToRetainBackwardStateFor, requestedOutputVariables, inputsToExcludeGradientsFor, true);
|
||||
else if (dataType == DataType::Double)
|
||||
GetComputationNetwork<double>(computeDevice, outputsToRetainBackwardStateFor, requestedOutputVariables, inputsToExcludeGradientsFor, true);
|
||||
else if (dataType == DataType::Float16)
|
||||
GetComputationNetwork<half>(computeDevice, outputsToRetainBackwardStateFor, requestedOutputVariables, inputsToExcludeGradientsFor, true);
|
||||
else
|
||||
InvalidArgument("Unsupported DataType %s", DataTypeName(dataType));
|
||||
|
||||
|
|
|
@ -286,7 +286,7 @@ namespace CNTK
|
|||
template <typename ElementType>
|
||||
static Microsoft::MSR::CNTK::ComputationNodeBasePtr CreateComputationNode(const Variable& variable,
|
||||
Function* function,
|
||||
const std::vector<std::shared_ptr<Microsoft::MSR::CNTK::ComputationNode<ElementType>>>& inputNodes,
|
||||
const std::vector<std::shared_ptr<Microsoft::MSR::CNTK::ComputationNodeBase>>& inputNodes,
|
||||
Microsoft::MSR::CNTK::ComputationNetworkPtr& network,
|
||||
std::unordered_map<Variable, Microsoft::MSR::CNTK::ComputationNodeBasePtr>& variableToNodeMap,
|
||||
bool useMangledNamesForComputationNodes);
|
||||
|
@ -359,6 +359,11 @@ namespace CNTK
|
|||
refVar.IsParameter() ? Parameter(refVar).RecordValueUpdate() : Constant(refVar).RecordValueUpdate();
|
||||
}
|
||||
|
||||
template <typename ElementType>
|
||||
static Microsoft::MSR::CNTK::ComputationNodeBasePtr CreateLearnableParameterFromVariable(const Variable& variable, Microsoft::MSR::CNTK::ComputationNetworkBuilder<ElementType>& builder, const NDShape& shape, const std::wstring& name);
|
||||
|
||||
static void CastAssignNodeValue(Microsoft::MSR::CNTK::ComputationNodeBasePtr node, DataType dataType, std::shared_ptr<const Microsoft::MSR::CNTK::MatrixBase> matrix);
|
||||
|
||||
private:
|
||||
|
||||
// Set of all primitive functions in the graph underlying 'this' Function. Also keeps the primitive Function objects alive
|
||||
|
|
|
@ -133,7 +133,7 @@ namespace CNTK
|
|||
#endif
|
||||
|
||||
if (info.IsEmpty())
|
||||
PrepaireZeroGradients(gradientValues, info);
|
||||
PrepaireZeroGradients(gradientValues);
|
||||
|
||||
// sorts gradient buffers according to parameter uid, and perform sparse to dense conversion
|
||||
// if !UseSparseGradientAggregationInDataParallelSGD()
|
||||
|
|
|
@ -64,6 +64,8 @@ namespace CNTK
|
|||
return viewPtr->WritableDataBuffer<float>();
|
||||
if (viewPtr->GetDataType() == DataType::Double)
|
||||
return viewPtr->WritableDataBuffer<double>();
|
||||
if (viewPtr->GetDataType() == DataType::Float16)
|
||||
return viewPtr->WritableDataBuffer<float16>();
|
||||
|
||||
LogicError("Unknown DataType");
|
||||
return nullptr; // Make compiler happy.
|
||||
|
@ -398,6 +400,11 @@ namespace CNTK
|
|||
AllReduceData(static_cast<double*>(inputData), static_cast<double*>(outputData), numElements,
|
||||
&allReduceRequests, (inputValue->Device() == DeviceDescriptor::CPUDevice()));
|
||||
}
|
||||
else if (dataType == DataType::Float16)
|
||||
{
|
||||
AllReduceDataHalf(static_cast<half*>(inputData), static_cast<half*>(outputData), numElements,
|
||||
&allReduceRequests, (inputValue->Device() == DeviceDescriptor::CPUDevice()));
|
||||
}
|
||||
else
|
||||
LogicError("MPICommunicator: Unknown DataType.");
|
||||
}
|
||||
|
@ -581,8 +588,10 @@ namespace CNTK
|
|||
|
||||
if (sbc->GetDataType() == DataType::Float)
|
||||
AllReduceData<float>((float*)nz, (float*)nz, requiredElements, nullptr, aggregateOnCPU, MPI_SUM, true);
|
||||
else
|
||||
else if (sbc->GetDataType() == DataType::Double)
|
||||
AllReduceData<double>((double*)nz, (double*)nz, requiredElements, nullptr, aggregateOnCPU, MPI_SUM, true);
|
||||
else if (sbc->GetDataType() == DataType::Float16)
|
||||
AllReduceDataHalf((half*)nz, (half*)nz, requiredElements, nullptr, aggregateOnCPU, MPI_SUM, true);
|
||||
|
||||
if (aggregateOnCPU)
|
||||
{
|
||||
|
@@ -719,4 +728,17 @@ namespace CNTK
        else
            m_mpi->AllReduceAsync(inputData, outputData, numElements, &(pAllReduceRequests->back()), op);
    }

    void MPICommunicatorImpl::AllReduceDataHalf(half* inputData, half* outputData, size_t numElements, std::vector<MPI_Request>* pAllReduceRequests, bool dataOnCPU, MPI_Op op, bool forceSync)
    {
        if (m_nccl->IsSupported() && !dataOnCPU)
        {
            m_nccl->AllReduce(inputData, outputData, numElements, op);

            return;
        }

        // half aggregation other than NCCL is not supported
        NOT_IMPLEMENTED;
    }
}
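// Minimal sketch of the aggregation policy encoded in AllReduceDataHalf above: fp16 gradients are
// only aggregated through NCCL on GPU, and any CPU/MPI path deliberately fails, which matches the
// release note that FP16 distributed training requires NCCL2. The enum and function names below are
// stand-ins for illustration only.
#include <cstdio>
#include <stdexcept>

enum class Dtype { Float, Double, Half };

void AllReduceSketch(Dtype dtype, bool ncclAvailable, bool dataOnCPU)
{
    if (dtype == Dtype::Half)
    {
        if (ncclAvailable && !dataOnCPU)
            std::puts("aggregate fp16 gradients via NCCL on GPU");
        else
            throw std::runtime_error("fp16 aggregation is only implemented on GPU via NCCL");
    }
    else
    {
        std::puts("aggregate fp32/fp64 gradients via NCCL or MPI");
    }
}

int main()
{
    AllReduceSketch(Dtype::Half, /*ncclAvailable=*/true, /*dataOnCPU=*/false);
    return 0;
}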
|
|
|
@ -143,5 +143,7 @@ namespace CNTK
|
|||
|
||||
template <typename ElemType>
|
||||
void AllReduceData(ElemType* inputData, ElemType* outputData, size_t numElements, std::vector<MPI_Request>* pAllReduceRequests, bool dataOnCPU, MPI_Op op = MPI_SUM, bool forceSync = false);
|
||||
|
||||
void AllReduceDataHalf(half* inputData, half* outputData, size_t numElements, std::vector<MPI_Request>* pAllReduceRequests, bool dataOnCPU, MPI_Op op = MPI_SUM, bool forceSync = false);
|
||||
};
|
||||
}
|
||||
|
|
|
@ -36,7 +36,7 @@ namespace CNTK
|
|||
m_sampleCount = checkpoint[L"totalNumberOfSamplesSeen"].Value<size_t>();
|
||||
}
|
||||
|
||||
void DistributedLearnerBase::PrepaireZeroGradients(std::unordered_map<Parameter, NDArrayViewPtr>& gradientValues, MinibatchInfo& info)
|
||||
void DistributedLearnerBase::PrepaireZeroGradients(std::unordered_map<Parameter, NDArrayViewPtr>& gradientValues)
|
||||
{
|
||||
// Need to initialize gradients to 0 in case when it is an empty minibatch.
|
||||
for (auto& g : gradientValues)
|
||||
|
@ -44,10 +44,6 @@ namespace CNTK
|
|||
auto weights = g.first.Value();
|
||||
g.second = MakeSharedObject<NDArrayView>(0, weights->GetDataType(), weights->Shape(), weights->Device());
|
||||
}
|
||||
|
||||
auto dataType = gradientValues.begin()->first.GetDataType();
|
||||
info.evalCriterionValue = MakeSharedObject<NDArrayView>(0, dataType, NDShape{}, DeviceDescriptor::UseDefaultDevice());
|
||||
info.trainingLossValue = MakeSharedObject<NDArrayView>(0, dataType, NDShape{}, DeviceDescriptor::UseDefaultDevice());
|
||||
}
|
||||
|
||||
void DistributedLearnerBase::ConvertToOrdered(const std::unordered_map<Parameter, NDArrayViewPtr>& gradientValues, std::vector<std::pair<Parameter, NDArrayViewPtr>>& result, std::unordered_map<Parameter, NDArrayViewPtr>* convertedGradientValues)
|
||||
|
|
|
@ -22,7 +22,7 @@ namespace CNTK
|
|||
protected:
|
||||
DistributedLearnerBase(DistributedCommunicatorPtr communicator, LearnerPtr learner, size_t distributeAfterSamples, bool convertSparseToDense=true);
|
||||
|
||||
static void PrepaireZeroGradients(std::unordered_map<Parameter, NDArrayViewPtr>& gradientValues, MinibatchInfo& info);
|
||||
static void PrepaireZeroGradients(std::unordered_map<Parameter, NDArrayViewPtr>& gradientValues);
|
||||
void ConvertToOrdered(const std::unordered_map<Parameter, NDArrayViewPtr>& gradientValues, std::vector<std::pair<Parameter, NDArrayViewPtr>>& result, std::unordered_map<Parameter, NDArrayViewPtr>* convertedGradientValues = nullptr);
|
||||
|
||||
std::vector<std::pair<Parameter, NDArrayViewPtr>> m_gradientBuffer;
|
||||
|
|
|
@@ -2098,14 +2098,14 @@ namespace CNTK

        FunctionPtr classificationErrorComposite;
        if (axis == Axis(0))
            classificationErrorComposite = Minus(Constant::Scalar(1.0f), TransposeTimes(labelPlaceholder, Hardmax(predictionPlaceholder)));
            classificationErrorComposite = Minus(Constant::Scalar(prediction.GetDataType(), 1.0), TransposeTimes(labelPlaceholder, Hardmax(predictionPlaceholder)));
        else
        {
            auto axMax = ReduceMax(predictionPlaceholder, axis);
            auto pred = Equal(predictionPlaceholder, axMax);
            auto wrongPred = NotEqual(labelPlaceholder, pred);
            auto axErr = ReduceSum(wrongPred, axis);
            auto capErr = GreaterEqual(axErr, Constant::Scalar(1.0f));
            auto capErr = GreaterEqual(axErr, Constant::Scalar(prediction.GetDataType(), 1.0));
            classificationErrorComposite = ReduceMean(capErr, Axis::AllStaticAxes());
        }

@@ -2938,6 +2938,17 @@ namespace CNTK
            operands, Dictionary(), name), name);
    }

    FunctionPtr Cast(const Variable& nodeInput, DataType outputType, const std::wstring& name)
    {
        std::vector<Variable> operands = { nodeInput };
        Dictionary additionalAttributes;
        additionalAttributes.Add(
            PrimitiveFunction::AttributeNameNewDataType, static_cast<int>(outputType));
        return AsComposite(MakeSharedObject<PrimitiveFunction>(
            PrimitiveOpType::Cast,
            operands, std::move(additionalAttributes), name), name);
    }

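// Usage sketch, not part of this commit: how the new Cast op is meant to bridge an fp16 model body
// and a 32-bit criterion, mirroring the FP16 note that loss accumulation must stay in float. This
// assumes the public V2 C++ API (CNTKLibrary.h); the ReduceSum/Axis::AllAxes overloads shown here
// are only one possible way to build a dummy fp32 criterion and are illustrative, not prescriptive.
#include "CNTKLibrary.h"

CNTK::FunctionPtr Fp32CriterionOverFp16Model(const CNTK::FunctionPtr& fp16Model)
{
    using namespace CNTK;
    // Cast the fp16 network output up to float so accumulation in the loss does not overflow.
    auto outputFp32 = Cast(fp16Model->Output(), DataType::Float, L"castToFloat");
    // Any fp32-only criterion can now be attached to outputFp32.
    return ReduceSum(outputFp32, Axis::AllAxes(), L"dummyFp32Loss");
}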
namespace Internal
|
||||
{
|
||||
FunctionPtr IsWithin(const Variable& operand, int offset, const std::wstring& name)
|
||||
|
|
|
@ -10,7 +10,7 @@
|
|||
#include "Serialization.h"
|
||||
|
||||
#define DISPATCH_TO_TYPED_UPDATE_FUNCTION \
|
||||
switch (smoothedGradientValue->GetDataType()) \
|
||||
switch (gradientValue->GetDataType()) \
|
||||
{ \
|
||||
case DataType::Float: \
|
||||
Update<float>(parameter, gradientValue, smoothedGradientValue, trainingSampleCount); \
|
||||
|
@ -18,6 +18,9 @@
|
|||
case DataType::Double: \
|
||||
Update<double>(parameter, gradientValue, smoothedGradientValue, trainingSampleCount); \
|
||||
break; \
|
||||
case DataType::Float16: \
|
||||
Update<half>(parameter, gradientValue, smoothedGradientValue, trainingSampleCount); \
|
||||
break; \
|
||||
default: \
|
||||
NOT_IMPLEMENTED; \
|
||||
}
|
||||
|
@ -203,10 +206,10 @@ namespace CNTK
|
|||
|
||||
LearnerBase::LearnerBase(const vector<Parameter>& parameters,
|
||||
const LearningRateSchedule& learningRateSchedule,
|
||||
AdditionalLearningOptions additionalOptions,
|
||||
bool allocateSmoothGradients /* = true */)
|
||||
AdditionalLearningOptions additionalOptions)
|
||||
: Learner(parameters, learningRateSchedule, additionalOptions),
|
||||
m_noiseInjectionSeed(Internal::GenerateRandomSeed())
|
||||
m_noiseInjectionSeed(Internal::GenerateRandomSeed()),
|
||||
m_masterParameterUpdated(false)
|
||||
{
|
||||
if (parameters.empty())
|
||||
InvalidArgument("The parameters list specified to a Learner must not be empty.");
|
||||
|
@ -215,22 +218,40 @@ namespace CNTK
|
|||
|
||||
if (uniqueParameters.size() != parameters.size())
|
||||
InvalidArgument("Learner's parameters list must not contain duplicates.");
|
||||
}
|
||||
|
||||
if (allocateSmoothGradients)
|
||||
void LearnerBase::AllocateSmoothedGradients(const std::vector<Parameter>& parameters, size_t factor, size_t fp16Factor)
|
||||
{
|
||||
for (const auto& parameter : parameters)
|
||||
{
|
||||
NDArrayViewPtr view = AllocateNDArrayView(parameter, parameter.Shape());
|
||||
NDArrayViewPtr view = AllocateSmoothedGradientFor(parameter, factor, fp16Factor);
|
||||
m_smoothedGradientValues.emplace(parameter, view);
|
||||
}
|
||||
}
|
||||
|
||||
    /*static*/ NDArrayViewPtr LearnerBase::AllocateSmoothedGradientFor(const Parameter& parameter, size_t factor, size_t fp16Factor)
    {
        // float16 parameter needs extra buffer for master-copy of weights
        if (parameter.GetDataType() == DataType::Float16) factor += fp16Factor;

        const auto paramShape = GetMatrixShape(parameter);
        NDShape shape;
        if (factor == 0)
        {
            shape = NDShape({});
        }
        else
        {
            if (factor == 1)
                shape = parameter.Shape();
            else
                shape = NDShape({ paramShape[0], factor * paramShape[1] });
        }

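// Sketch of the sizing rule above: the smoothed-gradient buffer holds `factor` copies of the
// parameter matrix side by side, and fp16 parameters get `fp16Factor` extra copies to hold the fp32
// master weights. Plain row/column integers stand in for NDShape here; this is an illustration of
// the arithmetic, not the CNTK implementation.
#include <cstdio>
#include <utility>

std::pair<size_t, size_t> SmoothedGradientShape(size_t rows, size_t cols, bool isFp16,
                                                size_t factor, size_t fp16Factor)
{
    if (isFp16)
        factor += fp16Factor;            // extra column blocks for the fp32 master copy
    if (factor == 0)
        return { 0, 0 };                 // dummy buffer (plain SGD keeps no per-parameter state)
    if (factor == 1)
        return { rows, cols };           // same shape as the parameter
    return { rows, factor * cols };      // `factor` column blocks side by side
}

int main()
{
    auto s = SmoothedGradientShape(1024, 256, /*isFp16=*/true, /*factor=*/2, /*fp16Factor=*/1);
    std::printf("%zu x %zu\n", s.first, s.second);   // 1024 x 768 -> [momentum | grad32 | master]
    return 0;
}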
/*static*/ NDArrayViewPtr LearnerBase::AllocateNDArrayView(const Parameter& parameter, const NDShape& shape)
|
||||
if (parameter.GetDataType() != DataType::Double)
|
||||
{
|
||||
if (parameter.GetDataType() == DataType::Float)
|
||||
{
|
||||
return MakeSharedObject<NDArrayView>(float(0.0), shape, parameter.Value()->Device());
|
||||
// float and half both have smoothed gradient in float
|
||||
return MakeSharedObject<NDArrayView>(0.0f, shape, parameter.Value()->Device());
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -245,11 +266,16 @@ namespace CNTK
|
|||
auto matrix = GetMatrix<float>(parameter.Value());
|
||||
return{ matrix->GetNumRows(), matrix->GetNumCols() };
|
||||
}
|
||||
else
|
||||
else if (parameter.GetDataType() == DataType::Double)
|
||||
{
|
||||
auto matrix = GetMatrix<double>(parameter.Value());
|
||||
return{ matrix->GetNumRows(), matrix->GetNumCols() };
|
||||
}
|
||||
else
|
||||
{
|
||||
auto matrix = GetMatrix<half>(parameter.Value());
|
||||
return{ matrix->GetNumRows(), matrix->GetNumCols() };
|
||||
}
|
||||
}
|
||||
|
||||
/*virtual*/ bool LearnerBase::Update(unordered_map<Parameter, NDArrayViewPtr>& gradientValues, size_t trainingSampleCount, bool sweepEnd) /*override*/
|
||||
|
@ -267,10 +293,22 @@ namespace CNTK
|
|||
|
||||
UpdateOnMinibatch(trainingSampleCount);
|
||||
|
||||
bool needUpdateMasterParameter = !m_masterParameterUpdated;
|
||||
for (const auto& parameter : Parameters())
|
||||
{
|
||||
const auto& smoothedGradientValue = m_smoothedGradientValues.at(parameter);
|
||||
const auto& gradientValue = gradientValues.at(parameter);
|
||||
|
||||
if (needUpdateMasterParameter && parameter.GetDataType() == DataType::Float16)
|
||||
{
|
||||
// convert fp16 parameter to fp32
|
||||
auto sg = smoothedGradientValue->GetWritableMatrix<float>();
|
||||
auto pv16 = parameter.Value()->GetWritableMatrix<half>();
|
||||
size_t factor = sg->GetNumCols() / pv16->GetNumCols();
|
||||
auto pv = sg->ColumnSlice(pv16->GetNumCols() * (factor - 1), pv16->GetNumCols());
|
||||
pv.CastAssignValuesOf(*pv16);
|
||||
}
|
||||
|
||||
// TODO: make this a runtime parameter.
|
||||
#if DUMPOUTPUT
|
||||
LOGPRINTF(stderr, "Update_%ls\n", parameter.Uid().c_str());
|
||||
|
@ -303,6 +341,11 @@ namespace CNTK
|
|||
LogicError("%ls has NaNs in parameter values after parameter update.", parameter.Uid().c_str());
|
||||
#endif
|
||||
}
|
||||
|
||||
if (needUpdateMasterParameter)
|
||||
{
|
||||
m_masterParameterUpdated = true;
|
||||
}
|
||||
m_sampleCount += trainingSampleCount;
|
||||
m_minibatchCount++;
|
||||
if (sweepEnd)
|
||||
|
@ -319,7 +362,19 @@ namespace CNTK
|
|||
{
|
||||
const auto& parameterValue = parameter.Value();
|
||||
PreProcess<ElementType>(parameterValue, gradientValue, trainingSampleCount);
|
||||
|
||||
Update(parameter, gradientValue, smoothedGradientValue, trainingSampleCount);
|
||||
|
||||
if (parameter.GetDataType() == DataType::Float16)
|
||||
{
|
||||
// convert fp32 parameter to fp16 after update
|
||||
auto sg = smoothedGradientValue->GetWritableMatrix<float>();
|
||||
auto pv16 = parameterValue->GetWritableMatrix<half>();
|
||||
size_t factor = sg->GetNumCols() / pv16->GetNumCols();
|
||||
auto pv = sg->ColumnSlice(pv16->GetNumCols() * (factor - 1), pv16->GetNumCols());
|
||||
pv16->CastAssignValuesOf(pv);
|
||||
}
|
||||
|
||||
PostProcess<ElementType>(parameter, gradientValue, trainingSampleCount);
|
||||
|
||||
auto paramRef = parameter;
|
||||
|
@@ -344,6 +399,7 @@ namespace CNTK
        checkpoint[sweepCountKey] = m_sweepCount;
        checkpoint[learningRateScheduleKey] = m_learningRateSchedule.Serialize();
        checkpoint[noiseInjectionSeedKey] = m_noiseInjectionSeed;
        checkpoint[masterParameterUpdatedKey] = m_masterParameterUpdated;

        // TODO: should we also save momentum schedule into the checkpoint?
        // If that is the case, need to be able to override this method in subclasses.

@@ -390,6 +446,11 @@ namespace CNTK
            m_noiseInjectionSeed = checkpoint[noiseInjectionSeedKey].Value<size_t>();
        }

        if (checkpoint.Contains(masterParameterUpdatedKey))
        {
            m_masterParameterUpdated = checkpoint[masterParameterUpdatedKey].Value<bool>();
        }

        // TODO: which learning rate schedule should take precedence here?
        // The one given at construction time or the one loaded from a checkpoint?
        m_learningRateSchedule = TrainingParameterSchedule<double>::Deserialize(checkpoint[learningRateScheduleKey].Value<Dictionary>());

@@ -464,23 +525,6 @@ namespace CNTK
        }
    }

    LearnerSGD::LearnerSGD(const std::vector<Parameter>& parameters,
                           const LearningRateSchedule& learningRateSchedule,
                           AdditionalLearningOptions additionalOptions,
                           bool allocateSmoothGradients)
        : LearnerBase(parameters, learningRateSchedule, additionalOptions, allocateSmoothGradients)
    {
        if (!allocateSmoothGradients)
        {
            // the vanilla sgd does not need the smooth gradients per se,
            // insert dummy nd views instead.
            for (const auto& parameter : parameters)
            {
                m_smoothedGradientValues.emplace(parameter, AllocateNDArrayView(parameter, {}));
            }
        }
    }

    /*virtual*/ void LearnerSGD::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue,
                                        const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) /*override*/
    {

@@ -514,7 +558,20 @@ namespace CNTK
    {
        ReportTrainingParameterValue(m_momentumSchedule, L"Momentum");

        DISPATCH_TO_TYPED_UPDATE_FUNCTION;
        switch (gradientValue->GetDataType())
        {
        case DataType::Float:
            Update<float>(parameter, gradientValue, smoothedGradientValue, trainingSampleCount);
            break;
        case DataType::Double:
            Update<double>(parameter, gradientValue, smoothedGradientValue, trainingSampleCount);
            break;
        case DataType::Float16:
            UpdateHalf(parameter, gradientValue, smoothedGradientValue, trainingSampleCount);
            break;
        default:
            NOT_IMPLEMENTED;
        }
    }

    template <typename ElementType>
@@ -555,10 +612,42 @@ namespace CNTK
                                          learningRate, momentum, unitGainFactor);
    }

    void LearnerMomentumSGD::UpdateHalf(const Parameter& parameter, const NDArrayViewPtr& gradientValue,
                                        const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const
    {
        const auto& compoundMatrix = GetWritableMatrix<float>(smoothedGradientValue);
        const auto& gradientMatrix = GetWritableMatrix<half>(gradientValue);
        auto smoothedGradientMatrix = compoundMatrix->ColumnSlice(0, gradientMatrix->GetNumCols());
        auto tempGradientMatrix = compoundMatrix->ColumnSlice(gradientMatrix->GetNumCols(), gradientMatrix->GetNumCols());
        auto parameterMatrix = compoundMatrix->ColumnSlice(2 * gradientMatrix->GetNumCols(), gradientMatrix->GetNumCols());

        tempGradientMatrix.CastAssignValuesOf(*gradientMatrix);

        const auto learningRate = float(LearningRate(trainingSampleCount));
        const auto momentum = float(MomentumValueForMB(trainingSampleCount));
        const auto unitGainFactor = UnitGainFactor<float>(trainingSampleCount);

        parameterMatrix.MomentumSGDUpdate(tempGradientMatrix, smoothedGradientMatrix,
                                          learningRate, momentum, unitGainFactor);
    }
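UpdateHalf relies on a layout convention rather than separate allocations: for an FP16 parameter with N columns, the FP32 smoothed-gradient matrix is allocated wide enough to hold three N-column slices, and the ColumnSlice calls above carve it up. A sketch of that layout, inferred from the slicing in this function (an assumption drawn from the code, not an official diagram):

    // compoundMatrix (float), 3*N columns for an FP16 parameter with N columns:
    //   columns [0,     N)   momentum / smoothed-gradient accumulator
    //   columns [N,   2*N)   scratch slot: FP16 gradient cast up to FP32
    //   columns [2*N, 3*N)   FP32 master copy of the FP16 parameter
    // MomentumSGDUpdate then runs entirely in FP32 on these three slices, and
    // the caller casts the master slice back to FP16 (see LearnerBase::Update).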
|
||||
/*virtual*/ void LearnerNesterov::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue,
|
||||
const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) /*override*/
|
||||
{
|
||||
DISPATCH_TO_TYPED_UPDATE_FUNCTION;
|
||||
switch (gradientValue->GetDataType())
|
||||
{
|
||||
case DataType::Float:
|
||||
Update<float>(parameter, gradientValue, smoothedGradientValue, trainingSampleCount);
|
||||
break;
|
||||
case DataType::Double:
|
||||
Update<double>(parameter, gradientValue, smoothedGradientValue, trainingSampleCount);
|
||||
break;
|
||||
case DataType::Float16:
|
||||
UpdateHalf(parameter, gradientValue, smoothedGradientValue, trainingSampleCount);
|
||||
break;
|
||||
default:
|
||||
NOT_IMPLEMENTED;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename ElementType>
|
||||
|
@ -575,11 +664,30 @@ namespace CNTK
|
|||
learningRate, momentum, unitGainFactor);
|
||||
}
|
||||
|
||||
void LearnerNesterov::UpdateHalf(const Parameter& parameter, const NDArrayViewPtr& gradientValue,
|
||||
const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const
|
||||
{
|
||||
const auto& compoundMatrix = GetWritableMatrix<float>(smoothedGradientValue);
|
||||
const auto& gradientMatrix = GetWritableMatrix<half>(gradientValue);
|
||||
auto smoothedGradientMatrix = compoundMatrix->ColumnSlice(0, gradientMatrix->GetNumCols());
|
||||
auto tempGradientMatrix = compoundMatrix->ColumnSlice(gradientMatrix->GetNumCols(), gradientMatrix->GetNumCols());
|
||||
auto parameterMatrix = compoundMatrix->ColumnSlice(2 * gradientMatrix->GetNumCols(), gradientMatrix->GetNumCols());
|
||||
|
||||
tempGradientMatrix.CastAssignValuesOf(*gradientMatrix);
|
||||
|
||||
const auto learningRate = float(LearningRate(trainingSampleCount));
|
||||
const auto momentum = float(MomentumValueForMB(trainingSampleCount));
|
||||
const auto unitGainFactor = UnitGainFactor<float>(trainingSampleCount);
|
||||
|
||||
parameterMatrix.NesterovAcceleratedMomentumSGDUpdate(tempGradientMatrix, smoothedGradientMatrix,
|
||||
learningRate, momentum, unitGainFactor);
|
||||
}
|
||||
|
||||
LearnerAdaGrad::LearnerAdaGrad(const std::vector<Parameter>& parameters,
|
||||
const LearningRateSchedule& learningRateSchedule,
|
||||
bool needAveMultiplier,
|
||||
AdditionalLearningOptions additionalOptions)
|
||||
: LearnerBase(parameters, learningRateSchedule, additionalOptions, /*allocateSmoothGradients*/ false),
|
||||
: LearnerBase(parameters, learningRateSchedule, additionalOptions),
|
||||
m_needAveMultiplier(needAveMultiplier)
|
||||
{
|
||||
for (const auto& parameter : parameters)
|
||||
|
@ -591,8 +699,7 @@ namespace CNTK
|
|||
factor = 2;
|
||||
}
|
||||
|
||||
const auto shape = GetMatrixShape(parameter);
|
||||
NDArrayViewPtr view = AllocateNDArrayView(parameter, { shape[0], factor * shape[1] });
|
||||
NDArrayViewPtr view = AllocateSmoothedGradientFor(parameter, factor);
|
||||
|
||||
m_smoothedGradientValues.emplace(parameter, view);
|
||||
}
|
||||
|
@ -621,21 +728,29 @@ namespace CNTK
|
|||
const LearningRateSchedule& learningRateSchedule,
|
||||
double rho, double epsilon,
|
||||
AdditionalLearningOptions additionalOptions)
|
||||
: LearnerBase(parameters, learningRateSchedule, additionalOptions, /*allocateSmoothGradients*/ false),
|
||||
: LearnerBase(parameters, learningRateSchedule, additionalOptions),
|
||||
m_rho(rho), m_epsilon(epsilon)
|
||||
{
|
||||
for (const auto& parameter : parameters)
|
||||
{
|
||||
const auto shape = GetMatrixShape(parameter);
|
||||
NDArrayViewPtr view = AllocateNDArrayView(parameter, { shape[0], 2 * shape[1] });
|
||||
m_smoothedGradientValues.emplace(parameter, view);
|
||||
}
|
||||
AllocateSmoothedGradients(parameters, 2);
|
||||
}
|
||||
|
||||
/*virtual*/ void LearnerAdaDelta::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue,
|
||||
const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) /*override*/
|
||||
{
|
||||
DISPATCH_TO_TYPED_UPDATE_FUNCTION;
|
||||
switch (gradientValue->GetDataType())
|
||||
{
|
||||
case DataType::Float:
|
||||
Update<float, float>(parameter, gradientValue, smoothedGradientValue, trainingSampleCount);
|
||||
break;
|
||||
case DataType::Double:
|
||||
Update<double, double>(parameter, gradientValue, smoothedGradientValue, trainingSampleCount);
|
||||
break;
|
||||
case DataType::Float16:
|
||||
Update<half, float>(parameter, gradientValue, smoothedGradientValue, trainingSampleCount);
|
||||
break;
|
||||
default:
|
||||
NOT_IMPLEMENTED;
|
||||
}
|
||||
}
|
||||
|
||||
// When the gradients are sparse, we update the corresponding internal buffers of adadelta in a sparse way
|
||||
|
@@ -645,11 +760,16 @@ namespace CNTK
    // TODO: consider exposing this somehow so that it is easy to test by setting it to small value.
    /* static */ const int LearnerAdaDelta::s_SyncInterval = 1 << 20;

    template <typename ElementType>
    template <typename GradType, typename AccumType>
    void LearnerAdaDelta::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue,
                                 const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount)
    {
        GET_WRITABLE_MATRICES
        const auto& gradientMatrix = GetWritableMatrix<GradType>(gradientValue);
        const auto& smoothedGradientMatrix = GetWritableMatrix<AccumType>(smoothedGradientValue);
        // parameter is accumulated to fp32 for fp16 gradient in the master copy (allocated in last part in smoothedGradient)
        auto parameterMatrix = (std::is_same<GradType, half>::value) ?
            smoothedGradientMatrix->ColumnSlice(smoothedGradientMatrix->GetNumCols() - gradientMatrix->GetNumCols(), gradientMatrix->GetNumCols()) :
            GetWritableMatrix<AccumType>(parameter.Value())->ColumnSlice(0, gradientMatrix->GetNumCols());

        const auto learningRate = LearningRate(trainingSampleCount);

@@ -685,14 +805,14 @@ namespace CNTK
            if (currentTimestamp >= LearnerAdaDelta::s_SyncInterval)
            {
                // Once in a while sync the state and reset the timestamps and current time to 0
                smoothedGradientMatrix->AdaDeltaFlushState(numCols, (ElementType)m_rho, timestamps, currentTimestamp);
                smoothedGradientMatrix->AdaDeltaFlushState(numCols, (AccumType)m_rho, timestamps, currentTimestamp);
                m_currentTime[parameter] = currentTimestamp = 0;
            }
            currentTimestamp += 1;
            m_currentTime[parameter] = currentTimestamp;
        }

        smoothedGradientMatrix->AdaDeltaUpdate(*gradientMatrix, *parameterMatrix, (ElementType)learningRate, (ElementType)m_rho, (ElementType)m_epsilon, timestamps, currentTimestamp);
        smoothedGradientMatrix->template AdaDeltaUpdate<GradType>(*gradientMatrix, parameterMatrix, (AccumType)learningRate, (AccumType)m_rho, (AccumType)m_epsilon, timestamps, currentTimestamp);
    }
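The split into GradType and AccumType above is what lets AdaDelta consume half-precision gradients while keeping its running averages (and, for FP16, the master parameters) in FP32. A sketch of the same idea on plain arrays, with the textbook AdaDelta recurrences written out; float/double stand in for half/float so the example compiles as standard C++, and this is the generic algorithm, not CNTK's AdaDeltaUpdate kernel:

    #include <cmath>
    #include <cstddef>
    #include <vector>

    // GradType:  precision of the incoming gradient (half in CNTK).
    // AccumType: precision of E[g^2], E[dx^2] and the parameters (float in CNTK).
    template <typename GradType, typename AccumType>
    void AdaDeltaStep(std::vector<AccumType>& param,
                      std::vector<AccumType>& avgSqGrad,   // E[g^2]
                      std::vector<AccumType>& avgSqDelta,  // E[dx^2]
                      const std::vector<GradType>& grad,
                      AccumType rho, AccumType epsilon)
    {
        for (std::size_t i = 0; i < param.size(); ++i)
        {
            const AccumType g = static_cast<AccumType>(grad[i]); // upcast once
            avgSqGrad[i] = rho * avgSqGrad[i] + (1 - rho) * g * g;
            const AccumType dx = -std::sqrt((avgSqDelta[i] + epsilon) /
                                            (avgSqGrad[i] + epsilon)) * g;
            avgSqDelta[i] = rho * avgSqDelta[i] + (1 - rho) * dx * dx;
            param[i] += dx;
        }
    }

    // e.g. AdaDeltaStep<float, double>(param, eg2, edx2, grad, 0.95, 1e-6);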
/*virtual*/ Dictionary LearnerAdaDelta::CreateCheckpoint() /*override*/
|
||||
|
@ -713,12 +833,15 @@ namespace CNTK
|
|||
const auto& smoothedGradientMatrix = GetWritableMatrix<float>(smoothedGradientValue);
|
||||
smoothedGradientMatrix->AdaDeltaFlushState(numCols, (float)m_rho, timestamps, currentTimestamp);
|
||||
}
|
||||
else
|
||||
else if (parameter.GetDataType() == CNTK::DataType::Double)
|
||||
{
|
||||
const auto numCols = GetMatrix<double>(parameter.Value())->GetNumCols();
|
||||
const auto& smoothedGradientMatrix = GetWritableMatrix<double>(smoothedGradientValue);
|
||||
smoothedGradientMatrix->AdaDeltaFlushState(numCols, (double)m_rho, timestamps, currentTimestamp);
|
||||
}
|
||||
else
|
||||
LogicError("Unexpected parameter data type");
|
||||
|
||||
m_currentTime[parameter] = 0;
|
||||
}
|
||||
return LearnerBase::CreateCheckpoint();
|
||||
|
@ -748,16 +871,10 @@ namespace CNTK
|
|||
const MomentumSchedule& varianceMomentumSchedule,
|
||||
AdditionalLearningOptions additionalOptions)
|
||||
: LearnerMomentumSGD(parameters, learningRateSchedule, momentumSchedule,
|
||||
unitGain, additionalOptions, /*allocateSmoothGradients*/ false),
|
||||
unitGain, additionalOptions, 2),
|
||||
m_varianceMomentumSchedule(varianceMomentumSchedule),
|
||||
m_smoothedCount(0.0)
|
||||
{
|
||||
for (const auto& parameter : parameters)
|
||||
{
|
||||
const auto shape = GetMatrixShape(parameter);
|
||||
NDArrayViewPtr view = AllocateNDArrayView(parameter, { shape[0], 2 * shape[1] });
|
||||
m_smoothedGradientValues.emplace(parameter, view);
|
||||
}
|
||||
}
|
||||
|
||||
/*virtual*/ Dictionary LearnerFSAdaGrad::CreateCheckpoint() /*override*/
|
||||
|
@ -823,7 +940,7 @@ namespace CNTK
|
|||
bool adamax,
|
||||
AdditionalLearningOptions additionalOptions)
|
||||
: LearnerMomentumSGD(parameters, learningRateSchedule, momentumSchedule,
|
||||
unitGain, additionalOptions, /*allocateSmoothGradients*/ false),
|
||||
unitGain, additionalOptions, 2),
|
||||
m_varianceMomentumSchedule(varianceMomentumSchedule), m_epsilon(epsilon),
|
||||
m_adamax(adamax)
|
||||
{
|
||||
|
@ -833,12 +950,8 @@ namespace CNTK
|
|||
InvalidArgument("Epsilon should be non-negative. You are trying to set it to %g.", m_epsilon);
|
||||
}
|
||||
|
||||
for (const auto& parameter : parameters)
|
||||
{
|
||||
const auto shape = GetMatrixShape(parameter);
|
||||
NDArrayViewPtr view = AllocateNDArrayView(parameter, {shape[0], 2 * shape[1]});
|
||||
m_smoothedGradientValues.emplace(parameter, view);
|
||||
}
|
||||
AllocateSmoothedGradients(parameters, 2);
|
||||
|
||||
m_smoothedCount = 0.0;
|
||||
}
|
||||
|
||||
|
@ -893,7 +1006,7 @@ namespace CNTK
|
|||
double gamma, double inc, double dec, double max, double min,
|
||||
bool needAveMultiplier,
|
||||
AdditionalLearningOptions additionalOptions)
|
||||
: LearnerBase(parameters, learningRateSchedule, additionalOptions, /*allocateSmoothGradients*/ false),
|
||||
: LearnerBase(parameters, learningRateSchedule, additionalOptions),
|
||||
m_gamma(gamma), m_inc(inc), m_dec(dec), m_max(max), m_min(min), m_needAveMultiplier(needAveMultiplier)
|
||||
{
|
||||
// validation of learner settings
|
||||
|
@ -922,7 +1035,7 @@ namespace CNTK
|
|||
}
|
||||
|
||||
const auto shape = GetMatrixShape(parameter);
|
||||
NDArrayViewPtr view = AllocateNDArrayView(parameter, { shape[0], factor * shape[1] });
|
||||
NDArrayViewPtr view = AllocateSmoothedGradientFor(parameter, factor);
|
||||
|
||||
m_smoothedGradientValues.emplace(parameter, view);
|
||||
}
|
||||
|
@ -996,7 +1109,7 @@ namespace CNTK
|
|||
bool unitGain,
|
||||
AdditionalLearningOptions additionalOptions /*= AdditionalLearningOptions()*/)
|
||||
{
|
||||
return MakeSharedObject<LearnerMomentumSGD>(parameters, learningRateSchedule, momentumSchedule, unitGain, additionalOptions);
|
||||
return MakeSharedObject<LearnerMomentumSGD>(parameters, learningRateSchedule, momentumSchedule, unitGain, additionalOptions, 1);
|
||||
}
|
||||
|
||||
LearnerPtr NesterovLearner(const vector<Parameter>& parameters,
|
||||
|
@ -1059,7 +1172,7 @@ namespace CNTK
|
|||
|
||||
|
||||
LearnerUniversal::LearnerUniversal(const std::vector<Parameter>& parameters, const ParameterUpdateFunctor& func)
|
||||
: LearnerBase(parameters, LearningRateSchedule(1.0), AdditionalLearningOptions(), /*allocateSmoothGradients*/ false)
|
||||
: LearnerBase(parameters, LearningRateSchedule(1.0), AdditionalLearningOptions())
|
||||
{
|
||||
std::vector<Variable> gradients;
|
||||
std::vector<FunctionPtr> functions;
|
||||
|
@ -1083,7 +1196,7 @@ namespace CNTK
|
|||
}
|
||||
|
||||
LearnerUniversal::LearnerUniversal(const std::vector<Parameter>& parameters, const std::vector<Variable>& gradients, FunctionPtr updateFunc)
|
||||
: LearnerBase(parameters, LearningRateSchedule(1.0), AdditionalLearningOptions(), /*allocateSmoothGradients*/ false)
|
||||
: LearnerBase(parameters, LearningRateSchedule(1.0), AdditionalLearningOptions())
|
||||
{
|
||||
ValidateInput(parameters, gradients, updateFunc);
|
||||
}
|
||||
|
@ -1107,7 +1220,7 @@ namespace CNTK
|
|||
fprintf(stderr, "WARNING: Update function does not contain the gradient for parameter %ls in its computation\n", param.AsString().c_str());
|
||||
m_parameter_gradient_map.insert({parameters[i], gradients[i]});
|
||||
}
|
||||
AllocateDummySmoothedGradients(parameters);
|
||||
AllocateSmoothedGradients(parameters, 0);
|
||||
m_update_func = updateFunc;
|
||||
}
|
||||
|
||||
|
|
|
@ -30,14 +30,11 @@ namespace CNTK
|
|||
virtual void ResetSmoothedGradients() override;
|
||||
|
||||
protected:
|
||||
// allocateSmoothGradients flag specifies whether NDArrayViews for smoothed gradients can be allocated
|
||||
// in the base class constructor (in which case they are allocated with the shapes identical to the shapes of
|
||||
// the corresponding parameters) or if the allocation should be deferred to the subclass constructor (which
|
||||
// performs allocation that is specific to the particular learner, see FSAdaGrad and RMSProp).
|
||||
LearnerBase(const std::vector<Parameter>& parameters,
|
||||
const LearningRateSchedule& learningRateSchedule,
|
||||
AdditionalLearningOptions additionalOptions,
|
||||
bool allocateSmoothGradients = true);
|
||||
AdditionalLearningOptions additionalOptions);
|
||||
|
||||
void AllocateSmoothedGradients(const std::vector<Parameter>& parameters, size_t factor, size_t fp16Factor = 1);
|
||||
|
||||
virtual void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) = 0;
|
||||
|
||||
|
@ -82,6 +79,8 @@ namespace CNTK
|
|||
|
||||
std::unordered_map<Parameter, NDArrayViewPtr> m_smoothedGradientValues;
|
||||
|
||||
bool m_masterParameterUpdated; // whether the master copy of parameters are updated
|
||||
|
||||
mutable size_t m_noiseInjectionSeed;
|
||||
|
||||
// The following four static protected methods expose private methods of NDArrayView class
|
||||
|
@ -113,7 +112,7 @@ namespace CNTK
|
|||
|
||||
// Returns an NDArrayView with the required shape, with the same data type as parameter value
|
||||
// and allocated on the same device.
|
||||
static NDArrayViewPtr AllocateNDArrayView(const Parameter& parameter, const NDShape& shape);
|
||||
static NDArrayViewPtr AllocateSmoothedGradientFor(const Parameter& parameter, size_t factor, size_t fp16Factor = 1);
|
||||
|
||||
// Retrieves the shape of the matrix corresponding to the parameter value.
|
||||
static NDShape GetMatrixShape(const Parameter& parameter);
|
||||
|
@ -142,8 +141,11 @@ namespace CNTK
|
|||
public:
|
||||
LearnerSGD(const std::vector<Parameter>& parameters,
|
||||
const LearningRateSchedule& learningRateSchedule,
|
||||
AdditionalLearningOptions additionalOptions,
|
||||
bool allocateSmoothGradients = false);
|
||||
AdditionalLearningOptions additionalOptions)
|
||||
: LearnerBase(parameters, learningRateSchedule, additionalOptions)
|
||||
{
|
||||
AllocateSmoothedGradients(parameters, 0);
|
||||
}
|
||||
|
||||
protected:
|
||||
|
||||
|
@ -162,11 +164,13 @@ namespace CNTK
|
|||
const MomentumSchedule& momentumSchedule,
|
||||
bool unitGain,
|
||||
AdditionalLearningOptions additionalOptions,
|
||||
bool allocateSmoothGradients = true)
|
||||
: LearnerBase(parameters, learningRateSchedule, additionalOptions, allocateSmoothGradients),
|
||||
size_t smoothGradientFactor)
|
||||
: LearnerBase(parameters, learningRateSchedule, additionalOptions),
|
||||
m_momentumSchedule(momentumSchedule),
|
||||
m_unitGain(unitGain)
|
||||
{ }
|
||||
{
|
||||
AllocateSmoothedGradients(parameters, smoothGradientFactor, 2);
|
||||
}
|
||||
|
||||
// returns current per-minibatch momentum value.
|
||||
virtual double MomentumValueForMB(size_t minibatchSize) const
|
||||
|
@ -177,9 +181,11 @@ namespace CNTK
|
|||
protected:
|
||||
virtual void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) override;
|
||||
|
||||
template <typename ElementType>
|
||||
template <typename ElemType>
|
||||
void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const;
|
||||
|
||||
void UpdateHalf(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const;
|
||||
|
||||
// returns current per-minibatch momentum value from the provided schedule.
|
||||
double MomentumValueForMB(const MomentumSchedule& schedule, size_t minibatchSize) const;
|
||||
|
||||
|
@ -216,7 +222,7 @@ namespace CNTK
|
|||
const MomentumSchedule& momentumSchedule,
|
||||
bool unitGain,
|
||||
AdditionalLearningOptions additionalOptions)
|
||||
: LearnerMomentumSGD(parameters, learningRateSchedule, momentumSchedule, unitGain, additionalOptions, /*allocateSmoothGradients*/ true)
|
||||
: LearnerMomentumSGD(parameters, learningRateSchedule, momentumSchedule, unitGain, additionalOptions, 1)
|
||||
{}
|
||||
|
||||
protected:
|
||||
|
@ -224,6 +230,7 @@ namespace CNTK
|
|||
|
||||
template <typename ElementType>
|
||||
void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const;
|
||||
void UpdateHalf(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const;
|
||||
};
|
||||
|
||||
class LearnerAdaGrad : public LearnerBase
|
||||
|
@ -268,7 +275,7 @@ namespace CNTK
|
|||
|
||||
virtual void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) override;
|
||||
|
||||
template <typename ElementType>
|
||||
template <typename GradType, typename AccumType>
|
||||
void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount);
|
||||
|
||||
virtual Dictionary CreateCheckpoint() override;
|
||||
|
@ -404,14 +411,6 @@ namespace CNTK
|
|||
virtual bool Update(std::unordered_map<Parameter, NDArrayViewPtr>& gradientValues, size_t trainingSampleCount, bool sweepEnd) override;
|
||||
|
||||
private:
|
||||
void AllocateDummySmoothedGradients(const std::vector<Parameter>& parameters)
|
||||
{
|
||||
for (const auto& parameter : parameters)
|
||||
{
|
||||
m_smoothedGradientValues.emplace(parameter, AllocateNDArrayView(parameter, {}));
|
||||
}
|
||||
}
|
||||
|
||||
void ValidateInput(const std::vector<Parameter>& parameters, const std::vector<Variable>& gradients, FunctionPtr updateFunc);
|
||||
|
||||
|
||||
|
|
|
@@ -17,8 +17,20 @@ using namespace Microsoft::MSR::CNTK;

namespace CNTK
{
    template <typename ElementType>
    static TensorView<ElementType>* AllocateTensorView(const NDShape& viewShape,
    template<typename ElemType>
    inline ElemType quiet_NaN()
    {
        return std::numeric_limits<ElemType>::quiet_NaN();
    }

    template<>
    inline float16 quiet_NaN<float16>()
    {
        return float16(std::numeric_limits<float>::quiet_NaN());
    }

    template <typename V1ElemType>
    static TensorView<V1ElemType>* AllocateTensorView(const NDShape& viewShape,
                                                      const DeviceDescriptor& device,
                                                      void* dataBuffer,
                                                      size_t bufferSizeInBytes)
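The quiet_NaN helper above exists because float16 is a wrapper class, not a built-in type: the unspecialized std::numeric_limits<float16>::quiet_NaN() would return a default-constructed value (zero) rather than a NaN, so the float16 case is routed through a float NaN instead. A minimal sketch of the same dispatch with a stand-in type (MyHalf is hypothetical, used only to keep the example self-contained):

    #include <cassert>
    #include <cmath>
    #include <limits>

    struct MyHalf { float v; MyHalf() : v(0) {} explicit MyHalf(float f) : v(f) {} };

    template <typename T>
    T quiet_NaN() { return std::numeric_limits<T>::quiet_NaN(); }

    template <>
    MyHalf quiet_NaN<MyHalf>() { return MyHalf(std::numeric_limits<float>::quiet_NaN()); }

    int main()
    {
        assert(std::isnan(quiet_NaN<float>()));     // generic path is fine
        assert(std::isnan(quiet_NaN<MyHalf>().v));  // needs the specialization:
        // std::numeric_limits<MyHalf>::quiet_NaN() would just be MyHalf().
        return 0;
    }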
@ -26,13 +38,13 @@ namespace CNTK
|
|||
if (dataBuffer == nullptr)
|
||||
InvalidArgument("Cannot create a NDArrayView over a null data buffer.");
|
||||
|
||||
if (bufferSizeInBytes < (viewShape.TotalSize() * sizeof(ElementType)))
|
||||
if (bufferSizeInBytes < (viewShape.TotalSize() * sizeof(V1ElemType)))
|
||||
InvalidArgument("Size (%d) of the specified buffer for creating the NDArrayView is smaller than the specified view shape '%S'.",
|
||||
(int)bufferSizeInBytes, viewShape.AsString().c_str());
|
||||
|
||||
auto matrixDims = GetMatrixDimensions(viewShape);
|
||||
std::shared_ptr<Matrix<ElementType>> matrix = std::make_shared<Matrix<ElementType>>(matrixDims.first, matrixDims.second, (ElementType*)dataBuffer, AsCNTKImplDeviceId(device), matrixFlagDontOwnBuffer);
|
||||
return new TensorView<ElementType>(matrix, AsTensorViewShape(viewShape));
|
||||
std::shared_ptr<Matrix<V1ElemType>> matrix = std::make_shared<Matrix<V1ElemType>>(matrixDims.first, matrixDims.second, (V1ElemType*)dataBuffer, AsCNTKImplDeviceId(device), matrixFlagDontOwnBuffer);
|
||||
return new TensorView<V1ElemType>(matrix, AsTensorViewShape(viewShape));
|
||||
}
|
||||
|
||||
static void* AllocateTensorView(CNTK::DataType dataType,
|
||||
|
@ -47,26 +59,28 @@ namespace CNTK
|
|||
return AllocateTensorView<float>(viewShape, device, dataBuffer, bufferSizeInBytes);
|
||||
case DataType::Double:
|
||||
return AllocateTensorView<double>(viewShape, device, dataBuffer, bufferSizeInBytes);
|
||||
case DataType::Float16:
|
||||
return AllocateTensorView<half>(viewShape, device, dataBuffer, bufferSizeInBytes);
|
||||
default:
|
||||
LogicError("Unsupported DataType %s", DataTypeName(dataType));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename ElementType>
|
||||
static TensorView<ElementType>* AllocateTensorView(const NDShape& viewShape,
|
||||
template<typename V1ElemType>
|
||||
static TensorView<V1ElemType>* AllocateTensorView(const NDShape& viewShape,
|
||||
CNTK::StorageFormat storageType,
|
||||
const DeviceDescriptor& device,
|
||||
size_t numNonZeroValues = 0)
|
||||
{
|
||||
auto matrixDims = GetMatrixDimensions(viewShape);
|
||||
std::shared_ptr<Matrix<ElementType>> matrix = std::make_shared<Matrix<ElementType>>(matrixDims.first,
|
||||
std::shared_ptr<Matrix<V1ElemType>> matrix = std::make_shared<Matrix<V1ElemType>>(matrixDims.first,
|
||||
matrixDims.second,
|
||||
AsCNTKImplDeviceId(device),
|
||||
IsSparseStorageFormat(storageType) ? MatrixType::SPARSE : MatrixType::DENSE,
|
||||
AsCNTKImplMatrixFormat(storageType),
|
||||
numNonZeroValues);
|
||||
return new TensorView<ElementType>(matrix, AsTensorViewShape(viewShape));
|
||||
return new TensorView<V1ElemType>(matrix, AsTensorViewShape(viewShape));
|
||||
}
|
||||
|
||||
static void* AllocateTensorView(CNTK::DataType dataType,
|
||||
|
@ -81,6 +95,8 @@ namespace CNTK
|
|||
return AllocateTensorView<float>(viewShape, storageType, device, numNonZeroValues);
|
||||
case DataType::Double:
|
||||
return AllocateTensorView<double>(viewShape, storageType, device, numNonZeroValues);
|
||||
case DataType::Float16:
|
||||
return AllocateTensorView<half>(viewShape, storageType, device, numNonZeroValues);
|
||||
default:
|
||||
LogicError("Unsupported DataType %s", DataTypeName(dataType));
|
||||
break;
|
||||
|
@@ -92,17 +108,37 @@ namespace CNTK
    {
    }

    template <typename ElementType>
    NDArrayView::NDArrayView(const NDShape& viewShape, const SparseIndexType* colStarts, const SparseIndexType* rowIndices, const ElementType* nonZeroValues, size_t numNonZeroValues, const DeviceDescriptor& device, bool readOnly/* = false*/)
        : NDArrayView(AsDataType<ElementType>(), device, StorageFormat::SparseCSC, viewShape, false, AllocateTensorView<ElementType>(viewShape, StorageFormat::SparseCSC, device, numNonZeroValues))
    NDArrayView::NDArrayView(CNTK::DataType dataType, const NDShape& viewShape, const SparseIndexType* colStarts, const SparseIndexType* rowIndices, const void* nonZeroValues, size_t numNonZeroValues, const DeviceDescriptor& device, bool readOnly/* = false*/)
        : NDArrayView(dataType, device, StorageFormat::SparseCSC, viewShape, false, AllocateTensorView(dataType, StorageFormat::SparseCSC, viewShape, device, numNonZeroValues * DataTypeSize(dataType)))
    {
        if ((colStarts == nullptr) || (rowIndices == nullptr) || (nonZeroValues == nullptr) || (numNonZeroValues == 0) || (numNonZeroValues > viewShape.TotalSize()))
            InvalidArgument("Invalid sparse CSC format data specified for construction of NDArrayView with shape '%S'; "
                            "either one of the specified buffers is null or the count (%d) of non-zero values is invalid.",
                            viewShape.AsString().c_str(), (int)numNonZeroValues);

        auto sparseMatrix = GetWritableMatrix<ElementType>(1);
        sparseMatrix->SetMatrixFromCSCFormat(colStarts, rowIndices, nonZeroValues, numNonZeroValues, sparseMatrix->GetNumRows(), sparseMatrix->GetNumCols());
        switch (dataType)
        {
        case DataType::Float:
        {
            auto sparseMatrix = GetWritableMatrix<float>(1);
            sparseMatrix->SetMatrixFromCSCFormat(colStarts, rowIndices, (const float*)nonZeroValues, numNonZeroValues, sparseMatrix->GetNumRows(), sparseMatrix->GetNumCols());
            break;
        }
        case DataType::Double:
        {
            auto sparseMatrix = GetWritableMatrix<double>(1);
            sparseMatrix->SetMatrixFromCSCFormat(colStarts, rowIndices, (const double*)nonZeroValues, numNonZeroValues, sparseMatrix->GetNumRows(), sparseMatrix->GetNumCols());
            break;
        }
        case DataType::Float16:
        {
            auto sparseMatrix = GetWritableMatrix<half>(1);
            sparseMatrix->SetMatrixFromCSCFormat(colStarts, rowIndices, (const half*)nonZeroValues, numNonZeroValues, sparseMatrix->GetNumRows(), sparseMatrix->GetNumCols());
            break;
        }
        default:
            LogicError("Unsupported DataType %s", DataTypeName(dataType));
            break;
        }
        m_isReadOnly = readOnly;
    }
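For reference, the buffers this constructor expects follow the standard compressed sparse column (CSC) convention: values stored column by column, a row index per value, and numCols + 1 column offsets. A small worked example (assuming SparseIndexType is a 32-bit integer, as the CPU/GPU index-size checks later in this file suggest):

    #include <cstdint>
    #include <vector>

    // Dense 3x4 matrix:
    //   1 0 0 2
    //   0 3 0 0
    //   0 0 4 5
    std::vector<float>   nonZeroValues = { 1, 3, 4, 2, 5 }; // values, column by column
    std::vector<int32_t> rowIndices    = { 0, 1, 2, 0, 2 }; // row of each value
    std::vector<int32_t> colStarts     = { 0, 1, 2, 3, 5 }; // numCols + 1 offsets
    // numNonZeroValues = 5; the constructor reinterprets nonZeroValues as
    // float/double/half based on dataType and forwards everything to
    // SetMatrixFromCSCFormat.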
|
||||
|
@ -118,6 +154,9 @@ namespace CNTK
|
|||
case DataType::Double:
|
||||
delete GetTensorView<double>();
|
||||
break;
|
||||
case DataType::Float16:
|
||||
delete GetTensorView<half>();
|
||||
break;
|
||||
default:
|
||||
LogicError("Unsupported DataType %s", DataTypeName(m_dataType));
|
||||
break;
|
||||
|
@ -136,6 +175,8 @@ namespace CNTK
|
|||
{
|
||||
if (GetDataType() == DataType::Double)
|
||||
SetValue((double)value);
|
||||
else if (GetDataType() == DataType::Float16)
|
||||
SetValue((float16)value);
|
||||
else
|
||||
{
|
||||
if (IsSparse())
|
||||
|
@ -167,12 +208,25 @@ namespace CNTK
|
|||
auto currentMatrix = GetMatrix<double>();
|
||||
return currentMatrix->IsView();
|
||||
}
|
||||
case DataType::Float16:
|
||||
{
|
||||
auto currentMatrix = GetMatrix<half>();
|
||||
return currentMatrix->IsView();
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
template <typename ElementType>
|
||||
/*static*/ std::shared_ptr<Matrix<ElementType>> NDArrayView::GetMatrixImpl(const TensorView<ElementType>* tensorView, size_t rowColSplitPoint)
|
||||
void NDArrayView::SetValue(float16 value)
|
||||
{
|
||||
if (IsSparse())
|
||||
LogicError("NDArrayView::SetValue: Setting a NDArrayView contents to a scalar is only allowed for objects with dense storage format.");
|
||||
|
||||
GetWritableMatrix<half>()->SetValue(*reinterpret_cast<half*>(&value));
|
||||
}
|
||||
|
||||
template <typename V1ElemType>
|
||||
/*static*/ std::shared_ptr<Matrix<V1ElemType>> NDArrayView::GetMatrixImpl(const TensorView<V1ElemType>* tensorView, size_t rowColSplitPoint)
|
||||
{
|
||||
auto tensorShape = tensorView->GetShape();
|
||||
|
||||
|
@ -213,34 +267,66 @@ namespace CNTK
|
|||
return tensorView->Reshaped(tensorShape).AsMatrix();
|
||||
}
|
||||
|
||||
template <typename ElementType>
|
||||
std::shared_ptr<const Matrix<ElementType>> NDArrayView::GetMatrix(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/) const
|
||||
template <typename V1ElemType>
|
||||
std::shared_ptr<const Matrix<V1ElemType>> NDArrayView::GetMatrix(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/) const
|
||||
{
|
||||
return GetMatrixImpl<ElementType>(GetTensorView<ElementType>(), rowColSplitPoint);
|
||||
return GetMatrixImpl<V1ElemType>(GetTensorView<V1ElemType>(), rowColSplitPoint);
|
||||
}
|
||||
|
||||
template <typename ElementType>
|
||||
std::shared_ptr<Matrix<ElementType>> NDArrayView::GetWritableMatrix(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/)
|
||||
template <typename V1ElemType>
|
||||
std::shared_ptr<Matrix<V1ElemType>> NDArrayView::GetWritableMatrix(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/)
|
||||
{
|
||||
return GetMatrixImpl<ElementType>(GetWritableTensorView<ElementType>(), rowColSplitPoint);
|
||||
return GetMatrixImpl<V1ElemType>(GetWritableTensorView<V1ElemType>(), rowColSplitPoint);
|
||||
}
|
||||
|
||||
template <typename ElementType>
|
||||
const TensorView<ElementType>* NDArrayView::GetTensorView() const
|
||||
std::shared_ptr<const MatrixBase> NDArrayView::GetMatrixBase(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/) const
|
||||
{
|
||||
if (AsDataType<ElementType>() != m_dataType)
|
||||
LogicError("NDArrayView::GetTensorView: The specified ElementType %s does not match the DataType %s", typeid(ElementType).name(), DataTypeName(m_dataType));
|
||||
|
||||
return (const TensorView<ElementType>*)(m_tensorView.get());
|
||||
switch (m_dataType)
|
||||
{
|
||||
case DataType::Float:
|
||||
return GetMatrixImpl<float>(GetTensorView<float>(), rowColSplitPoint);
|
||||
case DataType::Double:
|
||||
return GetMatrixImpl<double>(GetTensorView<double>(), rowColSplitPoint);
|
||||
case DataType::Float16:
|
||||
return GetMatrixImpl<half>(GetTensorView<half>(), rowColSplitPoint);
|
||||
default:
|
||||
LogicError("Unknown m_dataType %d", (int)m_dataType);
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
template <typename ElementType>
|
||||
TensorView<ElementType>* NDArrayView::GetWritableTensorView()
|
||||
std::shared_ptr<MatrixBase> NDArrayView::GetWritableMatrixBase(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/)
|
||||
{
|
||||
switch (m_dataType)
|
||||
{
|
||||
case DataType::Float:
|
||||
return GetMatrixImpl<float>(GetWritableTensorView<float>(), rowColSplitPoint);
|
||||
case DataType::Double:
|
||||
return GetMatrixImpl<double>(GetWritableTensorView<double>(), rowColSplitPoint);
|
||||
case DataType::Float16:
|
||||
return GetMatrixImpl<half>(GetWritableTensorView<half>(), rowColSplitPoint);
|
||||
default:
|
||||
LogicError("Unknown m_dataType %d", (int)m_dataType);
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
template <typename V1ElemType>
|
||||
const TensorView<V1ElemType>* NDArrayView::GetTensorView() const
|
||||
{
|
||||
if (AsDataType<V1ElemType>() != m_dataType)
|
||||
LogicError("NDArrayView::GetTensorView: The specified ElementType %s does not match the DataType %s", typeid(V1ElemType).name(), DataTypeName(m_dataType));
|
||||
|
||||
return (const TensorView<V1ElemType>*)(m_tensorView.get());
|
||||
}
|
||||
|
||||
template <typename V1ElemType>
|
||||
TensorView<V1ElemType>* NDArrayView::GetWritableTensorView()
|
||||
{
|
||||
if (IsReadOnly())
|
||||
InvalidArgument("NDArrayView::GetWritableTensorView: Cannot get a writable TensorView from a read-only NDArrayView.");
|
||||
|
||||
return const_cast<TensorView<ElementType>*>(GetTensorView<ElementType>());
|
||||
return const_cast<TensorView<V1ElemType>*>(GetTensorView<V1ElemType>());
|
||||
}
|
||||
|
||||
NDArrayViewPtr NDArrayView::DeepClone(const DeviceDescriptor& device, bool readOnly/* = false*/) const
|
||||
|
@ -262,6 +348,13 @@ namespace CNTK
|
|||
newMatrix->AssignValuesOf(*thisMatrix);
|
||||
break;
|
||||
}
|
||||
case DataType::Float16:
|
||||
{
|
||||
auto newMatrix = newView->GetWritableMatrix<half>();
|
||||
auto thisMatrix = GetMatrix<half>();
|
||||
newMatrix->AssignValuesOf(*thisMatrix);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
LogicError("NDArrayView::DeepClone: Unsupported DataType %s", DataTypeName(m_dataType));
|
||||
break;
|
||||
|
@ -296,6 +389,13 @@ namespace CNTK
|
|||
destMatrix->AssignValuesOf(*sourceMatrix);
|
||||
break;
|
||||
}
|
||||
case DataType::Float16:
|
||||
{
|
||||
auto sourceMatrix = source.GetMatrix<half>();
|
||||
auto destMatrix = GetWritableMatrix<half>();
|
||||
destMatrix->AssignValuesOf(*sourceMatrix);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
LogicError("NDArrayView::CopyFrom: Unsupported DataType %s", DataTypeName(m_dataType));
|
||||
break;
|
||||
|
@ -313,6 +413,9 @@ namespace CNTK
|
|||
case DataType::Double:
|
||||
tensorView = new TensorView<double>(*(GetTensorView<double>()));
|
||||
break;
|
||||
case DataType::Float16:
|
||||
tensorView = new TensorView<half>(*(GetTensorView<half>()));
|
||||
break;
|
||||
default:
|
||||
LogicError("NDArrayView::Alias: Unsupported DataType %s", DataTypeName(m_dataType));
|
||||
break;
|
||||
|
@ -385,6 +488,19 @@ namespace CNTK
|
|||
tensorView = new TensorView<double>(slicedMatrixView, AsTensorViewShape(sliceViewShape));
|
||||
break;
|
||||
}
|
||||
case DataType::Float16:
|
||||
{
|
||||
auto currentMatrix = GetMatrix<half>();
|
||||
std::pair<size_t, size_t> currentMatrixDims = { currentMatrix->GetNumRows(), currentMatrix->GetNumCols() };
|
||||
std::shared_ptr<Matrix<half>> slicedMatrixView;
|
||||
if (sliceViewMatrixDims.first != currentMatrixDims.first)
|
||||
slicedMatrixView = make_shared<Matrix<half>>(currentMatrix->Reshaped(1, currentMatrix->GetNumElements()).ColumnSlice(flatBufferOffset, sliceViewShape.TotalSize()));
|
||||
else
|
||||
slicedMatrixView = make_shared<Matrix<half>>(currentMatrix->ColumnSlice(sliceMatrixColumnOffset, sliceViewMatrixDims.second));
|
||||
|
||||
tensorView = new TensorView<half>(slicedMatrixView, AsTensorViewShape(sliceViewShape));
|
||||
break;
|
||||
}
|
||||
default:
|
||||
LogicError("NDArrayView::SliceView: Unsupported DataType %s", DataTypeName(m_dataType));
|
||||
break;
|
||||
|
@ -412,6 +528,9 @@ namespace CNTK
|
|||
case DataType::Double:
|
||||
tensorView = new TensorView<double>(*(GetTensorView<double>()), newTensorShape);
|
||||
break;
|
||||
case DataType::Float16:
|
||||
tensorView = new TensorView<half>(*(GetTensorView<half>()), newTensorShape);
|
||||
break;
|
||||
default:
|
||||
LogicError("NDArrayView::AsShape: Unsupported DataType %s", DataTypeName(m_dataType));
|
||||
break;
|
||||
|
@@ -430,9 +549,21 @@ namespace CNTK
        return const_cast<ElementType*>(DataBuffer<ElementType>());
    }

    // TODO: This could actually be strided?
    template <typename ElementType>
    const ElementType* NDArrayView::DataBuffer() const
    {
        return const_cast<ElementType*>(_DataBuffer<ElementType, ElementType>());
    }

    template<>
    const float16* NDArrayView::DataBuffer<float16>() const
    {
        return const_cast<float16*>(_DataBuffer<float16, half>());
    }
// TODO: This could actually be strided?
|
||||
template <typename ElementType, typename V1ElemType>
|
||||
const ElementType* NDArrayView::_DataBuffer() const
|
||||
{
|
||||
if (AsDataType<ElementType>() != m_dataType)
|
||||
InvalidArgument("NDArrayView::DataBuffer: The specified ElementType '%s' does not match this NDArrayView's DataType '%s'.", typeid(ElementType).name(), DataTypeName(m_dataType));
|
||||
|
@ -441,13 +572,25 @@ namespace CNTK
|
|||
InvalidArgument("The stroage format of 'this' NDArrayView is sparse. Please use SparseDataBuffers().");
|
||||
|
||||
// First make sure that the underlying matrix is on the right device
|
||||
auto matrix = GetMatrix<ElementType>();
|
||||
auto matrix = GetMatrix<V1ElemType>();
|
||||
matrix->TransferToDeviceIfNotThere(AsCNTKImplDeviceId(m_device), true);
|
||||
return matrix->Data();
|
||||
return reinterpret_cast<ElementType*>(matrix->Data());
|
||||
}
|
||||
|
||||
template <typename ElementType>
|
||||
std::tuple<const ElementType *, const SparseIndexType*, const SparseIndexType*, size_t> NDArrayView::SparseCSCDataBuffers() const
|
||||
{
|
||||
return _SparseCSCDataBuffers<ElementType, ElementType>();
|
||||
}
|
||||
|
||||
template <>
|
||||
std::tuple<const float16 *, const SparseIndexType*, const SparseIndexType*, size_t> NDArrayView::SparseCSCDataBuffers<float16>() const
|
||||
{
|
||||
return _SparseCSCDataBuffers<float16, half>();
|
||||
}
|
||||
|
||||
template <typename ElementType, typename V1ElemType>
|
||||
std::tuple<const ElementType *, const SparseIndexType*, const SparseIndexType*, size_t> NDArrayView::_SparseCSCDataBuffers() const
|
||||
{
|
||||
if (AsDataType<ElementType>() != m_dataType)
|
||||
InvalidArgument("NDArrayView::SparseDataBuffers: The specified ElementType '%s' does not match this NDArrayView's DataType '%s'.", typeid(ElementType).name(), DataTypeName(m_dataType));
|
||||
|
@ -458,7 +601,7 @@ namespace CNTK
|
|||
if(GetStorageFormat() != StorageFormat::SparseCSC)
|
||||
RuntimeError("The SparseCSCDataBuffers() method only supports CSC sparse format.");
|
||||
|
||||
std::shared_ptr<const Matrix<ElementType>> matrix = GetMatrix<ElementType>();
|
||||
std::shared_ptr<const Matrix<V1ElemType>> matrix = GetMatrix<V1ElemType>();
|
||||
auto matrixDims = GetMatrixDimensions(Shape());
|
||||
if (matrix->GetNumRows() != matrixDims.first)
|
||||
LogicError("The number of rows of the underlying matrix does not match the shape.");
|
||||
|
@ -470,16 +613,16 @@ namespace CNTK
|
|||
RuntimeError("NDArrayView::SparseDataBuffers: The underlying matrix of 'this' NDArrayView is not in the CSC sparse format.");
|
||||
|
||||
size_t numNonZeroValues;
|
||||
ElementType* nonZeroValues;
|
||||
V1ElemType* nonZeroValues;
|
||||
SparseIndexType* colStarts;
|
||||
SparseIndexType* rowIndices;
|
||||
if (m_device.Type() == DeviceKind::CPU)
|
||||
{
|
||||
if (sizeof(CPUSPARSE_INDEX_TYPE) != sizeof(SparseIndexType))
|
||||
LogicError("Inconsistent data type for sparse index in 'this' Value and the underlying matrix on CPU.");
|
||||
std::shared_ptr<Microsoft::MSR::CNTK::CPUSparseMatrix<ElementType>> sparseMatrix = matrix->m_CPUSparseMatrix;
|
||||
std::shared_ptr<Microsoft::MSR::CNTK::CPUSparseMatrix<V1ElemType>> sparseMatrix = matrix->m_CPUSparseMatrix;
|
||||
numNonZeroValues = sparseMatrix->NzCount();
|
||||
nonZeroValues = static_cast<ElementType *>(sparseMatrix->NzValues());
|
||||
nonZeroValues = static_cast<V1ElemType *>(sparseMatrix->NzValues());
|
||||
colStarts = static_cast<SparseIndexType *>(sparseMatrix->ColLocation());
|
||||
rowIndices = static_cast<SparseIndexType *>(sparseMatrix->RowLocation());
|
||||
}
|
||||
|
@ -487,9 +630,9 @@ namespace CNTK
|
|||
{
|
||||
if (sizeof(GPUSPARSE_INDEX_TYPE) != sizeof(SparseIndexType))
|
||||
LogicError("Inconsistent data type for sparse index in 'this' Value and the underlying matrix on GPU.");
|
||||
std::shared_ptr<Microsoft::MSR::CNTK::GPUSparseMatrix<ElementType>> sparseMatrix = matrix->m_GPUSparseMatrix;
|
||||
std::shared_ptr<Microsoft::MSR::CNTK::GPUSparseMatrix<V1ElemType>> sparseMatrix = matrix->m_GPUSparseMatrix;
|
||||
numNonZeroValues = sparseMatrix->NzCount();
|
||||
nonZeroValues = static_cast<ElementType *>(sparseMatrix->NzValues());
|
||||
nonZeroValues = static_cast<V1ElemType *>(sparseMatrix->NzValues());
|
||||
colStarts = static_cast<SparseIndexType *>(sparseMatrix->ColLocation());
|
||||
rowIndices = static_cast<SparseIndexType *>(sparseMatrix->RowLocation());
|
||||
}
|
||||
|
@ -498,11 +641,23 @@ namespace CNTK
|
|||
RuntimeError("NDArrayView::SparseDataBuffers: The device %S is currently not supported.",DeviceKindName(m_device.Type()));
|
||||
}
|
||||
|
||||
return std::tuple<ElementType *, SparseIndexType *, SparseIndexType *, size_t>(nonZeroValues, colStarts, rowIndices, numNonZeroValues);
|
||||
return std::tuple<ElementType *, SparseIndexType *, SparseIndexType *, size_t>(reinterpret_cast<ElementType*>(nonZeroValues), colStarts, rowIndices, numNonZeroValues);
|
||||
}
|
||||
|
||||
template <typename ElementType>
|
||||
std::tuple<const void *, const SparseIndexType*, const SparseIndexType*, size_t, size_t, size_t> NDArrayView::SparseBlockColumnDataBuffers() const
|
||||
{
|
||||
return _SparseBlockColumnDataBuffers<ElementType, ElementType>();
|
||||
}
|
||||
|
||||
template <>
|
||||
std::tuple<const void *, const SparseIndexType*, const SparseIndexType*, size_t, size_t, size_t> NDArrayView::SparseBlockColumnDataBuffers<float16>() const
|
||||
{
|
||||
return _SparseBlockColumnDataBuffers<float16, half>();
|
||||
}
|
||||
|
||||
template <typename ElementType, typename V1ElemType>
|
||||
std::tuple<const void *, const SparseIndexType*, const SparseIndexType*, size_t, size_t, size_t> NDArrayView::_SparseBlockColumnDataBuffers() const
|
||||
{
|
||||
if (AsDataType<ElementType>() != m_dataType)
|
||||
InvalidArgument("NDArrayView::SparseBlockColumnDataBuffers: The specified ElementType '%s' does not match this NDArrayView's DataType '%s'.", typeid(ElementType).name(), DataTypeName(m_dataType));
|
||||
|
@ -513,23 +668,23 @@ namespace CNTK
|
|||
if (GetStorageFormat() != StorageFormat::SparseBlockCol)
|
||||
RuntimeError("The SparseBlockColumnDataBuffers() method only supports sparse block column format.");
|
||||
|
||||
std::shared_ptr<const Matrix<ElementType>> matrix = GetMatrix<ElementType>();
|
||||
std::shared_ptr<const Matrix<V1ElemType>> matrix = GetMatrix<V1ElemType>();
|
||||
|
||||
size_t numBlocks;
|
||||
size_t numRows;
|
||||
size_t numCols;
|
||||
ElementType* blockValues;
|
||||
V1ElemType* blockValues;
|
||||
SparseIndexType* blockId2Col;
|
||||
SparseIndexType* col2BlockId;
|
||||
if (m_device.Type() == DeviceKind::GPU)
|
||||
{
|
||||
if (sizeof(GPUSPARSE_INDEX_TYPE) != sizeof(SparseIndexType))
|
||||
LogicError("Inconsistent data type for sparse index in 'this' Value and the underlying matrix on GPU.");
|
||||
std::shared_ptr<Microsoft::MSR::CNTK::GPUSparseMatrix<ElementType>> sparseMatrix = matrix->m_GPUSparseMatrix;
|
||||
std::shared_ptr<Microsoft::MSR::CNTK::GPUSparseMatrix<V1ElemType>> sparseMatrix = matrix->m_GPUSparseMatrix;
|
||||
numBlocks = sparseMatrix->GetBlockSize();
|
||||
numRows = sparseMatrix->GetNumRows();
|
||||
numCols = sparseMatrix->GetNumCols();
|
||||
blockValues = static_cast<ElementType *>(sparseMatrix->NzValues());
|
||||
blockValues = static_cast<V1ElemType *>(sparseMatrix->NzValues());
|
||||
blockId2Col = static_cast<SparseIndexType *>(sparseMatrix->BlockId2ColOrRow());
|
||||
col2BlockId = static_cast<SparseIndexType *>(sparseMatrix->ColOrRow2BlockId());
|
||||
}
|
||||
|
@ -539,7 +694,7 @@ namespace CNTK
|
|||
RuntimeError("NDArrayView::SparseBlockColumnDataBuffers: The device %S is currently not supported.", DeviceKindName(m_device.Type()));
|
||||
}
|
||||
|
||||
return std::tuple<ElementType *, SparseIndexType *, SparseIndexType *, size_t, size_t, size_t>(blockValues, blockId2Col, col2BlockId, numBlocks, numRows, numCols);
|
||||
return std::tuple<ElementType *, SparseIndexType *, SparseIndexType *, size_t, size_t, size_t>(reinterpret_cast<ElementType*>(blockValues), blockId2Col, col2BlockId, numBlocks, numRows, numCols);
|
||||
}
|
||||
|
||||
void NDArrayView::AdjustSparseBlockColumn(const SparseIndexType* cpuCol2BlockId, size_t numBlocks, bool useBlockId2Col)
|
||||
|
@ -585,6 +740,13 @@ namespace CNTK
|
|||
matrix->CollapseDataLocation();
|
||||
break;
|
||||
}
|
||||
case DataType::Float16:
|
||||
{
|
||||
auto matrix = GetMatrix<half>();
|
||||
matrix->TransferFromDeviceToDevice(matrix->GetDeviceId(), AsCNTKImplDeviceId(device), /*isBeingMoved = */ true, /*emptyTransfer =*/ false, /*updatePreferredDevice =*/ true);
|
||||
matrix->CollapseDataLocation();
|
||||
break;
|
||||
}
|
||||
default:
|
||||
LogicError("NDArrayView::ChangeDevice: Unsupported DataType %s", DataTypeName(m_dataType));
|
||||
break;
|
||||
|
@ -594,33 +756,69 @@ namespace CNTK
|
|||
}
|
||||
|
||||
template <typename ElementType>
|
||||
/*static*/ NDArrayViewPtr NDArrayView::RandomNormal(const NDShape& shape, double mean, double stdDev, unsigned long seed, const DeviceDescriptor& device /*= DeviceDescriptor::UseDefaultDevice()*/)
|
||||
/*static*/ NDArrayViewPtr NDArrayView::RandomNormal(const NDShape& shape, double mean, double stdDev, unsigned long seed, const DeviceDescriptor& device)
|
||||
{
|
||||
return NDArrayView::_RandomNormal<ElementType, ElementType>(shape, mean, stdDev, seed, device);
|
||||
}
|
||||
|
||||
template <>
|
||||
/*static*/ NDArrayViewPtr NDArrayView::RandomNormal<float16>(const NDShape& shape, double mean, double stdDev, unsigned long seed, const DeviceDescriptor& device)
|
||||
{
|
||||
return NDArrayView::_RandomNormal<float16, half>(shape, mean, stdDev, seed, device);
|
||||
}
|
||||
|
||||
template <typename ElementType, typename V1ElemType>
|
||||
/*static*/ NDArrayViewPtr NDArrayView::_RandomNormal(const NDShape& shape, double mean, double stdDev, unsigned long seed, const DeviceDescriptor& device /*= DeviceDescriptor::UseDefaultDevice()*/)
|
||||
{
|
||||
auto matrixDims = GetMatrixDimensions(shape);
|
||||
auto randomNormalMatrix = std::make_shared<Matrix<ElementType>>(Matrix<ElementType>::RandomGaussian(matrixDims.first, matrixDims.second, AsCNTKImplDeviceId(device), (ElementType)mean, (ElementType)stdDev, seed));
|
||||
auto tensorView = new TensorView<ElementType>(randomNormalMatrix, AsTensorViewShape(shape));
|
||||
auto randomNormalMatrix = std::make_shared<Matrix<V1ElemType>>(Matrix<V1ElemType>::RandomGaussian(matrixDims.first, matrixDims.second, AsCNTKImplDeviceId(device), (V1ElemType)mean, (V1ElemType)stdDev, seed));
|
||||
auto tensorView = new TensorView<V1ElemType>(randomNormalMatrix, AsTensorViewShape(shape));
|
||||
|
||||
return MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), device, StorageFormat::Dense, shape, false, tensorView);
|
||||
}
|
||||
|
||||
template <typename ElementType>
|
||||
/*static*/ NDArrayViewPtr NDArrayView::RandomUniform(const NDShape& shape, double rangeBegin, double rangeEnd, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::UseDefaultDevice()*/)
|
||||
/*static*/ NDArrayViewPtr NDArrayView::RandomUniform(const NDShape& shape, double rangeBegin, double rangeEnd, unsigned long seed, const DeviceDescriptor& device)
|
||||
{
|
||||
return NDArrayView::_RandomUniform<ElementType, ElementType>(shape, rangeBegin, rangeEnd, seed, device);
|
||||
}
|
||||
|
||||
template <>
|
||||
/*static*/ NDArrayViewPtr NDArrayView::RandomUniform<float16>(const NDShape& shape, double rangeBegin, double rangeEnd, unsigned long seed, const DeviceDescriptor& device)
|
||||
{
|
||||
return NDArrayView::_RandomUniform<float16, half>(shape, rangeBegin, rangeEnd, seed, device);
|
||||
}
|
||||
|
||||
template <typename ElementType, typename V1ElemType>
|
||||
/*static*/ NDArrayViewPtr NDArrayView::_RandomUniform(const NDShape& shape, double rangeBegin, double rangeEnd, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::UseDefaultDevice()*/)
|
||||
{
|
||||
auto matrixDims = GetMatrixDimensions(shape);
|
||||
auto randomUniformMatrix = std::make_shared<Matrix<ElementType>>(Matrix<ElementType>::RandomUniform(matrixDims.first, matrixDims.second, AsCNTKImplDeviceId(device), (ElementType)rangeBegin, (ElementType)rangeEnd, seed));
|
||||
auto tensorView = new TensorView<ElementType>(randomUniformMatrix, AsTensorViewShape(shape));
|
||||
auto randomUniformMatrix = std::make_shared<Matrix<V1ElemType>>(Matrix<V1ElemType>::RandomUniform(matrixDims.first, matrixDims.second, AsCNTKImplDeviceId(device), (V1ElemType)rangeBegin, (V1ElemType)rangeEnd, seed));
|
||||
auto tensorView = new TensorView<V1ElemType>(randomUniformMatrix, AsTensorViewShape(shape));
|
||||
|
||||
return MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), device, StorageFormat::Dense, shape, false, tensorView);
|
||||
}
|
||||
|
||||
template <typename ElementType>
|
||||
ElementType NDArrayView::AsScalar() const
|
||||
{
|
||||
return _AsScalar<ElementType, ElementType>();
|
||||
}
|
||||
|
||||
template <>
|
||||
float16 NDArrayView::AsScalar<float16>() const
|
||||
{
|
||||
return _AsScalar<float16, half>();
|
||||
}
|
||||
|
||||
template <typename ElementType, typename V1ElemType>
|
||||
ElementType NDArrayView::_AsScalar() const
|
||||
{
|
||||
auto scalarData = this->shared_from_this();
|
||||
if (scalarData->Shape().TotalSize() != 1)
|
||||
LogicError("NDArrayView::AsScalar: The NDArrayView shaped '%S' is not a scalar.", scalarData->Shape().AsString().c_str());
|
||||
|
||||
ElementType scalar = std::numeric_limits<ElementType>::quiet_NaN();
|
||||
ElementType scalar = quiet_NaN<ElementType>();
|
||||
std::shared_ptr<const NDArrayView> cpuData;
|
||||
if (scalarData->Device() == DeviceDescriptor::CPUDevice())
|
||||
cpuData = scalarData;
|
||||
|
@ -635,6 +833,8 @@ namespace CNTK
|
|||
scalar = *(cpuData->DataBuffer<float>());
|
||||
else if (scalarData->GetDataType() == DataType::Double)
|
||||
scalar = static_cast<ElementType>(*(cpuData->DataBuffer<double>()));
|
||||
else if (scalarData->GetDataType() == DataType::Float16)
|
||||
scalar = static_cast<ElementType>(*(cpuData->DataBuffer<float16>()));
|
||||
else
|
||||
LogicError("NDArrayView::AsScalar: Unsupported DataType");
|
||||
|
||||
|
@ -652,35 +852,44 @@ namespace CNTK
|
|||
// Explicit template instantiations
|
||||
template CNTK_API NDArrayViewPtr NDArrayView::RandomUniform<float>(const NDShape& shape, double rangeBegin, double rangeEnd, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::UseDefaultDevice()*/);
|
||||
template CNTK_API NDArrayViewPtr NDArrayView::RandomUniform<double>(const NDShape& shape, double rangeBegin, double rangeEnd, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::UseDefaultDevice()*/);
|
||||
template CNTK_API NDArrayViewPtr NDArrayView::RandomUniform<float16>(const NDShape& shape, double rangeBegin, double rangeEnd, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::UseDefaultDevice()*/);
|
||||
|
||||
template CNTK_API NDArrayViewPtr NDArrayView::RandomNormal<float>(const NDShape& shape, double mean, double stdDev, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::UseDefaultDevice()*/);
|
||||
template CNTK_API NDArrayViewPtr NDArrayView::RandomNormal<double>(const NDShape& shape, double mean, double stdDev, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::UseDefaultDevice()*/);
|
||||
template CNTK_API NDArrayViewPtr NDArrayView::RandomNormal<float16>(const NDShape& shape, double mean, double stdDev, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::UseDefaultDevice()*/);
|
||||
|
||||
template CNTK_API const float* NDArrayView::DataBuffer<float>() const;
|
||||
template CNTK_API const double* NDArrayView::DataBuffer<double>() const;
|
||||
template CNTK_API const float16* NDArrayView::DataBuffer<float16>() const;
|
||||
|
||||
template CNTK_API const TensorView<float>* NDArrayView::GetTensorView<float>() const;
|
||||
template CNTK_API const TensorView<double>* NDArrayView::GetTensorView<double>() const;
|
||||
template CNTK_API const TensorView<half>* NDArrayView::GetTensorView<half>() const;
|
||||
|
||||
template CNTK_API std::tuple<const float*, const SparseIndexType*, const SparseIndexType*, size_t> NDArrayView::SparseCSCDataBuffers<float>() const;
|
||||
template CNTK_API std::tuple<const double*, const SparseIndexType*, const SparseIndexType*, size_t> NDArrayView::SparseCSCDataBuffers<double>() const;
|
||||
template CNTK_API std::tuple<const float16*, const SparseIndexType*, const SparseIndexType*, size_t> NDArrayView::SparseCSCDataBuffers<float16>() const;
|
||||
|
||||
template CNTK_API std::tuple<const void*, const SparseIndexType*, const SparseIndexType*, size_t, size_t, size_t> NDArrayView::SparseBlockColumnDataBuffers<float>() const;
|
||||
template CNTK_API std::tuple<const void*, const SparseIndexType*, const SparseIndexType*, size_t, size_t, size_t> NDArrayView::SparseBlockColumnDataBuffers<double>() const;
|
||||
template CNTK_API std::tuple<const void*, const SparseIndexType*, const SparseIndexType*, size_t, size_t, size_t> NDArrayView::SparseBlockColumnDataBuffers<float16>() const;
|
||||
|
||||
template CNTK_API float* NDArrayView::WritableDataBuffer<float>();
|
||||
template CNTK_API double* NDArrayView::WritableDataBuffer<double>();
|
||||
template CNTK_API float16* NDArrayView::WritableDataBuffer<float16>();
|
||||
|
||||
template std::shared_ptr<const Matrix<float>> NDArrayView::GetMatrix(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/) const;
|
||||
template std::shared_ptr<const Matrix<double>> NDArrayView::GetMatrix(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/) const;
|
||||
template std::shared_ptr<const Matrix<half>> NDArrayView::GetMatrix(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/) const;
|
||||
|
||||
template std::shared_ptr<Matrix<float>> NDArrayView::GetWritableMatrix<float>(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/);
|
||||
template std::shared_ptr<Matrix<double>> NDArrayView::GetWritableMatrix<double>(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/);
|
||||
template std::shared_ptr<Matrix<half>> NDArrayView::GetWritableMatrix<half>(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/);
|
||||
template TensorView<float>* NDArrayView::GetWritableTensorView<float>();
|
||||
template TensorView<double>* NDArrayView::GetWritableTensorView<double>();
|
||||
template TensorView<half>* NDArrayView::GetWritableTensorView<half>();
|
||||
|
||||
template CNTK_API NDArrayView::NDArrayView(const NDShape& viewShape, const SparseIndexType* colStarts, const SparseIndexType* rowIndices, const float* nonZeroValues, size_t numNonZeroValues, const DeviceDescriptor& device, bool readOnly/* = false*/);
|
||||
template CNTK_API NDArrayView::NDArrayView(const NDShape& viewShape, const SparseIndexType* colStarts, const SparseIndexType* rowIndices, const double* nonZeroValues, size_t numNonZeroValues, const DeviceDescriptor& device, bool readOnly/* = false*/);
|
||||
template float NDArrayView::AsScalar<float>() const;
|
||||
template double NDArrayView::AsScalar<double>() const;
|
||||
template float16 NDArrayView::AsScalar<float16>() const;
|
||||
}
@@ -53,8 +53,14 @@ namespace CNTK
            }
            else
            {
                // batch normalization on FP16 requires 32-bit scale/bias/mean/variance, so specialize that case
                bool batchNormSpecialCase =
                    (op == PrimitiveOpType::BatchNormalization) &&
                    (outputDataType == DataType::Float16) &&
                    (inputDataType == DataType::Float);

                // The DataType of all operands should match except for Constants where we allow coercion
                if ((inputDataType != DataType::Unknown) && (inputDataType != outputDataType) && !input.IsConstant())
                if ((inputDataType != DataType::Unknown) && (inputDataType != outputDataType) && !input.IsConstant() && !batchNormSpecialCase)
                    InvalidArgument("Primitive op '%S' passed operands '%S' with different DataTypes '%s' and '%s'.",
                                    PrimitiveOpTypeName(op).c_str(), NamedListString(inputs).c_str(), DataTypeName(outputDataType), DataTypeName(inputDataType));
            }

@@ -70,9 +76,20 @@ namespace CNTK
        for (auto& input : inputs)
        {
            if ((input.GetDataType() == DataType::Unknown) && (input.IsConstant() || input.IsParameter()))
            {
                // batch normalization on FP16 requires 32-bit scale/bias/mean/variance, so specialize that case
                if ((op == PrimitiveOpType::BatchNormalization) &&
                    (outputDataType == DataType::Float16))
                {
                    input.m_dataFields->m_dataType = DataType::Float;
                }
                else
                {
                    input.m_dataFields->m_dataType = outputDataType;
                }
            }
        }
    }

    return outputDataType;
}
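The rule encoded above is easy to misread in diff form: every non-constant operand must match the output DataType, and the single sanctioned mismatch is FP32 scale/bias/mean/variance feeding an FP16 BatchNormalization. A minimal standalone sketch of that decision; the enum values and the helper name `AllowsMixedPrecisionInput` are illustrative, not CNTK API.

#include <cstdio>

// Illustrative stand-ins for the enums used in the change above.
enum class DataType { Unknown, Float, Double, Float16 };
enum class PrimitiveOpType { Plus, Times, BatchNormalization };

// Returns true when an input of 'inputType' may feed an op whose output is
// 'outputType'. The only cross-precision pair allowed is FP32 statistics
// (scale/bias/mean/variance) feeding an FP16 BatchNormalization.
bool AllowsMixedPrecisionInput(PrimitiveOpType op, DataType outputType, DataType inputType, bool inputIsConstant)
{
    if (inputType == DataType::Unknown || inputType == outputType || inputIsConstant)
        return true;
    return op == PrimitiveOpType::BatchNormalization &&
           outputType == DataType::Float16 &&
           inputType == DataType::Float;
}

int main()
{
    // FP32 running mean into an FP16 batch-norm node: accepted.
    std::printf("%d\n", AllowsMixedPrecisionInput(PrimitiveOpType::BatchNormalization,
                                                  DataType::Float16, DataType::Float, false));
    // FP32 input into an FP16 Times node: rejected (would raise InvalidArgument above).
    std::printf("%d\n", AllowsMixedPrecisionInput(PrimitiveOpType::Times,
                                                  DataType::Float16, DataType::Float, false));
}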
@ -235,6 +252,10 @@ namespace CNTK
|
|||
else
|
||||
{
|
||||
DataType outputDataType = GetOutputDataType(m_op, m_inputs, true);
|
||||
|
||||
if (m_op == PrimitiveOpType::Cast)
|
||||
outputDataType = static_cast<DataType>(m_attributes[PrimitiveFunction::AttributeNameNewDataType].Value<int>());
|
||||
|
||||
std::vector<Axis> outputDynamicAxes = GetOutputDynamicAxes(m_op, m_inputs, this, m_attributes);
|
||||
bool needsGradient = std::any_of(m_inputs.begin(), m_inputs.end(), [](const Variable& input) { return input.NeedsGradient(); });
|
||||
|
||||
|
@ -333,6 +354,7 @@ namespace CNTK
|
|||
case PrimitiveOpType::ELU:
|
||||
case PrimitiveOpType::StableSigmoid:
|
||||
case PrimitiveOpType::ConstantOp:
|
||||
case PrimitiveOpType::Cast:
|
||||
assert(m_inputs.size() == 1);
|
||||
outputShape = UnaryElementwiseOpOutputShape(m_inputs[0].Shape());
|
||||
break;
|
||||
|
|
|
@ -116,6 +116,7 @@ namespace CNTK
|
|||
{PrimitiveOpType::TopK, L"TopK"},
|
||||
{PrimitiveOpType::ConstantOp, L"ConstantOp"},
|
||||
{PrimitiveOpType::Squeeze, L"Squeeze"},
|
||||
{PrimitiveOpType::Cast, L"Cast" },
|
||||
};
|
||||
|
||||
inline const std::wstring& PrimitiveOpTypeName(PrimitiveOpType opType)
|
||||
|
|
|
@ -100,6 +100,7 @@ namespace CNTK
|
|||
Squeeze = 88,
|
||||
ConstantOp = 89,
|
||||
LatticeSequenceWithSoftmax = 90,
|
||||
Cast = 91,
|
||||
// New op types should only be appended to the end of this list
|
||||
UnknownOP
|
||||
// and UnknownOP should always be last.
|
||||
|
|
|
@ -231,13 +231,17 @@ namespace CNTK
|
|||
return DictionaryValue::Type(type);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static void CopyData(const NDArrayView& src, RepeatedField<T>* dst)
|
||||
template <typename SrcT, typename DstT=SrcT>
|
||||
static void CopyData(const NDArrayView& src, RepeatedField<DstT>* dst)
|
||||
{
|
||||
auto size = src.Shape().TotalSize();
|
||||
dst->Resize((int)size, T());
|
||||
const T* buffer = src.DataBuffer<T>();
|
||||
memcpy(dst->mutable_data(), buffer, (int)size * sizeof(T));
|
||||
dst->Resize((int)size, DstT());
|
||||
const SrcT* buffer = src.DataBuffer<SrcT>();
|
||||
if (std::is_same<SrcT, DstT>::value)
|
||||
memcpy(dst->mutable_data(), buffer, (int)size * sizeof(DstT));
|
||||
else
|
||||
for (size_t i = 0; i < size; i++)
|
||||
dst->mutable_data()[i] = (DstT)buffer[i];
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
|
@ -250,32 +254,44 @@ namespace CNTK
|
|||
{
|
||||
auto value = buffer[i];
|
||||
if (tSize <= sizeof(uint32))
|
||||
output.WriteLittleEndian32(Encode<T, uint32>(value));
|
||||
{
|
||||
output.WriteLittleEndian32(Encode<T, uint32>((float)value));
|
||||
}
|
||||
else
|
||||
{
|
||||
output.WriteLittleEndian64(Encode<T, uint64>(value));
|
||||
}
|
||||
}
|
||||
}

template <typename T>
template <typename SrcT, typename DstT = SrcT>
static bool ReadData(RenewableCodedStream& input, NDArrayView& dst)
{
    auto size = dst.Shape().TotalSize();
    T* buffer = dst.WritableDataBuffer<T>();
    DstT* buffer = dst.WritableDataBuffer<DstT>();
    for (auto i = 0; i < size; i++)
    {
        if (!input.Read<T>(buffer+i))
        SrcT value;
        if (!input.Read<SrcT>(&value))
            return false;
        buffer[i] = (DstT)value;
    }
    return true;
}

template <typename T>
static void CopyData(const RepeatedField<T>& src, NDArrayView* dst)
template <typename SrcT, typename DstT = SrcT>
static void CopyData(const RepeatedField<SrcT>& src, NDArrayView* dst)
{
    auto size = src.size();
    assert(size == dst->Shape().TotalSize());;
    T* buffer = dst->WritableDataBuffer<T>();
    memcpy(buffer, src.data(), size * sizeof(T));
    DstT* buffer = dst->WritableDataBuffer<DstT>();
    if (std::is_same<SrcT, DstT>::value)
        memcpy(buffer, src.data(), size * sizeof(SrcT));
    else
    {
        for (size_t i = 0; i < size; i++)
            buffer[i] = (DstT)src.data()[i];
    }
}
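These serializer changes all follow one pattern: the protobuf schema has no float16 field, so FP16 tensors are written through the 32-bit float buffers and converted element by element, while same-type copies keep the memcpy fast path. A small self-contained sketch of that SrcT -> DstT copy, with std::vector standing in for RepeatedField and a float -> double widening in place of float16 (which is not a standard C++ type):

#include <cstdio>
#include <cstring>
#include <type_traits>
#include <vector>

// When the two element types match, the copy degenerates to memcpy; otherwise
// each element is converted, which is how a float16 view can be stored in (and
// restored from) a 32-bit float buffer.
template <typename SrcT, typename DstT = SrcT>
void CopyBuffer(const std::vector<SrcT>& src, std::vector<DstT>& dst)
{
    dst.resize(src.size());
    if (std::is_same<SrcT, DstT>::value)
        std::memcpy(dst.data(), src.data(), src.size() * sizeof(DstT));
    else
        for (size_t i = 0; i < src.size(); i++)
            dst[i] = (DstT)src[i];
}

int main()
{
    std::vector<float> stored = { 0.5f, 1.25f, -2.0f }; // values as serialized in the float field
    std::vector<double> restored;                       // widening conversion on load
    CopyBuffer(stored, restored);
    std::printf("%f %f %f\n", restored[0], restored[1], restored[2]);
}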
@ -312,6 +328,10 @@ namespace CNTK
|
|||
{
|
||||
CopyData<double>(src, dst->mutable_double_values()->mutable_value());
|
||||
}
|
||||
else if (src.GetDataType() == DataType::Float16)
|
||||
{
|
||||
CopyData<float16, float>(src, dst->mutable_float_values()->mutable_value());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -328,6 +348,10 @@ namespace CNTK
|
|||
{
|
||||
WriteData<double>(src, output);
|
||||
}
|
||||
else if (src.GetDataType() == DataType::Float16)
|
||||
{
|
||||
WriteData<float16>(src, output);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -350,6 +374,11 @@ namespace CNTK
|
|||
if (!ReadData<double>(wrapper, dst))
|
||||
return false;
|
||||
}
|
||||
else if (dst.GetDataType() == DataType::Float16)
|
||||
{
|
||||
if (!ReadData<float, float16>(wrapper, dst))
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
@ -446,6 +475,13 @@ namespace CNTK
|
|||
else
|
||||
m_arrayViews.push_back({ dst, nullptr });
|
||||
}
|
||||
else if(dataType == DataType::Float16)
|
||||
{
|
||||
if (src.float_values().value().size() == shape->TotalSize())
|
||||
CopyData<float, float16>(src.float_values().value(), dst);
|
||||
else
|
||||
m_arrayViews.push_back({ dst, nullptr });
|
||||
}
|
||||
return dst;
|
||||
}
|
||||
|
||||
|
|
|
@ -37,6 +37,7 @@ namespace CNTK
|
|||
const std::wstring learningRateScheduleKey = L"learnig_rate_schedule";
|
||||
const std::wstring smoothedGradientsKey = L"smoothed_gradients";
|
||||
const std::wstring noiseInjectionSeedKey = L"noise_injection_seed";
|
||||
const std::wstring masterParameterUpdatedKey = L"master_parameter_updated";
|
||||
const std::wstring smoothedCountKey = L"smoothed_count";
|
||||
const std::wstring stateKey = L"state";
|
||||
const std::wstring rngSeedKey = L"rng_seed";
|
||||
|
|
|
@ -51,6 +51,10 @@ namespace CNTK
|
|||
combinedFunctionArgs = m_model->Outputs();
|
||||
|
||||
combinedFunctionArgs.push_back(m_lossFunction);
|
||||
|
||||
if (m_lossFunction->Output().GetDataType() == DataType::Float16)
|
||||
fprintf(stderr, "WARNING: using Float16 for loss function may cause overflow, please cast to float");
|
||||
|
||||
if (!m_lossFunction->Output().DynamicAxes().empty())
|
||||
{
|
||||
m_aggregatedLossFunction = ReduceSum(lossFunction, Axis::AllAxes(), L"aggregateLoss");
|
||||
|
@ -226,6 +230,9 @@ namespace CNTK
|
|||
// Gradients are not existing.
|
||||
for (const auto& parameter : m_learnerParameters)
|
||||
gradients[parameter] = nullptr;
|
||||
|
||||
trainingLoss = MakeSharedObject<NDArrayView>(0, (m_aggregatedLossFunction ? m_aggregatedLossFunction->Output().GetDataType() : DataType::Float), NDShape{}, computeDevice);
|
||||
evalCriterion = MakeSharedObject<NDArrayView>(0, (m_aggregatedEvaluationFunction ? m_aggregatedEvaluationFunction->Output().GetDataType() : DataType::Float), NDShape{}, computeDevice);
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -363,8 +370,10 @@ namespace CNTK
|
|||
|
||||
if (m_aggregatedLossFunction->Output().GetDataType() == DataType::Float)
|
||||
m_rootGradientValue->Data()->SetValue(1.0f);
|
||||
else
|
||||
else if (m_aggregatedLossFunction->Output().GetDataType() == DataType::Double)
|
||||
m_rootGradientValue->Data()->SetValue(1.0);
|
||||
else
|
||||
m_rootGradientValue->Data()->SetValue(half(1.0));
|
||||
|
||||
for (const auto& parameter : m_learnerParameters)
|
||||
parameterGradients[parameter] = nullptr;
|
||||
|
@ -541,7 +550,7 @@ namespace CNTK
|
|||
return m_parameterLearners->ParameterLearners().front()->TotalNumberOfSamplesSeen();
|
||||
default:
|
||||
//should not be here; whenever a new data unit is defined, there should be a new case in this function.
|
||||
LogicError("Unsupported data unit: %d", unit);
|
||||
LogicError("Unsupported data unit: %d", (int)unit);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -754,7 +754,7 @@ namespace CNTK
|
|||
if (mask != nullptr)
|
||||
Value::GetSequenceStartsAndLengths(mask, sequenceBeginIndices, sequenceLengths, numDynamicAxes);
|
||||
|
||||
bool hasTruncatedSequences = std::find_if(sequenceBeginIndices.begin(), sequenceBeginIndices.end(), [](const int& val) { return (val < 0); }) != sequenceBeginIndices.end();
|
||||
bool hasTruncatedSequences = std::find_if(sequenceBeginIndices.begin(), sequenceBeginIndices.end(), [](const ptrdiff_t& val) { return (val < 0); }) != sequenceBeginIndices.end();
|
||||
|
||||
auto layout = std::make_shared<MBLayout>();
|
||||
std::vector<std::pair<size_t, size_t>> placement;
|
||||
|
@ -1178,12 +1178,15 @@ namespace CNTK
|
|||
|
||||
template std::pair<std::shared_ptr<const Matrix<float>>, MBLayoutPtr> Utils::GetCNTKImplMatrixAndMBLayoutFromValueObject<float>(const Variable& var, const ValuePtr& value, NDShape* inferredVarShape);
|
||||
template std::pair<std::shared_ptr<const Matrix<double>>, MBLayoutPtr> Utils::GetCNTKImplMatrixAndMBLayoutFromValueObject<double>(const Variable& var, const ValuePtr& value, NDShape* inferredVarShape);
|
||||
template std::pair<std::shared_ptr<const Matrix<half>>, MBLayoutPtr> Utils::GetCNTKImplMatrixAndMBLayoutFromValueObject<half>(const Variable& var, const ValuePtr& value, NDShape* inferredVarShape);
|
||||
|
||||
template ValuePtr Utils::GetValueObjectFromCNTKImplMatrixAndMBLayout<float>(const NDShape& sampleShape, const std::vector<Axis>& sampleDynamicAxes, const Matrix<float>& matrix, const MBLayoutPtr& layout, bool readOnly /*= true*/);
|
||||
template ValuePtr Utils::GetValueObjectFromCNTKImplMatrixAndMBLayout<double>(const NDShape& sampleShape, const std::vector<Axis>& sampleDynamicAxes, const Matrix<double>& matrix, const MBLayoutPtr& layout, bool readOnly /*= true*/);
|
||||
template ValuePtr Utils::GetValueObjectFromCNTKImplMatrixAndMBLayout<half>(const NDShape& sampleShape, const std::vector<Axis>& sampleDynamicAxes, const Matrix<half>& matrix, const MBLayoutPtr& layout, bool readOnly /*= true*/);
|
||||
|
||||
template ValuePtr Utils::GetValueObjectFromCNTKImplMatrixAndMBLayout<float>(const Variable& var, const ComputationNodeBasePtr& computationNode, const Matrix<float>& matrix, const MBLayoutPtr& layout, bool readOnly /*= true*/);
|
||||
template ValuePtr Utils::GetValueObjectFromCNTKImplMatrixAndMBLayout<double>(const Variable& var, const ComputationNodeBasePtr& computationNode, const Matrix<double>& matrix, const MBLayoutPtr& layout, bool readOnly /*= true*/);
|
||||
template ValuePtr Utils::GetValueObjectFromCNTKImplMatrixAndMBLayout<half>(const Variable& var, const ComputationNodeBasePtr& computationNode, const Matrix<half>& matrix, const MBLayoutPtr& layout, bool readOnly /*= true*/);
|
||||
|
||||
void Accumulator::Update(const ValuePtr& delta, const DeviceDescriptor& device)
|
||||
{
|
||||
|
@ -1206,8 +1209,10 @@ namespace CNTK
|
|||
|
||||
if (delta->GetDataType() == DataType::Float)
|
||||
Data()->GetWritableTensorView<float>()->AddCopyOf(*delta->Data()->GetTensorView<float>());
|
||||
else
|
||||
else if(delta->GetDataType() == DataType::Double)
|
||||
Data()->GetWritableTensorView<double>()->AddCopyOf(*delta->Data()->GetTensorView<double>());
|
||||
else
|
||||
RuntimeError("Unexpected data type in accumulator");
|
||||
|
||||
if (copied && m_numUpdates != 0)
|
||||
RuntimeError("Accumulation values are created when accumulated num updates not zero");
|
||||
|
@ -1228,8 +1233,10 @@ namespace CNTK
|
|||
|
||||
if (GetDataType() == DataType::Float)
|
||||
Data()->SetValue(0.0f);
|
||||
else
|
||||
else if (GetDataType() == DataType::Double)
|
||||
Data()->SetValue(0.0);
|
||||
else
|
||||
RuntimeError("Unsupported data type in Accumulator");
|
||||
}
|
||||
|
||||
std::wstring DynamicAxesAsString(const std::vector<Axis>& axes, bool rowMajor)
|
||||
|
|
|
@@ -742,9 +742,18 @@ namespace CNTK
    template <typename T> //T can be Variable or StreamInfo
    static bool IsAtSweepEnd(const std::unordered_map<T, MinibatchData>& arguments)
    {
        if (arguments.empty()) return true;

        return std::any_of(arguments.begin(), arguments.end(), [](const std::pair<const T, MinibatchData>& kv)
        {
            return kv.second.sweepEnd;
        });
    }

    // half is V1 ElemType, so specialize here instead of in CNTKLibrary.h
    template<>
    inline DataType AsDataType<half>()
    {
        return DataType::Float16;
    }
}
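The specialization above completes the compile-time mapping from V1 element types to the V2 DataType enum. A sketch of that trait-style mapping, using illustrative stand-ins for the enum and the half type; unsupported element types simply fail to link.

#include <cstdint>
#include <cstdio>

// Illustrative stand-ins; the real DataType enum and half type live in the CNTK headers.
enum class DataType { Unknown, Float, Double, Float16 };
struct half { std::uint16_t bits; };

// Primary template is declared but never defined, so an unsupported element
// type fails at link time; each supported type maps to one enum value,
// mirroring the half -> Float16 specialization added above.
template <typename T> DataType AsDataType();
template <> DataType AsDataType<float>()  { return DataType::Float; }
template <> DataType AsDataType<double>() { return DataType::Double; }
template <> DataType AsDataType<half>()   { return DataType::Float16; }

int main()
{
    std::printf("%d %d %d\n", (int)AsDataType<float>(), (int)AsDataType<double>(), (int)AsDataType<half>());
}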
@ -108,7 +108,7 @@ namespace CNTK
|
|||
}
|
||||
|
||||
colStarts[numCSCCols - 1] = (SparseIndexType)(nonZeroValues.size());
|
||||
NDArrayViewPtr deviceValueData = MakeSharedObject<NDArrayView>(valueDataShape, colStarts.data(), rowIndices.data(), nonZeroValues.data(), nonZeroValues.size(), device, readOnly);
|
||||
NDArrayViewPtr deviceValueData = MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), valueDataShape, colStarts.data(), rowIndices.data(), nonZeroValues.data(), nonZeroValues.size(), device, readOnly);
|
||||
return MakeSharedObject<Value>(deviceValueData, deviceValueMask);
|
||||
}
|
||||
|
||||
|
@@ -229,18 +229,7 @@ namespace CNTK
        auto totalNumNonZeroValues = nonZeroValues.size() / DataTypeSize(dataType);
        colStarts.push_back(totalNumNonZeroValues);

        switch (dataType)
        {
        case DataType::Float:
            // TODO: In case of sparse we can directly create on target device
            valueData = MakeSharedObject<NDArrayView>(valueDataShape, colStarts.data(), rowIndices.data(), (float*)nonZeroValues.data(), totalNumNonZeroValues, device, readOnly);
            break;
        case DataType::Double:
            valueData = MakeSharedObject<NDArrayView>(valueDataShape, colStarts.data(), rowIndices.data(), (double*)nonZeroValues.data(), totalNumNonZeroValues, device, readOnly);
            break;
        default:
            NOT_IMPLEMENTED;
        }
        valueData = MakeSharedObject<NDArrayView>(dataType, valueDataShape, colStarts.data(), rowIndices.data(), (void*)nonZeroValues.data(), totalNumNonZeroValues, device, readOnly);
    }
    else
    {

@@ -372,7 +361,7 @@ namespace CNTK
/*static*/ ValuePtr Value::CreateSequence(const NDShape& sampleShape, size_t sequenceLength, const SparseIndexType* colStarts, const SparseIndexType* rowIndices, const ElementType* nonZeroValues, size_t numNonZeroValues, bool sequenceStartFlag, const DeviceDescriptor& device, bool readOnly/* = false*/)
{
    auto sequenceShape = sampleShape.AppendShape({sequenceLength});
    auto sequenceData = MakeSharedObject<NDArrayView>(sequenceShape, colStarts, rowIndices, nonZeroValues, numNonZeroValues, device, readOnly);
    auto sequenceData = MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), sequenceShape, colStarts, rowIndices, nonZeroValues, numNonZeroValues, device, readOnly);
    return Create(sampleShape, {sequenceData}, {sequenceStartFlag}, device, readOnly, false);
}
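CreateSequence takes its sparse input in CSC (compressed sparse column) form: nonZeroValues holds the non-zeros column by column, rowIndices gives each value's row, and colStarts has one offset per column plus a final entry equal to the total non-zero count. A small worked example of that layout (SparseIndexType is shown here as plain int), which expands the buffers back to a dense 3x4 matrix to verify them:

#include <cstdio>
#include <vector>

// Column-major 3x4 matrix being encoded:
//     col0  col1  col2  col3
//      1     0     0     4
//      0     2     0     0
//      0     0     3     5
int main()
{
    const int numRows = 3, numCols = 4;
    std::vector<float> nonZeroValues = { 1, 2, 3, 4, 5 };
    std::vector<int>   rowIndices    = { 0, 1, 2, 0, 2 };
    std::vector<int>   colStarts     = { 0, 1, 2, 3, 5 }; // numCols + 1 entries

    // Expand back to dense to verify the encoding.
    std::vector<float> dense(numRows * numCols, 0.0f);
    for (int c = 0; c < numCols; c++)
        for (int k = colStarts[c]; k < colStarts[c + 1]; k++)
            dense[c * numRows + rowIndices[k]] = nonZeroValues[k];

    for (int r = 0; r < numRows; r++)
    {
        for (int c = 0; c < numCols; c++)
            std::printf("%g ", dense[c * numRows + r]);
        std::printf("\n");
    }
}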
@ -683,6 +672,9 @@ namespace CNTK
|
|||
case DataType::Double:
|
||||
valueObject = Utils::GetValueObjectFromCNTKImplMatrixAndMBLayout(m_sampleShape, m_sampleDynamicAxes, *(m_packedData->GetMatrix<double>()), m_packedDataLayout, m_isReadOnly);
|
||||
break;
|
||||
case DataType::Float16:
|
||||
valueObject = Utils::GetValueObjectFromCNTKImplMatrixAndMBLayout(m_sampleShape, m_sampleDynamicAxes, *(m_packedData->GetMatrix<half>()), m_packedDataLayout, m_isReadOnly);
|
||||
break;
|
||||
default:
|
||||
LogicError("Unsupported DataType %s", DataTypeName(dataType));
|
||||
}
|
||||
|
@ -728,7 +720,7 @@ namespace CNTK
|
|||
bool found = false;
|
||||
for (size_t i = 0; i < sampleSize; i++)
|
||||
{
|
||||
if (*currentp == 1)
|
||||
if (*currentp == (ElementType)1)
|
||||
{
|
||||
if (found)
|
||||
RuntimeError("CopyDenseToOneHot: Cannot convert to onehot vector; more than one non-zero value in the sample.");
|
||||
|
@ -736,7 +728,7 @@ namespace CNTK
|
|||
index = i;
|
||||
found = true;
|
||||
}
|
||||
else if (*currentp != 0)
|
||||
else if (*currentp != (ElementType)0)
|
||||
RuntimeError("CopyDenseToOneHot: Cannot convert to onehot vector; contains value other than 0/1.");
|
||||
|
||||
currentp++;
|
||||
|
@ -753,26 +745,38 @@ namespace CNTK
|
|||
// Explicit template instantiations
|
||||
template /*static*/ CNTK_API ValuePtr Value::Create<float>(const NDShape& sampleShape, const std::vector<std::vector<float>>& sequences, const std::vector<bool>& sequenceStartFlags, const DeviceDescriptor& device, bool readOnly/* = false*/);
|
||||
template /*static*/ CNTK_API ValuePtr Value::Create<double>(const NDShape& sampleShape, const std::vector<std::vector<double>>& sequences, const std::vector<bool>& sequenceStartFlags, const DeviceDescriptor& device, bool readOnly/* = false*/);
|
||||
template /*static*/ CNTK_API ValuePtr Value::Create<float16>(const NDShape& sampleShape, const std::vector<std::vector<float16>>& sequences, const std::vector<bool>& sequenceStartFlags, const DeviceDescriptor& device, bool readOnly/* = false*/);
|
||||
template /*static*/ CNTK_API ValuePtr Value::Create<float>(const NDShape& sampleShape, const std::vector<std::vector<size_t>>& oneHotSequences, const std::vector<bool>& sequenceStartFlags, const DeviceDescriptor& device, bool readOnly/* = false*/);
|
||||
template /*static*/ CNTK_API ValuePtr Value::Create<double>(const NDShape& sampleShape, const std::vector<std::vector<size_t>>& oneHotSequences, const std::vector<bool>& sequenceStartFlags, const DeviceDescriptor& device, bool readOnly/* = false*/);
|
||||
template /*static*/ CNTK_API ValuePtr Value::Create<float16>(const NDShape& sampleShape, const std::vector<std::vector<size_t>>& oneHotSequences, const std::vector<bool>& sequenceStartFlags, const DeviceDescriptor& device, bool readOnly/* = false*/);
|
||||
template /*static*/ CNTK_API ValuePtr Value::CreateBatch<float>(const NDShape& sampleShape, const std::vector<float>& batchData, const DeviceDescriptor& device, bool readOnly /*= false */);
|
||||
template /*static*/ CNTK_API ValuePtr Value::CreateBatch<double>(const NDShape& sampleShape, const std::vector<double>& batchData, const DeviceDescriptor& device, bool readOnly /*= false */);
|
||||
template /*static*/ CNTK_API ValuePtr Value::CreateBatch<float16>(const NDShape& sampleShape, const std::vector<float16>& batchData, const DeviceDescriptor& device, bool readOnly /*= false */);
|
||||
template /*static*/ CNTK_API ValuePtr Value::CreateSequence<float>(const NDShape& sampleShape, const std::vector<float>& sequenceData, bool sequenceStartFlag, const DeviceDescriptor& device, bool readOnly /*= false */);
|
||||
template /*static*/ CNTK_API ValuePtr Value::CreateSequence<double>(const NDShape& sampleShape, const std::vector<double>& sequenceData, bool sequenceStartFlag, const DeviceDescriptor& device, bool readOnly /*= false */);
|
||||
template /*static*/ CNTK_API ValuePtr Value::CreateSequence<float16> (const NDShape& sampleShape, const std::vector<float16>& sequenceData, bool sequenceStartFlag, const DeviceDescriptor& device, bool readOnly /*= false */);
|
||||
template /*static*/ CNTK_API ValuePtr Value::CreateBatch<float>(size_t dimension, const std::vector<size_t>& batchData, const DeviceDescriptor& device, bool readOnly/* = false*/);
|
||||
template /*static*/ CNTK_API ValuePtr Value::CreateBatch<double>(size_t dimension, const std::vector<size_t>& batchData, const DeviceDescriptor& device, bool readOnly/* = false*/);
|
||||
template /*static*/ CNTK_API ValuePtr Value::CreateBatch<float16> (size_t dimension, const std::vector<size_t>& batchData, const DeviceDescriptor& device, bool readOnly/* = false*/);
|
||||
template /*static*/ CNTK_API ValuePtr Value::CreateSequence<float>(size_t dimension, const std::vector<size_t>& sequenceData, bool sequenceStartFlag, const DeviceDescriptor& device, bool readOnly/* = false*/);
|
||||
template /*static*/ CNTK_API ValuePtr Value::CreateSequence<double>(size_t dimension, const std::vector<size_t>& sequenceData, bool sequenceStartFlag, const DeviceDescriptor& device, bool readOnly/* = false*/);
|
||||
template /*static*/ CNTK_API ValuePtr Value::CreateSequence<float16>(size_t dimension, const std::vector<size_t>& sequenceData, bool sequenceStartFlag, const DeviceDescriptor& device, bool readOnly/* = false*/);
|
||||
template /*static*/ CNTK_API ValuePtr Value::CreateSequence<float>(const NDShape& sampleShape, size_t sequenceLength, const SparseIndexType* colStarts, const SparseIndexType* rowIndices, const float* nonZeroValues, size_t numNonZeroValues, bool sequenceStartFlag, const DeviceDescriptor& device, bool readOnly/* = false*/);
|
||||
template /*static*/ CNTK_API ValuePtr Value::CreateSequence<double>(const NDShape& sampleShape, size_t sequenceLength, const SparseIndexType* colStarts, const SparseIndexType* rowIndices, const double* nonZeroValues, size_t numNonZeroValues, bool sequenceStartFlag, const DeviceDescriptor& device, bool readOnly/* = false*/);
|
||||
template /*static*/ CNTK_API ValuePtr Value::CreateSequence<float16>(const NDShape& sampleShape, size_t sequenceLength, const SparseIndexType* colStarts, const SparseIndexType* rowIndices, const float16* nonZeroValues, size_t numNonZeroValues, bool sequenceStartFlag, const DeviceDescriptor& device, bool readOnly/* = false*/);
|
||||
template CNTK_API void Value::CopyVariableValueToVector<float>(const Variable& outputVariable, std::vector<std::vector<float>>& sequences);
|
||||
template CNTK_API void Value::CopyVariableValueToVector<double>(const Variable& outputVariable, std::vector<std::vector<double>>& sequences);
|
||||
template CNTK_API void Value::CopyVariableValueToVector<float16>(const Variable& outputVariable, std::vector<std::vector<float16>>& sequences);
|
||||
template CNTK_API void Value::CopyVariableValueToVector<float>(const Variable& outputVariable, std::vector<std::vector<size_t>>& sequences);
|
||||
template CNTK_API void Value::CopyVariableValueToVector<double>(const Variable& outputVariable, std::vector<std::vector<size_t>>& sequences);
|
||||
template CNTK_API void Value::CopyVariableValueToVector<float16>(const Variable& outputVariable, std::vector<std::vector<size_t>>& sequences);
|
||||
template CNTK_API std::tuple<size_t, size_t, size_t> Value::ValidateSparseCSCAndGetIndexBufferSizes<float>(const Variable& outputVariable);
|
||||
template CNTK_API std::tuple<size_t, size_t, size_t> Value::ValidateSparseCSCAndGetIndexBufferSizes<double>(const Variable& outputVariable);
|
||||
template CNTK_API std::tuple<size_t, size_t, size_t> Value::ValidateSparseCSCAndGetIndexBufferSizes<float16>(const Variable& outputVariable);
|
||||
template CNTK_API void Value::CopyVariableValueToCSCSparse<float>(size_t sequenceLength, std::vector<SparseIndexType>& colStarts, std::vector<SparseIndexType>& rowIndices, std::vector<float>& nonZeroValues, size_t& numNonZeroValues);
|
||||
template CNTK_API void Value::CopyVariableValueToCSCSparse<double>(size_t sequenceLength, std::vector<SparseIndexType>& colStarts, std::vector<SparseIndexType>& rowIndices, std::vector<double>& nonZeroValues, size_t& numNonZeroValues);
|
||||
template CNTK_API void Value::CopyVariableValueToCSCSparse<float16>(size_t sequenceLength, std::vector<SparseIndexType>& colStarts, std::vector<SparseIndexType>& rowIndices, std::vector<float16>& nonZeroValues, size_t& numNonZeroValues);
|
||||
template float Value::AsScalar<float>() const;
|
||||
template double Value::AsScalar<double>() const;
|
||||
template float16 Value::AsScalar<float16>() const;
|
||||
}
|
||||
|
|
|
@ -135,6 +135,11 @@ namespace CNTK
|
|||
m_dataFields->m_value = CreateValueFromParameterInitializer<double>(Shape(), *m_dataFields->m_valueInitializer, *m_dataFields->m_valueInitializationDevice);
|
||||
break;
|
||||
}
|
||||
case DataType::Float16:
|
||||
{
|
||||
m_dataFields->m_value = CreateValueFromParameterInitializer<half>(Shape(), *m_dataFields->m_valueInitializer, *m_dataFields->m_valueInitializationDevice);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
LogicError("Variable '%S' Value(): Unsupported DataType %s", AsString().c_str(), DataTypeName(GetDataType()));
|
||||
break;
|
||||
|
@ -496,7 +501,8 @@ namespace CNTK
|
|||
DataType dataType = DataType(dict[dataTypeKey].Value<std::size_t>());
|
||||
if (dataType != DataType::Unknown &&
|
||||
dataType != DataType::Float &&
|
||||
dataType != DataType::Double)
|
||||
dataType != DataType::Double &&
|
||||
dataType != DataType::Float16)
|
||||
{
|
||||
LogicError("Unexpected variable datatype '%ls':'%u' (%s).",
|
||||
dataTypeKey.c_str(),
|
||||
|
|
|
@ -23,6 +23,7 @@ message NDArrayView {
|
|||
Unknown = 0;
|
||||
Float = 1;
|
||||
Double = 2;
|
||||
Float16 = 4;
|
||||
}
|
||||
|
||||
enum StorageFormat {
|
||||
|
|
|
@ -1497,7 +1497,7 @@ FunctionPtr ONNXToCNTKHelper::CreateFunction(const Node *node, const std::vector
|
|||
CNTK::PaddingMode cntkPaddingMode;
|
||||
double cntkConstantValue = 0.0;
|
||||
auto mode = GetNamedAttributeAsString(node, "mode", "constant");
|
||||
std::transform(mode.begin(), mode.end(), mode.begin(), ::tolower);
|
||||
std::transform(mode.begin(), mode.end(), mode.begin(), [](char v) { return (char)::tolower(v); });
|
||||
if (mode == "constant")
|
||||
cntkPaddingMode = CNTK::PaddingMode::CONSTANTPAD;
|
||||
else if (mode == "reflect")
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|x64">
|
||||
<Configuration>Debug</Configuration>
|
||||
|
|
|
@ -34,6 +34,7 @@
|
|||
#define WRITE_BUFFER_SIZE (1024 * 1024)
|
||||
|
||||
#include <boost/algorithm/string.hpp>
|
||||
#include "half.hpp"
|
||||
|
||||
namespace Microsoft { namespace MSR { namespace CNTK {
|
||||
|
||||
|
@ -978,9 +979,11 @@ template <class ElemType>
|
|||
|
||||
template vector<float> File::LoadMatrixFromTextFile<float> (const std::wstring& filePath, size_t& /*out*/ numRows, size_t& /*out*/ numCols);
|
||||
template vector<double> File::LoadMatrixFromTextFile<double>(const std::wstring& filePath, size_t& /*out*/ numRows, size_t& /*out*/ numCols);
|
||||
template vector<half> File::LoadMatrixFromTextFile<half>(const std::wstring& filePath, size_t& /*out*/ numRows, size_t& /*out*/ numCols);
|
||||
|
||||
template vector<float> File::LoadMatrixFromStringLiteral<float> (const std::string& literal, size_t& /*out*/ numRows, size_t& /*out*/ numCols);
|
||||
template vector<double> File::LoadMatrixFromStringLiteral<double>(const std::string& literal, size_t& /*out*/ numRows, size_t& /*out*/ numCols);
|
||||
template vector<half> File::LoadMatrixFromStringLiteral<half>(const std::string& literal, size_t& /*out*/ numRows, size_t& /*out*/ numCols);
|
||||
|
||||
#ifndef CNTK_COMPONENT_VERSION
|
||||
#error CNTK_COMPONENT_VERSION must be set
|
||||
|
|
|
@ -296,7 +296,7 @@ struct utf8 : std::string
|
|||
} // empty string
|
||||
std::vector<char> buf(3 * len + 1); // max: 1 wchar => up to 3 mb chars
|
||||
// ... TODO: this fill() should be unnecessary (a 0 is appended)--but verify
|
||||
std::fill(buf.begin(), buf.end(), 0);
|
||||
std::fill(buf.begin(), buf.end(), (char)0);
|
||||
int rc = WideCharToMultiByte(CP_UTF8, 0, p.c_str(), (int) len,
|
||||
&buf[0], (int) buf.size(), NULL, NULL);
|
||||
if (rc == 0)
|
||||
|
@ -335,7 +335,7 @@ static inline std::string wcstombs(const std::wstring& p) // output: MBCS
|
|||
{
|
||||
size_t len = p.length();
|
||||
std::vector<char> buf(2 * len + 1); // max: 1 wchar => 2 mb chars
|
||||
std::fill(buf.begin(), buf.end(), 0);
|
||||
std::fill(buf.begin(), buf.end(), (char)0);
|
||||
::wcstombs(&buf[0], p.c_str(), 2 * len + 1);
|
||||
return std::string(&buf[0]);
|
||||
}
|
||||
|
|
|
@ -158,13 +158,11 @@ public:
|
|||
// put operator for basic types
|
||||
template <typename T>
|
||||
File& operator<<(T val)
|
||||
{
|
||||
{
|
||||
if (IsTextBased())
|
||||
fputText(m_file, val);
|
||||
else
|
||||
fput(m_file, val);
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
File& operator<<(const std::wstring& val);
|
||||
|
|
|
@ -1158,8 +1158,10 @@ public:
|
|||
void release(bool cpumode);
|
||||
void setloglls(const Microsoft::MSR::CNTK::Matrix<float>& loglls);
|
||||
void setloglls(const Microsoft::MSR::CNTK::Matrix<double>& loglls);
|
||||
void setloglls(const Microsoft::MSR::CNTK::Matrix<half>& loglls);
|
||||
void getgamma(Microsoft::MSR::CNTK::Matrix<float>& loglls);
|
||||
void getgamma(Microsoft::MSR::CNTK::Matrix<double>& loglls);
|
||||
void getgamma(Microsoft::MSR::CNTK::Matrix<half>& loglls);
|
||||
};
|
||||
|
||||
// forward-backward function
|
||||
|
|
|
@ -2099,7 +2099,7 @@ static inline std::string wcstombs(const std::wstring& p) // output: MBCS
|
|||
{
|
||||
size_t len = p.length();
|
||||
vector<char> buf(2 * len + 1); // max: 1 wchar => 2 mb chars
|
||||
fill(buf.begin(), buf.end(), 0);
|
||||
fill(buf.begin(), buf.end(), (char)0);
|
||||
::wcstombs(&buf[0], p.c_str(), 2 * len + 1);
|
||||
return std::string(&buf[0]);
|
||||
}
|
||||
|
|
|
@ -47,6 +47,12 @@ vector<MemRequestInfo<double>>& MatrixPool::GetMemRequestInfoVec<double>()
|
|||
return m_memRequestInfoDoubleVec;
|
||||
}
|
||||
|
||||
template <>
|
||||
vector<MemRequestInfo<half>>& MatrixPool::GetMemRequestInfoVec<half>()
|
||||
{
|
||||
return m_memRequestInfoHalfVec;
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// construction
|
||||
// -----------------------------------------------------------------------
|
||||
|
@ -129,6 +135,8 @@ void ComputationNetwork::SaveToFileImpl(const wstring& fileName, const FileOptio
|
|||
precision = ElemTypeName<float>();
|
||||
else if (nodePtr->Is<ComputationNode<double>>())
|
||||
precision = ElemTypeName<double>();
|
||||
else if (nodePtr->Is<ComputationNode<half>>())
|
||||
precision = ElemTypeName<half>();
|
||||
else LogicError("Unexpected node type.");
|
||||
fstream << precision;
|
||||
#endif
|
||||
|
@ -229,7 +237,7 @@ void ComputationNetwork::ReadPersistableParameters(size_t modelVersion, File& fs
|
|||
{
|
||||
wstring precision;
|
||||
if (modelVersion >= CNTK_MODEL_VERSION_7)
|
||||
fstream >> precision; // "float" or "double"; default is "" meaning <ElemType> as passed in from outside
|
||||
fstream >> precision; // "float" or "double" or "half"; default is "" meaning <ElemType> as passed in from outside
|
||||
|
||||
wstring opName, nodeName;
|
||||
fstream >> opName >> nodeName;
|
||||
|
@ -241,6 +249,8 @@ void ComputationNetwork::ReadPersistableParameters(size_t modelVersion, File& fs
|
|||
node = ComputationNetworkBuilder<float>::NewNode(opName, m_deviceId, nodeName);
|
||||
else if (precision == L"double")
|
||||
node = ComputationNetworkBuilder<double>::NewNode(opName, m_deviceId, nodeName);
|
||||
else if (precision == L"half")
|
||||
node = ComputationNetworkBuilder<half>::NewNode(opName, m_deviceId, nodeName);
|
||||
else if (precision == L"") // old file format: default to <ElemType>
|
||||
node = ComputationNetworkBuilder<ElemType>::NewNode(opName, m_deviceId, nodeName);
|
||||
else
|
||||
|
@ -434,9 +444,10 @@ void ComputationNetwork::InitLearnableParameters(const ComputationNodeBasePtr& n
|
|||
{
|
||||
randomSeed += GetRandomSeedOffset();
|
||||
if (TryPostInitParameters<float> (node, initString, initValue, randomSeed, initOnCPUOnly) ||
|
||||
TryPostInitParameters<double>(node, initString, initValue, randomSeed, initOnCPUOnly))
|
||||
TryPostInitParameters<double>(node, initString, initValue, randomSeed, initOnCPUOnly) ||
|
||||
TryPostInitParameters<half> (node, initString, initValue, randomSeed, initOnCPUOnly))
|
||||
return;
|
||||
LogicError("InitLearnableParameters: Input node is not a LearnableParameter<float or double>");
|
||||
LogicError("InitLearnableParameters: Input node is not a LearnableParameter<float or double or half>");
|
||||
}
|
||||
|
||||
// non-static version needed because it accesses m_randomSeedOffset
|
||||
|
@ -666,6 +677,9 @@ void ComputationNetwork::SetSeqParam(ComputationNetworkPtr net,
|
|||
auto noded = dynamic_pointer_cast<ConvolutionNode<double>>(*nodeIter);
|
||||
if (noded)
|
||||
noded->SetmMaxTempMemSizeInSamples(maxTempMemSizeInSamples);
|
||||
auto nodeh = dynamic_pointer_cast<ConvolutionNode<half>>(*nodeIter);
|
||||
if (nodeh)
|
||||
nodeh->SetmMaxTempMemSizeInSamples(maxTempMemSizeInSamples);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1233,7 +1247,7 @@ void ComputationNetwork::SaveToDbnFile(ComputationNetworkPtr net, const std::wst
|
|||
auto GetAllPriorNodes = [](ComputationNodeBasePtr node)->bool
|
||||
{
|
||||
std::wstring lowerName = node->GetName();
|
||||
std::transform(lowerName.begin(), lowerName.end(), lowerName.begin(), ::tolower);
|
||||
std::transform(lowerName.begin(), lowerName.end(), lowerName.begin(), [](wchar_t v) { return (wchar_t)::tolower(v); });
|
||||
|
||||
return node->OperationName() == OperationNameOf(LearnableParameter) && (lowerName.find(L"prior") != wstring::npos);
|
||||
};
|
||||
|
@ -1536,6 +1550,15 @@ template void ComputationNetwork::SetSeqParam<double>(ComputationNetworkPtr net,
|
|||
const double& amf, const double& lmf, const double& wp, const double& bMMIfactor, const bool& sMBR);
|
||||
template void ComputationNetwork::SaveToDbnFile<double>(ComputationNetworkPtr net, const std::wstring& fileName) const;
|
||||
|
||||
template void ComputationNetwork::InitLearnableParametersWithBilinearFill<half>(const ComputationNodeBasePtr& node, size_t kernelWidth, size_t kernelHeight);
|
||||
template void ComputationNetwork::Read<half>(const wstring& fileName);
|
||||
template void ComputationNetwork::ReadPersistableParameters<half>(size_t modelVersion, File& fstream, bool create);
|
||||
template void ComputationNetwork::PerformSVDecomposition<half>(const map<wstring, float>& SVDConfig, size_t alignedsize);
|
||||
template /*static*/ void ComputationNetwork::SetBatchNormalizationTimeConstants<half>(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double normalizationTimeConstant, double& prevNormalizationTimeConstant, double blendTimeConstant, double& prevBlendTimeConstant);
|
||||
template void ComputationNetwork::SetSeqParam<half>(ComputationNetworkPtr net, const ComputationNodeBasePtr criterionNode, const double& hsmoothingWeight, const double& frameDropThresh, const bool& doreferencealign,
|
||||
const double& amf, const double& lmf, const double& wp, const double& bMMIfactor, const bool& sMBR);
|
||||
template void ComputationNetwork::SaveToDbnFile<half>(ComputationNetworkPtr net, const std::wstring& fileName) const;
|
||||
|
||||
// register ComputationNetwork with the ScriptableObject system
|
||||
ScriptableObjects::ConfigurableRuntimeTypeRegister::Add<ComputationNetwork> registerComputationNetwork(L"ComputationNetwork");
|
||||
|
||||
|
|
|
@ -1309,10 +1309,12 @@ typedef ComputationNetwork::ComputationNetworkPtr ComputationNetworkPtr;
|
|||
template <typename ElemType> static inline const wchar_t* ElemTypeName();
|
||||
template <> /*static*/ inline const wchar_t* ElemTypeName<float>() { return L"float"; }
|
||||
template <> /*static*/ inline const wchar_t* ElemTypeName<double>() { return L"double"; }
|
||||
template <> /*static*/ inline const wchar_t* ElemTypeName<half>() { return L"half"; }
|
||||
|
||||
// The following emits the class and enables the BaseMatrix<double> to be available (used by EvalDll)
|
||||
// The corresponding Matrix<float> is emitted in the SetDeviceId function above.
|
||||
template class Matrix<double>;
|
||||
template class Matrix<half>;
|
||||
|
||||
// TODOs:
|
||||
// - automatic inference of time window w.r.t. delay nodes (and related nodes such as a temporal pooling)
|
||||
|
|
|
@ -226,9 +226,10 @@ shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Creat
|
|||
|
||||
// this is used in V2
|
||||
template <class ElemType>
|
||||
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateLearnableParameter(const std::wstring& paramName, const TensorShape& tensorShape)
|
||||
template <class ValueType>
|
||||
shared_ptr<ComputationNode<ValueType>> ComputationNetworkBuilder<ElemType>::TypedCreateLearnableParameter(const std::wstring& paramName, const TensorShape& tensorShape)
|
||||
{
|
||||
return net.AddNodeToNetWithElemType(New<LearnableParameter<ElemType>>(net.GetDeviceId(), paramName, tensorShape));
|
||||
return net.AddNodeToNetWithElemType(New<LearnableParameter<ValueType>>(net.GetDeviceId(), paramName, tensorShape));
|
||||
}
|
||||
|
||||
// TODO: change these to take an actual object instead of a name for dynamicAxis
|
||||
|
@ -964,5 +965,17 @@ shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Batch
|
|||
|
||||
template class ComputationNetworkBuilder<float>;
|
||||
template class ComputationNetworkBuilder<double>;
|
||||
template class ComputationNetworkBuilder<half>;
|
||||
|
||||
// V2 allows mixed precision
|
||||
template shared_ptr<ComputationNode<float>> ComputationNetworkBuilder<float>::TypedCreateLearnableParameter<float>(const std::wstring& paramName, const TensorShape& tensorShape);
|
||||
template shared_ptr<ComputationNode<double>> ComputationNetworkBuilder<float>::TypedCreateLearnableParameter<double>(const std::wstring& paramName, const TensorShape& tensorShape);
|
||||
template shared_ptr<ComputationNode<half>> ComputationNetworkBuilder<float>::TypedCreateLearnableParameter<half>(const std::wstring& paramName, const TensorShape& tensorShape);
|
||||
template shared_ptr<ComputationNode<float>> ComputationNetworkBuilder<double>::TypedCreateLearnableParameter<float>(const std::wstring& paramName, const TensorShape& tensorShape);
|
||||
template shared_ptr<ComputationNode<double>> ComputationNetworkBuilder<double>::TypedCreateLearnableParameter<double>(const std::wstring& paramName, const TensorShape& tensorShape);
|
||||
template shared_ptr<ComputationNode<half>> ComputationNetworkBuilder<double>::TypedCreateLearnableParameter<half>(const std::wstring& paramName, const TensorShape& tensorShape);
|
||||
template shared_ptr<ComputationNode<float>> ComputationNetworkBuilder<half>::TypedCreateLearnableParameter<float>(const std::wstring& paramName, const TensorShape& tensorShape);
|
||||
template shared_ptr<ComputationNode<double>> ComputationNetworkBuilder<half>::TypedCreateLearnableParameter<double>(const std::wstring& paramName, const TensorShape& tensorShape);
|
||||
template shared_ptr<ComputationNode<half>> ComputationNetworkBuilder<half>::TypedCreateLearnableParameter<half>(const std::wstring& paramName, const TensorShape& tensorShape);
|
||||
|
||||
}}}
|
||||
|
|
|
@ -45,7 +45,14 @@ public:
|
|||
// TODO: separate into nodes that have inputs and those that duplicate functions with input adding except just not adding inputs. Clear?
|
||||
|
||||
ComputationNodePtr CreateLearnableParameter(const std::wstring& paramName, const size_t rows, const size_t cols);
|
||||
ComputationNodePtr CreateLearnableParameter(const std::wstring& paramName, const TensorShape& tensorShape); // V2
|
||||
shared_ptr<ComputationNode<ElemType>> CreateLearnableParameter(const std::wstring& paramName, const TensorShape& tensorShape)
|
||||
{
|
||||
return this->template TypedCreateLearnableParameter<ElemType>(paramName, tensorShape);
|
||||
}
|
||||
|
||||
template<class ValueType>
|
||||
shared_ptr<ComputationNode<ValueType>> TypedCreateLearnableParameter(const std::wstring& paramName, const TensorShape& tensorShape); // V2
|
||||
|
||||
// sparse matrix size is optionally specified
|
||||
// ComputationNodePtr CreateSparseLearnableParameter(const std::wstring & paramName, const size_t rows, const size_t cols, const size_t size = 0);
|
||||
ComputationNodePtr CreateInputNode(const std::wstring& inputName, const size_t rows, const wstring& dynamicAxisName = L"");
|
||||
|
|
|
@ -111,7 +111,7 @@ ComputationNodeBasePtr ComputationNetwork::GetNestedNetwork(const ComputationNod
|
|||
// concurrent computation in bulk CUDA launches.
|
||||
// -----------------------------------------------------------------------
|
||||
|
||||
template<class ElemType> static bool DumpNode(ComputationNodeBasePtr nodep, bool dumpGradient);
|
||||
static bool DumpNode(ComputationNodeBasePtr nodep, bool dumpGradient);
|
||||
|
||||
ComputationNetwork::PARTraversalFlowControlNode::PARTraversalFlowControlNode(const std::vector<shared_ptr<SEQTraversalFlowControlNode>>& recurrentInfo, const std::list<ComputationNodeBasePtr>& allNodes /*must be in eval order*/)
|
||||
{
|
||||
|
@ -152,7 +152,7 @@ ComputationNetwork::PARTraversalFlowControlNode::PARTraversalFlowControlNode(con
|
|||
|
||||
// Extreme Tracing, part 1/4
|
||||
if (node->HasEnvironmentPtr() && node->Environment().ShouldDumpNode())
|
||||
DumpNode<float>(node, /*dumpGradient=*/false) || DumpNode<double>(node, false);
|
||||
DumpNode(node, /*dumpGradient=*/false);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -188,7 +188,7 @@ ComputationNetwork::PARTraversalFlowControlNode::PARTraversalFlowControlNode(con
|
|||
|
||||
// Extreme Tracing, part 2/4
|
||||
if (node->HasEnvironmentPtr() && node->Environment().ShouldDumpNode() && node->NeedsGradient())
|
||||
DumpNode<float>(node, /*dumpGradient=*/true) || DumpNode<double>(node, true);
|
||||
DumpNode(node, /*dumpGradient=*/true);
|
||||
}
|
||||
}
|
||||
/*virtual*/ void ComputationNetwork::PARTraversalFlowControlNode::RequestMatricesBeforeForwardProp(MatrixPool& matrixPool) /*override*/
|
||||
|
@ -207,18 +207,16 @@ ComputationNetwork::PARTraversalFlowControlNode::PARTraversalFlowControlNode(con
|
|||
{
|
||||
}
|
||||
|
||||
// helper for logging. Returns false if it was not able to dynamic-cast nodep to ComputationNode<ElemType>
|
||||
template<class ElemType>
|
||||
static bool DumpNode(ComputationNodeBasePtr nodep, bool dumpGradient)
|
||||
template<typename ElemType>
|
||||
bool TypedDumpNode(shared_ptr<ComputationNode<ElemType>> node, bool dumpGradient)
|
||||
{
|
||||
let node = dynamic_pointer_cast<ComputationNode<ElemType>>(nodep);
|
||||
if (!node)
|
||||
return false;
|
||||
let dataPtr = dumpGradient ? node->GradientPtr() : node->ValuePtr();
|
||||
if (!dataPtr)
|
||||
return true; // e.g. SEQ sentinel node
|
||||
|
||||
bool concise = !(nodep->Environment().IsLogLevelNodeTrace());
|
||||
bool concise = !(node->Environment().IsLogLevelNodeTrace());
|
||||
|
||||
fprintf(stderr, "Dump --> %s%s\n", node->FormatOperationPrototype("").c_str(), dumpGradient ? " Grad" : "");
|
||||
node->WriteMinibatchWithFormatting(stderr, FrameRange(), SIZE_MAX, SIZE_MAX, false/*transpose*/, /*isCategoryLabel=*/false, /*isSparse=*/false, std::vector<std::string>(),
|
||||
|
@ -227,6 +225,18 @@ static bool DumpNode(ComputationNodeBasePtr nodep, bool dumpGradient)
|
|||
return true;
|
||||
}
|
||||
|
||||
// helper for logging. Returns false if it was not able to dump
|
||||
static bool DumpNode(ComputationNodeBasePtr nodep, bool dumpGradient)
|
||||
{
|
||||
let nodef = dynamic_pointer_cast<ComputationNode<float>>(nodep);
|
||||
if (nodef) return TypedDumpNode<float>(nodef, dumpGradient);
|
||||
let noded = dynamic_pointer_cast<ComputationNode<double>>(nodep);
|
||||
if (noded) return TypedDumpNode<double>(noded, dumpGradient);
|
||||
let nodeh = dynamic_pointer_cast<ComputationNode<half>>(nodep);
|
||||
if (nodeh) return TypedDumpNode<half>(nodeh, dumpGradient);
|
||||
return false;
|
||||
}
|
||||
|
|
||||
// SEQTraversalFlowControlNode methods -- implements SEQ traversal (loop unrolling)
|
||||
//
|
||||
|
@ -280,7 +290,7 @@ static bool DumpNode(ComputationNodeBasePtr nodep, bool dumpGradient)
|
|||
{
|
||||
if (node->HasEnvironmentPtr() && node->Environment().ShouldDumpNode())
|
||||
{
|
||||
DumpNode<float>(node, /*dumpGradient=*/false) || DumpNode<double>(node, false);
|
||||
DumpNode(node, /*dumpGradient=*/false);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -321,7 +331,7 @@ static bool DumpNode(ComputationNodeBasePtr nodep, bool dumpGradient)
|
|||
{
|
||||
if (node->HasEnvironmentPtr() && node->Environment().ShouldDumpNode() && node->NeedsGradient())
|
||||
{
|
||||
DumpNode<float>(node, /*dumpGradient=*/true) || DumpNode<double>(node, true);
|
||||
DumpNode(node, /*dumpGradient=*/true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" InitialTargets="CheckDependencies" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<Project DefaultTargets="Build" InitialTargets="CheckDependencies" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|x64">
|
||||
<Configuration>Debug</Configuration>
|
||||
|
|
|
@ -85,18 +85,18 @@ void ComputationNode<ElemType>::Backprop(const FrameRange& fr, bool childrenInTh
|
|||
|
||||
for (size_t i = 0; i < m_inputs.size(); i++)
|
||||
{
|
||||
ComputationNodePtr child = Input(i);
|
||||
if (child->m_needsGradient &&
|
||||
ComputationNodeBasePtr child = m_inputs[i];
|
||||
if (child->NeedsGradient() &&
|
||||
((childrenInThisLoop && child->IsPartOfLoop() == IsPartOfLoop()) ||
|
||||
(childrenInOuterLoop && child->IsPartOfLoop() != IsPartOfLoop()) ))
|
||||
{
|
||||
// fprintf(stderr, "Backprop: %ls %ls operation -> child %d %ls %ls\n", NodeName().c_str(), OperationName().c_str(), (int)i, child->NodeName().c_str(), child->OperationName().c_str());
|
||||
if (!m_needsGradient)
|
||||
if (!NeedsGradient())
|
||||
LogicError("%ls %ls operation has m_needsGradient set to false but children require it.", NodeName().c_str(), OperationName().c_str());
|
||||
#if DUMPOUTPUT
|
||||
fprintf(stderr, "Backprop%d_%ls\n", i, NodeName().c_str());
|
||||
#endif
|
||||
child->LazyZeroGradient(this); // set gradient to 0 if this is the first time
|
||||
SMART_NODE_INVOKE(ComputationNode, child, LazyZeroGradient, this); // set gradient to 0 if this is the first time
|
||||
|
||||
// If we propagate from a loop to a node that is outside the loop, we are not efficient.
|
||||
// This case is handled by SEQTraversalFlowControlNode::Backprop().
|
||||
|
@ -108,7 +108,7 @@ void ComputationNode<ElemType>::Backprop(const FrameRange& fr, bool childrenInTh
|
|||
}
|
||||
|
||||
// before backprop, verify gradient optimization info
|
||||
Input(i)->VerifyGradientOptimization(this);
|
||||
SMART_NODE_INVOKE(ComputationNode, child, VerifyGradientOptimization, this);
|
||||
|
||||
// fprintf(stderr, "BackpropTo %d %d %ls %ls\n", (int)fr.timeIdxInSeq, (int)i, NodeName().c_str(), OperationName().c_str());
|
||||
BackpropTo(i, fr); // this computes partial wrt to the child and sums the gradient value in the child
|
||||
|
@ -753,11 +753,14 @@ template <class ElemType>
|
|||
{
|
||||
for (size_t i = 0; i < m_inputs.size(); i++)
|
||||
{
|
||||
ComputationNodePtr child = Input(i);
|
||||
if (child->m_needsGradient)
|
||||
ComputationNodeBasePtr child = m_inputs[i];
|
||||
if (child->NeedsGradient())
|
||||
{
|
||||
child->MaskMissingGradientColumnsToZero(FrameRange(child->GetMBLayout())); // HasNaN() operates on a whole matrix, so first flatten all gaps to 0
|
||||
if (child->Gradient().HasNan("EndBackprop"))
|
||||
SMART_NODE_INVOKE(ComputationNode, child, MaskMissingGradientColumnsToZero, FrameRange(child->GetMBLayout())); // HasNaN() operates on a whole matrix, so first flatten all gaps to 0
|
||||
|
||||
bool hasNan = false;
|
||||
SMART_NODE_INVOKE_WITH_RET(ComputationNode, child, Gradient().HasNan, hasNan, "EndBackprop");
|
||||
if (hasNan)
|
||||
{
|
||||
LogicError("%ls %ls operation unexpectedly produced NaN gradients on its input %ls.", NodeName().c_str(), OperationName().c_str(), child->NodeName().c_str());
|
||||
}
|
||||
|
@ -1004,7 +1007,7 @@ void ComputationNode<ElemType>::WriteMinibatchWithFormatting(FILE* f,
|
|||
double absSumLocal = 0;
|
||||
for (size_t j = 0; j < jend; j++) // loop over elements
|
||||
{
|
||||
absSumLocal += abs(seqData[i * istride + j * jstride]);
|
||||
absSumLocal += (double)abs(seqData[i * istride + j * jstride]);
|
||||
}
|
||||
absSum += absSumLocal;
|
||||
}
|
||||
|
@ -1136,6 +1139,7 @@ atomic_ullong TimeStamp::s_timeStampCounter = ATOMIC_VAR_INIT(0);
|
|||
|
||||
template <> map<size_t, map<size_t, shared_ptr<SingleMatrix>>> ComputationNode<float>::s_constOnes{};
|
||||
template <> map<size_t, map<size_t, shared_ptr<DoubleMatrix>>> ComputationNode<double>::s_constOnes{};
|
||||
template <> map<size_t, map<size_t, shared_ptr<HalfMatrix>>> ComputationNode<half>::s_constOnes{};
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// instantiate the core class templates
|
||||
|
@ -1143,6 +1147,7 @@ template <> map<size_t, map<size_t, shared_ptr<DoubleMatrix>>> ComputationNode<d
|
|||
|
||||
template class ComputationNode<float>;
|
||||
template class ComputationNode<double>;
|
||||
template class ComputationNode<half>;
|
||||
|
||||
}}}
|
||||
|
||||
|
|
|
@ -1375,7 +1375,7 @@ public:
|
|||
m_inputs.resize(inputs.size());
|
||||
for (size_t i = 0; i < m_inputs.size(); i++)
|
||||
if (inputs[i])
|
||||
m_inputs[i] = DownCast(inputs[i]); // (DownCast() checks the type; the assignment then downcasts it again)
|
||||
m_inputs[i] = inputs[i]; // remove DownCast check here to allow CastNode to pass
|
||||
else
|
||||
m_inputs[i] = nullptr; // during network creation, nullptrs are possible
|
||||
|
||||
|
@ -1430,6 +1430,19 @@ protected:
|
|||
return DownCast(m_inputs[inputIndex]);
|
||||
}
|
||||
|
||||
template<typename InputType>
inline shared_ptr<ComputationNode<InputType>> TypedInput(const size_t inputIndex) const
{
    if (inputIndex >= m_inputs.size())
        LogicError("Inputs: inputIndex %d is out of range for %ls %ls operation.", (int)inputIndex, NodeName().c_str(), OperationName().c_str());

    shared_ptr<ComputationNode<InputType>> node = dynamic_pointer_cast<ComputationNode<InputType>>(m_inputs[inputIndex]);
    if (!node)
        InvalidArgument("A TypedInput of mismatched precision was passed.");

    return node;
}
|
||||
|
||||
// Fast downcast without runtime type check of dynamic_pointer_cast.
|
||||
// Meant to be used in Forward and BackPropTo, assuming that Validate() has already used Input() which validated the correct types.
|
||||
inline ComputationNode<ElemType>& InputRef(const size_t inputIndex) const
|
||||
|
@ -1929,24 +1942,36 @@ protected:
|
|||
// if the matrix's size will scale with minibatch size, set mbScale = true
|
||||
// if workspace flag is true, the memory request will be treated specially. We assume workspace memory will share their own pointers
|
||||
// this is currently a workaround for workspace memory for convolutions
|
||||
void RequestMatrixFromPool(shared_ptr<Matrix<ElemType>>& matrixPtr, MatrixPool& matrixPool, size_t matrixSize=0, bool mbScale=false, bool isWorkSpace=false, bool aliasing=false)
|
||||
template<typename ValueType>
|
||||
void TypedRequestMatrixFromPool(shared_ptr<Matrix<ValueType>>& matrixPtr, MatrixPool& matrixPool, size_t matrixSize=0, bool mbScale=false, bool isWorkSpace=false, bool aliasing=false)
|
||||
{
|
||||
if (matrixPtr == nullptr)
|
||||
{
|
||||
if (aliasing)
|
||||
matrixPool.RequestAliasedAllocate<ElemType>(m_deviceId, this, &matrixPtr, matrixSize, mbScale);
|
||||
matrixPool.RequestAliasedAllocate<ValueType>(m_deviceId, this, &matrixPtr, matrixSize, mbScale);
|
||||
else
|
||||
matrixPool.RequestAllocate<ElemType>(m_deviceId, &matrixPtr, matrixSize, mbScale, isWorkSpace);
|
||||
matrixPool.RequestAllocate<ValueType>(m_deviceId, &matrixPtr, matrixSize, mbScale, isWorkSpace);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename ValueType>
|
||||
void TypedReleaseMatrixToPool(shared_ptr<Matrix<ValueType>>& matrixPtr, MatrixPool& matrixPool, bool aliasing=false)
|
||||
{
|
||||
assert(matrixPtr != nullptr);
|
||||
if (aliasing)
|
||||
matrixPool.RequestAliasedRelease<ValueType>(this);
|
||||
else
|
||||
matrixPool.RequestRelease<ValueType>(&matrixPtr);
|
||||
}
|
||||
|
||||
void RequestMatrixFromPool(shared_ptr<Matrix<ElemType>>& matrixPtr, MatrixPool& matrixPool, size_t matrixSize = 0, bool mbScale = false, bool isWorkSpace = false, bool aliasing = false)
|
||||
{
|
||||
TypedRequestMatrixFromPool<ElemType>(matrixPtr, matrixPool, matrixSize, mbScale, isWorkSpace, aliasing);
|
||||
}
|
||||
|
||||
void ReleaseMatrixToPool(shared_ptr<Matrix<ElemType>>& matrixPtr, MatrixPool& matrixPool, bool aliasing = false)
|
||||
{
|
||||
assert(matrixPtr != nullptr);
|
||||
if (aliasing)
|
||||
matrixPool.RequestAliasedRelease<ElemType>(this);
|
||||
else
|
||||
matrixPool.RequestRelease<ElemType>(&matrixPtr);
|
||||
TypedReleaseMatrixToPool<ElemType>(matrixPtr, matrixPool, aliasing);
|
||||
}
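Splitting the pool helpers into Typed* versions lets a node request scratch matrices whose element type differs from its own ElemType, which is what mixed-precision nodes need (for example FP32 batch-norm statistics inside an FP16 node). A toy illustration of a pool keyed by element type; TypedPool and its methods are invented for the sketch and are not the CNTK MatrixPool API.

#include <cstdio>
#include <map>
#include <memory>
#include <typeindex>
#include <typeinfo>
#include <vector>

class TypedPool
{
    // One free list per element type, so float, double and half requests never mix.
    std::map<std::type_index, std::vector<std::shared_ptr<void>>> m_free;
public:
    template <typename ValueType>
    std::shared_ptr<std::vector<ValueType>> Request(size_t size)
    {
        auto& freeList = m_free[std::type_index(typeid(ValueType))];
        if (!freeList.empty())
        {
            auto buf = std::static_pointer_cast<std::vector<ValueType>>(freeList.back());
            freeList.pop_back();
            buf->resize(size);
            return buf;
        }
        return std::make_shared<std::vector<ValueType>>(size);
    }
    template <typename ValueType>
    void Release(std::shared_ptr<std::vector<ValueType>> buf)
    {
        m_free[std::type_index(typeid(ValueType))].push_back(buf);
    }
};

int main()
{
    TypedPool pool;
    auto fp32Stats = pool.Request<float>(64);  // e.g. statistics scratch for an FP16 node
    auto fp64Accum = pool.Request<double>(16); // a different precision from the same pool
    pool.Release(fp32Stats);
    pool.Release(fp64Accum);
    std::printf("requested %zu and %zu elements\n", fp32Stats->size(), fp64Accum->size());
}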
|
||||
|
||||
public:
|
||||
|
@ -2114,7 +2139,7 @@ public:
|
|||
s_constOnes[rows].find(cols) == s_constOnes[rows].end()) // not found
|
||||
{
|
||||
shared_ptr<Matrix<ElemType>> matrix = make_shared<Matrix<ElemType>>(rows, cols, (DEVICEID_TYPE) deviceId);
|
||||
matrix->SetValue(1);
|
||||
matrix->SetValue((ElemType)1);
|
||||
s_constOnes[rows][cols] = matrix;
|
||||
}
|
||||
|
||||
|
@ -2516,4 +2541,26 @@ public:
|
|||
|
||||
#pragma endregion base computation class

#define SMART_NODE_INVOKE(nodeClass, node, func, ...) \
    do { \
        if (dynamic_pointer_cast<nodeClass<float>>(node)) \
            dynamic_pointer_cast<nodeClass<float>>(node)->func(__VA_ARGS__); \
        else if (dynamic_pointer_cast<nodeClass<double>>(node)) \
            dynamic_pointer_cast<nodeClass<double>>(node)->func(__VA_ARGS__); \
        else if (dynamic_pointer_cast<nodeClass<half>>(node)) \
            dynamic_pointer_cast<nodeClass<half>>(node)->func(__VA_ARGS__); \
        else \
            LogicError("Unknown nodeClass type"); \
    } while(0)

#define SMART_NODE_INVOKE_WITH_RET(nodeClass, node, func, ret, ...) \
    do { \
        if (dynamic_pointer_cast<nodeClass<float>>(node)) \
            ret = dynamic_pointer_cast<nodeClass<float>>(node)->func(__VA_ARGS__); \
        else if (dynamic_pointer_cast<nodeClass<double>>(node)) \
            ret = dynamic_pointer_cast<nodeClass<double>>(node)->func(__VA_ARGS__); \
        else if (dynamic_pointer_cast<nodeClass<half>>(node)) \
            ret = dynamic_pointer_cast<nodeClass<half>>(node)->func(__VA_ARGS__); \
        else LogicError("Unknown ComputationNode type"); \
    } while(0)
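These macros centralize the same cast-and-forward idea for arbitrary member calls: the caller holds only a base pointer, the macro tries float, double, then half and raises a LogicError if none match, and the _WITH_RET variant additionally captures the call's return value. A toy usage sketch under assumed stand-in types (Base/Node and a local LogicError), trimmed to two precisions:

#include <cstdio>
#include <memory>
#include <stdexcept>

// Toy stand-ins for the CNTK types; LogicError here just throws.
static void LogicError(const char* msg) { throw std::runtime_error(msg); }

struct Base { virtual ~Base() = default; };

template <typename ElemType>
struct Node : Base
{
    ElemType grad{};
    void LazyZeroGradient(const Base* /*parent*/) { grad = ElemType(0); }
};

// Same shape as SMART_NODE_INVOKE above: try each supported element type in
// turn and call func on the first cast that succeeds.
#define SMART_NODE_INVOKE(nodeClass, node, func, ...) \
    do { \
        if (auto p = std::dynamic_pointer_cast<nodeClass<float>>(node)) \
            p->func(__VA_ARGS__); \
        else if (auto p = std::dynamic_pointer_cast<nodeClass<double>>(node)) \
            p->func(__VA_ARGS__); \
        else \
            LogicError("Unknown nodeClass type"); \
    } while (0)

int main()
{
    std::shared_ptr<Base> parent = std::make_shared<Node<float>>();
    std::shared_ptr<Base> child = std::make_shared<Node<double>>();
    // The caller never names the precision; the macro resolves it at runtime.
    SMART_NODE_INVOKE(Node, child, LazyZeroGradient, parent.get());
    std::printf("gradient zeroed\n");
}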
}}}
|
||||
|
|
|
@@ -873,6 +873,7 @@ protected:

template class OneHotNode<float>;
template class OneHotNode<double>;
template class OneHotNode<half>;

#ifdef COMING_SOON

@@ -104,7 +104,7 @@ LearnableParameter<ElemType>::LearnableParameter(const ScriptableObjects::IConfi
    static unsigned long randomSeed = 1;
    int forcedRandomSeed = configp->Get(L"randomSeed"); // forcing a specific random seed is useful for testing to get repeatable initialization independent of evaluation order
    m_randomSeed = forcedRandomSeed < 0 ? randomSeed++ : (unsigned long)forcedRandomSeed;
    m_initValueScale = configp->Get(L"initValueScale");
    m_initValueScale = (ElemType)(float)configp->Get(L"initValueScale");
    m_initFilterRank = configp->Get(L"initFilterRank");
    m_initOutputRank = configp->Get(L"initOutputRank");
    m_initOnCPUOnly = configp->Get(L"initOnCPUOnly");

@@ -112,12 +112,12 @@ LearnableParameter<ElemType>::LearnableParameter(const ScriptableObjects::IConfi
    else if (initString == L"zero")
    {
        m_initString = L"fromValue";
        m_initValue = 0;
        m_initValue = (ElemType)0;
    }
    else if (initString == L"fromValue") // from 'initValue'
    {
        m_initString = initString;
        m_initValue = initValue;
        m_initValue = (ElemType)(float)initValue;
    }
    else if (initString == L"bilinear")
    {

@@ -138,7 +138,7 @@ LearnableParameter<ElemType>::LearnableParameter(const ScriptableObjects::IConfi
    else if (initString == L"fixedValue") // deprecated. Use initValue=... instead
    {
        m_initString = L"fromValue";
        m_initValue = (ElemType)configp->Get(L"value");
        m_initValue = (ElemType)(float)configp->Get(L"value");
    }
    else if (initString == L"fromLiteral") // deprecated. Use initValue=array instead
    {

@@ -549,7 +549,7 @@ void LearnableParameter<ElemType>::LazyInitParameters()
    if (m_initString == L"fromValue")
    {
        if (GetEnvironmentPtr() && Environment().traceLevel > 0) // note: this will not log before node is part of network
            fprintf(stderr, "%ls: Initializing Parameter[%s] <- %f.\n", NodeDescription().c_str(), string(GetSampleLayout()).c_str(), m_initValue);
            fprintf(stderr, "%ls: Initializing Parameter[%s] <- %f.\n", NodeDescription().c_str(), string(GetSampleLayout()).c_str(), (float)m_initValue);
        Value().SetValue(m_initValue);
    }
    else if (ParseRandomizationType(m_initString).second != 0)

@@ -651,5 +651,6 @@ template <class ElemType>

template class LearnableParameter<float>;
template class LearnableParameter<double>;
template class LearnableParameter<half>;

}}}
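A pattern that recurs throughout this file is the two-step cast (ElemType)(float)value and the explicit (float) before fprintf. A plausible reading, not stated in the commit: with half now a possible ElemType, conversions are routed through float, which fp16 types typically interoperate with, and half values cannot be passed through C varargs. The toy sketch below uses an invented half stand-in (not the CNTK type) to illustrate the idea.

// Toy fp16 stand-in: converts to/from float, but construction from double is disallowed,
// mirroring fp16 wrapper types where narrowing from double is ambiguous or unavailable.
#include <cstdio>

struct half
{
    unsigned short bits = 0;
    half() = default;
    explicit half(float f) { bits = static_cast<unsigned short>(f); } // toy conversion
    half(double) = delete;
    explicit operator float() const { return static_cast<float>(bits); }
};

template <class ElemType>
void InitFromConfig(double configValue)
{
    // ElemType v = (ElemType)configValue;              // does not compile for this toy half
    ElemType v = (ElemType)(float)configValue;          // always goes through float
    std::fprintf(stderr, "init <- %f\n", (float)v);     // varargs take float/double, never half
}

int main()
{
    InitFromConfig<float>(0.5);
    InitFromConfig<half>(0.5);
}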
@@ -108,7 +108,7 @@ private:
    {
        fprintf(stderr, "%ls: Initializing Parameter[%s] <- %ls(seed=%d, init dims=[%d x %d], range=%f(%f*%f), onCPU=%s.\n)",
                NodeDescription().c_str(), string(GetSampleLayout()).c_str(), m_initString.c_str(),
                (int)randomSeed, (int)fanOut, (int)fanIn, range, range/initValueScale, initValueScale, initOnCPUOnly ? "true" : "false");
                (int)randomSeed, (int)fanOut, (int)fanIn, (float)range, (float)(range/initValueScale), (float)(initValueScale), initOnCPUOnly ? "true" : "false");
    }
}

@@ -41,6 +41,9 @@ template void Microsoft::MSR::CNTK::UpdateRunningAverage<float>(ComputationNode<
template void Microsoft::MSR::CNTK::UpdateRunningAverage<double>(ComputationNode<double>& newInput,
                                                                 TensorView<double>& runningAverage,
                                                                 size_t& runningCount);
template void Microsoft::MSR::CNTK::UpdateRunningAverage<half>(ComputationNode<half>& newInput,
                                                               TensorView<half>& runningAverage,
                                                               size_t& runningCount);

template <class ElemType>
EpochAccumulatorNode<ElemType>::EpochAccumulatorNode(DEVICEID_TYPE deviceId, const wstring& name)

@@ -128,3 +131,4 @@ void EpochAccumulatorNode<ElemType>::Reset()

template class EpochAccumulatorNode<float>;
template class EpochAccumulatorNode<double>;
template class EpochAccumulatorNode<half>;
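UpdateRunningAverage is now also instantiated for half. For reference, a generic sketch of an incremental running-average update is shown below (plain vectors instead of TensorView; not the CNTK implementation). Accumulating many terms in a low-precision type loses precision, which is one reason a wider accumulator type is often preferable.

// Incremental mean: avg_n = avg_{n-1} + (x - avg_{n-1}) / n, accumulated in AccumType.
#include <cstddef>
#include <vector>

template <class AccumType, class ElemType>
void UpdateRunningAverage(const std::vector<ElemType>& newInput,
                          std::vector<AccumType>& runningAverage,
                          std::size_t& runningCount)
{
    ++runningCount;
    const AccumType alpha = AccumType(1) / AccumType(runningCount);
    for (std::size_t i = 0; i < newInput.size(); ++i)
        runningAverage[i] += alpha * (AccumType(newInput[i]) - runningAverage[i]);
}

int main()
{
    std::vector<float> avg(3, 0.0f);
    std::size_t count = 0;
    UpdateRunningAverage<float>(std::vector<float>{1, 2, 3}, avg, count);
    UpdateRunningAverage<float>(std::vector<float>{3, 4, 5}, avg, count);   // avg == {2, 3, 4}
}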
@ -91,6 +91,7 @@ public:
|
|||
|
||||
template class PlusNode<float>;
|
||||
template class PlusNode<double>;
|
||||
template class PlusNode<half>;
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// LogPlusNode (summand1, summand2)
|
||||
|
@ -165,6 +166,7 @@ public:
|
|||
|
||||
template class LogPlusNode<float>;
|
||||
template class LogPlusNode<double>;
|
||||
template class LogPlusNode<half>;
|
||||
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
|
@ -225,6 +227,7 @@ public:
|
|||
|
||||
template class PowNode<float>;
|
||||
template class PowNode<double>;
|
||||
template class PowNode<half>;
|
||||
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
|
@ -287,6 +290,7 @@ public:
|
|||
|
||||
template class MinusNode<float>;
|
||||
template class MinusNode<double>;
|
||||
template class MinusNode<half>;
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// ElementTimesNode (factor1, factor2)
|
||||
|
@ -360,6 +364,7 @@ public:
|
|||
|
||||
template class ElementTimesNode<float>;
|
||||
template class ElementTimesNode<double>;
|
||||
template class ElementTimesNode<half>;
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// TimesNodeBase (A, B, outputRank=1)
|
||||
|
@ -606,7 +611,7 @@ private:
|
|||
Matrix<ElemType> inputValueSlice = unpackedInputValue.ColumnSlice(s * maxNumTimeSteps, maxNumTimeSteps); // k x s*
|
||||
inputValueSlice.Reshape(k * maxNumTimeSteps, 1); // (k * s*) x 1
|
||||
Matrix<ElemType> gradientSlice = Gradient().ColumnSlice(s, 1); // m x 1
|
||||
Matrix<ElemType>::MultiplyAndWeightedAdd(1, gradientSlice, false, inputValueSlice, true, unpacked[inputIndex] ? 0 : beta, inputGradientSlice);
|
||||
Matrix<ElemType>::MultiplyAndWeightedAdd(1, gradientSlice, false, inputValueSlice, true, unpacked[inputIndex] ? (ElemType)0 : beta, inputGradientSlice);
|
||||
}
|
||||
|
||||
if (unpacked[inputIndex])
|
||||
|
@ -624,7 +629,7 @@ private:
|
|||
Matrix<ElemType> inputValueSlice = unpackedInputValue.ColumnSlice(s * maxNumTimeSteps, maxNumTimeSteps); // (m * k) x s*
|
||||
inputValueSlice.Reshape(m, k * maxNumTimeSteps); // m x (k * s*)
|
||||
Matrix<ElemType> gradientSlice = Gradient().ColumnSlice(s, 1); // m x 1
|
||||
Matrix<ElemType>::MultiplyAndWeightedAdd(1, inputValueSlice, true, gradientSlice, false, unpacked[inputIndex] ? 0 : beta, inputGradientSlice);
|
||||
Matrix<ElemType>::MultiplyAndWeightedAdd(1, inputValueSlice, true, gradientSlice, false, unpacked[inputIndex] ? (ElemType)0 : beta, inputGradientSlice);
|
||||
}
|
||||
|
||||
if (unpacked[inputIndex])
|
||||
|
@ -1113,6 +1118,7 @@ public:
|
|||
|
||||
template class TimesNode<float>;
|
||||
template class TimesNode<double>;
|
||||
template class TimesNode<half>;
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// TransposeTimesNode (A', B)
|
||||
|
@ -1142,6 +1148,7 @@ public:
|
|||
|
||||
template class TransposeTimesNode<float>;
|
||||
template class TransposeTimesNode<double>;
|
||||
template class TransposeTimesNode<half>;
|
||||
|
||||
// Fixed-point matrix product. This scales inputs to 16bit signed integers by Symmetric quantizers, performs
|
||||
// integer multiplication using SSE/AVX2, and transforms the results back.
|
||||
|
@ -1232,6 +1239,7 @@ public:
|
|||
|
||||
template class QuantizedTimesNode<float>;
|
||||
template class QuantizedTimesNode<double>;
|
||||
template class QuantizedTimesNode<half>;
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// SumElementsNode (input)
|
||||
|
@ -1467,6 +1475,7 @@ private:
|
|||
|
||||
template class TransposeDimensionsNode<float>;
|
||||
template class TransposeDimensionsNode<double>;
|
||||
template class TransposeDimensionsNode<half>;
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// CosDistanceNode (left, right)
|
||||
|
@ -1587,6 +1596,7 @@ private:
|
|||
|
||||
template class CosDistanceNode<float>;
|
||||
template class CosDistanceNode<double>;
|
||||
template class CosDistanceNode<half>;
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// KhatriRaoProductNode (left, right)
|
||||
|
@ -1913,6 +1923,7 @@ private:
|
|||
|
||||
template class CosDistanceWithNegativeSamplesNode<float>;
|
||||
template class CosDistanceWithNegativeSamplesNode<double>;
|
||||
template class CosDistanceWithNegativeSamplesNode<half>;
|
||||
|
||||
template <class ElemType>
|
||||
void UpdateRunningAverage(ComputationNode<ElemType>& newInput, TensorView<ElemType>& runningAverage,
|
||||
|
@@ -1992,4 +2003,48 @@ protected:
    size_t m_numSamples;
};

// -----------------------------------------------------------------------
// CastNode converts data types from InputType to ElemType
// -----------------------------------------------------------------------
template <class ElemType, class InputType>
class CastNode : public UnaryElementWiseNode<ElemType>
{
    typedef UnaryElementWiseNode<ElemType> Base; UsingUnaryElementwiseNodeBaseMembers;
    static const std::wstring TypeName() { return L"Cast"; }

public:
    CastNode(DEVICEID_TYPE deviceId, const wstring& name)
        : Base(deviceId, name)
    {
    }

    virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override
    {
        auto result = ValueFor(fr);
        auto input = static_cast<ComputationNode<InputType>&>(*m_inputs[0].get()).ValueFor(fr);
        result.CastAssignValuesOf(input);
    }

    virtual void /*ComputationNode::*/ BackpropTo(const size_t /*inputIndex*/, const FrameRange& fr) override
    {
        auto grad = GradientFor(fr);
        auto inputGrad = static_cast<ComputationNode<InputType>&>(*m_inputs[0].get()).GradientFor(fr);
        inputGrad.CastAssignValuesOf(grad);
    }

    virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override
    {
        ValidateUnaryMap(isFinalValidationPass);
    }

    virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; }
    virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; }
};

template class CastNode<half, float>;
template class CastNode<half, double>;
template class CastNode<float, half>;
template class CastNode<float, double>;
template class CastNode<double, half>;
template class CastNode<double, float>;
}}}
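CastNode forwards values unchanged apart from the element-type conversion, so its backward pass simply re-casts the incoming gradient (the cast is the identity on values, so its Jacobian is the identity). A self-contained sketch of that behavior over plain buffers follows; like the node above, the backward step overwrites rather than accumulates. This is not CNTK code.

// Element-wise cast: forward converts InputType -> ElemType, backward converts the
// output gradient back to InputType without changing its values.
#include <vector>

template <class ElemType, class InputType>
void CastForward(const std::vector<InputType>& in, std::vector<ElemType>& out)
{
    out.resize(in.size());
    for (size_t i = 0; i < in.size(); ++i)
        out[i] = static_cast<ElemType>(in[i]);
}

template <class ElemType, class InputType>
void CastBackward(const std::vector<ElemType>& outGrad, std::vector<InputType>& inGrad)
{
    inGrad.resize(outGrad.size());
    for (size_t i = 0; i < outGrad.size(); ++i)
        inGrad[i] = static_cast<InputType>(outGrad[i]);   // gradient of the identity, re-cast
}

int main()
{
    std::vector<double> in{0.25, 0.5};
    std::vector<float> fwd;
    CastForward<float>(in, fwd);          // double -> float

    std::vector<float> outGrad{1.0f, 1.0f};
    std::vector<double> inGrad;
    CastBackward(outGrad, inGrad);        // float -> double, values unchanged
}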
@@ -72,6 +72,7 @@ public:
protected:
    vector<MemRequestInfo<float>> m_memRequestInfoFloatVec;
    vector<MemRequestInfo<double>> m_memRequestInfoDoubleVec;
    vector<MemRequestInfo<half>> m_memRequestInfoHalfVec;
    set<DEVICEID_TYPE> m_deviceIDSet;
    int m_stepCounter;

@@ -151,6 +152,7 @@ public:
    // MatrixPool is not templated, so we call both float and double versions here
    OptimizedMemoryAllocationFunc<float>();
    OptimizedMemoryAllocationFunc<double>();
    OptimizedMemoryAllocationFunc<half>();
    return;
}
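Because MatrixPool is not a class template, it keeps one request list per element type and now needs a third list for half. A minimal sketch of that design is below; DemoPool, MemRequest, and the half stand-in are invented for illustration and are not the CNTK types.

// One request bucket per element type, selected via explicit specializations.
#include <cstddef>
#include <vector>

struct half { unsigned short bits; };   // stand-in fp16 type

struct MemRequest { std::size_t bytes; };

class DemoPool
{
    std::vector<MemRequest> m_float, m_double, m_half;

    template <class T> std::vector<MemRequest>& bucket();

public:
    template <class T>
    void Request(std::size_t count) { bucket<T>().push_back({count * sizeof(T)}); }
};

template <> std::vector<MemRequest>& DemoPool::bucket<float>()  { return m_float; }
template <> std::vector<MemRequest>& DemoPool::bucket<double>() { return m_double; }
template <> std::vector<MemRequest>& DemoPool::bucket<half>()   { return m_half; }

int main()
{
    DemoPool pool;
    pool.Request<float>(256);
    pool.Request<half>(1024);   // goes to the fp16 bucket
}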
@@ -446,6 +446,7 @@ public:

template class HardmaxNode<float>;
template class HardmaxNode<double>;
template class HardmaxNode<half>;


@@ -517,7 +518,8 @@ public:
    CreateMatrixIfNull(m_steps);
    auto dim = Input(0)->GetSampleLayout().GetDimPadded(0);
    auto tmp = new ElemType[numCols];
    std::generate(tmp, tmp + numCols, [i = ElemType(0), dim]() mutable { auto ret = i; i += dim; return ret; });
    ElemType i = ElemType(0);
    std::generate(tmp, tmp + numCols, [&i, dim]() mutable { auto ret = i; i += dim; return ret; });
    m_steps->SetValue(1, numCols, this->m_deviceId, tmp);
    delete[] tmp;
    m_sortedIndices->ScaleAndAdd(ElemType(1), *m_steps, *m_sortedIndices);

@@ -768,7 +770,8 @@ public: \
}; \
\
template class ClassName<float>; \
template class ClassName<double>;
template class ClassName<double>; \
template class ClassName<half>;

DefineComparisonNode(LessNode, -1, 0)
DefineComparisonNode(EqualNode, 0, 0)
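The two std::generate calls above are equivalent ways of filling tmp with 0, dim, 2*dim, ...; the commit replaces the lambda init-capture with an explicit local captured by reference. The commit does not state why; a toolchain quirk with init-captures is one plausible guess. A minimal illustration of the equivalence:

#include <algorithm>
#include <vector>

int main()
{
    const float dim = 3.0f;
    std::vector<float> a(4), b(4);

    // init-capture form
    std::generate(a.begin(), a.end(), [i = 0.0f, dim]() mutable { auto ret = i; i += dim; return ret; });

    // explicit local captured by reference
    float i = 0.0f;
    std::generate(b.begin(), b.end(), [&i, dim]() mutable { auto ret = i; i += dim; return ret; });
    // a == b == {0, 3, 6, 9}
}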
|
|
@ -341,5 +341,6 @@ void OptimizedRNNStackNode<ElemType>::UnpackSequencesFromCuDNN(const Matrix<Elem
|
|||
|
||||
template class OptimizedRNNStackNode<float>;
|
||||
template class OptimizedRNNStackNode<double>;
|
||||
template class OptimizedRNNStackNode<half>;
|
||||
|
||||
}}}
|
||||
|
|
|
@ -587,9 +587,11 @@ template<class ElemType, int direction>
|
|||
// instantiate the classes that derive from the above
|
||||
template class PastValueNode<float>;
|
||||
template class PastValueNode<double>;
|
||||
template class PastValueNode<half>;
|
||||
|
||||
template class FutureValueNode<float>;
|
||||
template class FutureValueNode<double>;
|
||||
template class FutureValueNode<half>;
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// DelayedValueNodeState -- helper class for exporting/importing state from/to DelayedValueNodes.
|
||||
|
|
|
@ -373,6 +373,7 @@ template <class ElemType>
|
|||
|
||||
template class ReduceElementsNode<float>;
|
||||
template class ReduceElementsNode<double>;
|
||||
template class ReduceElementsNode<half>;
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// Where(bitVector) -- extract indices of non-0 values in a sequence
|
||||
|
@ -490,6 +491,7 @@ template <class ElemType>
|
|||
|
||||
template class WhereNode<float>;
|
||||
template class WhereNode<double>;
|
||||
template class WhereNode<half>;
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// PackedIndexNode(targetObject, indexSequence) -- map sequence
|
||||
|
@ -551,6 +553,7 @@ template <class ElemType>
|
|||
|
||||
template class PackedIndexNode<float>;
|
||||
template class PackedIndexNode<double>;
|
||||
template class PackedIndexNode<half>;
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// GatherPackedNode(packedIndex, sourceData) -- gather operation
|
||||
|
@ -623,6 +626,7 @@ template <class ElemType>
|
|||
|
||||
template class GatherPackedNode<float>;
|
||||
template class GatherPackedNode<double>;
|
||||
template class GatherPackedNode<half>;
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// ScatterPackedNode(layoutData, packedIndex, sourceData) -- scatter operation
|
||||
|
@ -686,6 +690,7 @@ template <class ElemType>
|
|||
|
||||
template class ScatterPackedNode<float>;
|
||||
template class ScatterPackedNode<double>;
|
||||
template class ScatterPackedNode<half>;
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// CropNode -- crop operation, crops first input according to shape of second
|
||||
|
@ -1026,5 +1031,6 @@ bool CropNode<ElemType>::SupportsTransformOnInput(size_t inputIndex)
|
|||
|
||||
template class CropNode<float>;
|
||||
template class CropNode<double>;
|
||||
template class CropNode<half>;
|
||||
|
||||
}}}
|
||||
|
|
|
@ -878,8 +878,8 @@ public:
|
|||
|
||||
|
||||
public:
|
||||
PaddingNode(DEVICEID_TYPE deviceId, const wstring& name, std::vector<size_t> head, std::vector<size_t> foot, PaddingType mode = PaddingType::CONSTANTPAD, ElemType constantValue = 0)
|
||||
: Base(deviceId, name), m_head(head), m_foot(foot), m_mode(mode), m_constant_value(constantValue)
|
||||
PaddingNode(DEVICEID_TYPE deviceId, const wstring& name, std::vector<size_t> head, std::vector<size_t> foot, PaddingType mode = PaddingType::CONSTANTPAD, double constantValue = 0)
|
||||
: Base(deviceId, name), m_head(head), m_foot(foot), m_mode(mode), m_constant_value((ElemType)constantValue)
|
||||
{
|
||||
}
|
||||
|
||||
|
|
|
@ -312,8 +312,8 @@ class UnpackSequenceNode : public ComputationNodeNonLooping<ElemType>, public Mu
|
|||
|
||||
public:
|
||||
DeclareConstructorFromConfig(UnpackSequenceNode);
|
||||
UnpackSequenceNode(DEVICEID_TYPE deviceId, const wstring& name, ElemType paddingValue = 0, bool suppressMaskOutput = false)
|
||||
: Base(deviceId, name), MultiOutputNode<ElemType>(suppressMaskOutput ? 1 : 2), m_paddingValue(paddingValue), m_suppressMaskOutput(suppressMaskOutput)
|
||||
UnpackSequenceNode(DEVICEID_TYPE deviceId, const wstring& name, double paddingValue = 0, bool suppressMaskOutput = false)
|
||||
: Base(deviceId, name), MultiOutputNode<ElemType>(suppressMaskOutput ? 1 : 2), m_paddingValue((ElemType)paddingValue), m_suppressMaskOutput(suppressMaskOutput)
|
||||
{}
|
||||
|
||||
virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override
|
||||
|
|
|
@ -144,5 +144,6 @@ template <class ElemType>
|
|||
|
||||
template class TraceNode<float>;
|
||||
template class TraceNode<double>;
|
||||
template class TraceNode<half>;
|
||||
|
||||
}}}
|
||||
|
|
|
@ -47,6 +47,7 @@ template <class ElemType>
|
|||
|
||||
template class RandomDistributionNode<float>;
|
||||
template class RandomDistributionNode<double>;
|
||||
template class RandomDistributionNode<half>;
|
||||
|
||||
template<class ElemType>
|
||||
void RandomSampleNodeBase<ElemType>::Validate(bool isFinalValidationPass)
|
||||
|
@ -108,9 +109,9 @@ void RandomSampleNodeBase<ElemType>::UpdateWeightsPrefixSum()
|
|||
{
|
||||
ElemType currentWeight = samplingWeights.GetValue(iClass, 0);
|
||||
if (currentWeight < 0)
|
||||
InvalidArgument("Sampling weights contain negative number %f.", currentWeight);
|
||||
InvalidArgument("Sampling weights contain negative number %f.", (float)currentWeight);
|
||||
|
||||
runningWeightsSum += currentWeight;
|
||||
runningWeightsSum += (double)currentWeight;
|
||||
m_samplingWeightsPrefixSum.push_back(runningWeightsSum);
|
||||
}
|
||||
}
|
||||
|
@ -231,6 +232,7 @@ bool RandomSampleNode<ElemType>::IsOutOfDateWrtInputs() const
|
|||
|
||||
template class RandomSampleNode<float>;
|
||||
template class RandomSampleNode<double>;
|
||||
template class RandomSampleNode<half>;
|
||||
|
||||
template<class ElemType>
|
||||
double RandomSampleInclusionFrequencyNode<ElemType>::EstimateNumberOfTries()
|
||||
|
@ -303,6 +305,7 @@ void RandomSampleInclusionFrequencyNode<ElemType>::Validate(bool isFinalValidati
|
|||
|
||||
template class RandomSampleInclusionFrequencyNode<float>;
|
||||
template class RandomSampleInclusionFrequencyNode<double>;
|
||||
template class RandomSampleInclusionFrequencyNode<half>;
|
||||
|
||||
template<class ElemType>
|
||||
void DropoutNode<ElemType>::Save(File& fstream) const
|
||||
|
@ -336,8 +339,10 @@ void BatchNormalizationNode<ElemType>::AttachInputs(const std::vector<Computatio
|
|||
|
||||
template class DropoutNode<float>;
|
||||
template class DropoutNode<double>;
|
||||
template class DropoutNode<half>;
|
||||
|
||||
template class BatchNormalizationNode<float>;
|
||||
template class BatchNormalizationNode<double>;
|
||||
template class BatchNormalizationNode<half>;
|
||||
|
||||
}}}
|
||||
|
|
|
@ -525,7 +525,7 @@ public:
|
|||
{
|
||||
Url& UrlJ = *itUrlJ;
|
||||
discountJ = m_logWeights[UrlJ.m_rank];
|
||||
if (abs(gainI - UrlJ.m_gain) < 0.0000001)
|
||||
if (abs(gainI - UrlJ.m_gain) < (ElemType)0.0000001)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
@ -534,7 +534,7 @@ public:
|
|||
lambdaIJ = (gainI - UrlJ.m_gain) * (discountI - discountJ) / (discountI * discountJ);
|
||||
|
||||
// |delta NDCG|
|
||||
lambdaIJ = (idealMetric == 0.0 ? (ElemType) 0.0 : abs(lambdaIJ / idealMetric));
|
||||
lambdaIJ = (idealMetric == (ElemType)0.0 ? (ElemType) 0.0 : (ElemType)abs(lambdaIJ / idealMetric));
|
||||
|
||||
// Combine lambda
|
||||
lambdaIJ = lambdas(0, pairsCount++) * lambdaIJ;
|
||||
|
@ -2394,6 +2394,8 @@ class BatchNormalizationNode : public ComputationNodeNonLooping<ElemType>, publi
|
|||
typedef ComputationNodeNonLooping<ElemType> Base; UsingComputationNodeMembersBoilerplate;
|
||||
static const std::wstring TypeName() { return L"BatchNormalization"; }
|
||||
|
||||
typedef typename std::conditional<std::is_same<ElemType, half>::value, float, ElemType>::type StatType;
|
||||
|
||||
// inputs
|
||||
// TODO: Change all of these throughout the codebase to 'class enum'. Also change all places where we still use integer constants.
|
||||
static const size_t DATA = 0;
|
||||
|
@ -2412,7 +2414,7 @@ public:
|
|||
m_one(1, 1, deviceId),
|
||||
m_convertRunningVariancePending(false)
|
||||
{
|
||||
m_one.SetValue((ElemType)1); // (constant value used for GPU-side update of runCount)
|
||||
m_one.SetValue((StatType)1); // (constant value used for GPU-side update of runCount)
|
||||
}
|
||||
BatchNormalizationNode(const ScriptableObjects::IConfigRecordPtr configp) :
|
||||
BatchNormalizationNode(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"spatial"),
|
||||
|
@ -2564,14 +2566,14 @@ private: // time-constant conversions
|
|||
void ResetRunCount()
|
||||
{
|
||||
if (HasTiedRunCount())
|
||||
Input(RUN_COUNT)->Value().SetValue(0);
|
||||
this->template TypedInput<StatType>(RUN_COUNT)->Value().SetValue(0);
|
||||
m_runCountUntied = 0;
|
||||
}
|
||||
void AggregateRunCount(size_t countToAdd)
|
||||
{
|
||||
if (HasTiedRunCount())
|
||||
{
|
||||
Input(RUN_COUNT)->Value().AddWithScaleOf(/*alpha=*/(ElemType)countToAdd, m_one); // this += countToAdd * (1)
|
||||
this->template TypedInput<StatType>(RUN_COUNT)->Value().AddWithScaleOf(/*alpha=*/(StatType)countToAdd, m_one); // this += countToAdd * (1)
|
||||
if (countToAdd != 0)
|
||||
m_runCountUntied = SIZE_MAX; // we only need this for 0 checks, this value says we only know it's not 0
|
||||
}
|
||||
|
@ -2581,7 +2583,7 @@ private: // time-constant conversions
|
|||
size_t RunCount() const // const version of above; keep identical
|
||||
{
|
||||
if (HasTiedRunCount())
|
||||
m_runCountUntied = (size_t)Input(RUN_COUNT)->Value().Get00Element(); // if needed then cache it over
|
||||
m_runCountUntied = (size_t)this->template TypedInput<StatType>(RUN_COUNT)->Value().Get00Element(); // if needed then cache it over
|
||||
return m_runCountUntied;
|
||||
}
|
||||
bool IsRunCount0() const { return m_runCountUntied == 0 && RunCount() == 0; } // tied count >= untied one, so we can ask the untied one first to avoid GPU sync
|
||||
|
@ -2664,10 +2666,10 @@ public:
|
|||
FrameRange fr(Input(DATA)->GetMBLayout());
|
||||
|
||||
Matrix<ElemType> sliceInputValue = Input(DATA)->MaskedValueFor(fr);
|
||||
const Matrix<ElemType>& scale = Input(SCALE)->Value();
|
||||
const Matrix<ElemType>& bias = Input(BIAS)->Value();
|
||||
Matrix<ElemType>& runMean = Input(RUN_MEAN)->Value();
|
||||
Matrix<ElemType>& runVariance = Input(RUN_VAR)->Value();
|
||||
const Matrix<StatType>& scale = this->template TypedInput<StatType>(SCALE)->Value();
|
||||
const Matrix<StatType>& bias = this->template TypedInput<StatType>(BIAS)->Value();
|
||||
Matrix<StatType>& runMean = this->template TypedInput<StatType>(RUN_MEAN)->Value();
|
||||
Matrix<StatType>& runVariance = this->template TypedInput<StatType>(RUN_VAR)->Value();
|
||||
Matrix<ElemType> sliceOutputValue = ValueFor(fr);
|
||||
|
||||
assert(scale.GetNumRows() == bias.GetNumRows());
|
||||
|
@ -2718,8 +2720,8 @@ public:
|
|||
{
|
||||
auto sliceOutputGrad = MaskedGradientFor(fr);
|
||||
auto sliceInputValue = Input(DATA)->ValueFor(fr);
|
||||
const Matrix<ElemType>& scale = Input(SCALE)->Value();
|
||||
const Matrix<ElemType>& bias = Input(BIAS)->Value();
|
||||
const Matrix<StatType>& scale = this->template TypedInput<StatType>(SCALE)->Value();
|
||||
const Matrix<StatType>& bias = this->template TypedInput<StatType>(BIAS)->Value();
|
||||
|
||||
// If inputIndex is not DATA and we get here, then it means that DATA receives no gradient.
|
||||
// However, the underlying engine does not foresee this case, and thus always needs a place
|
||||
|
@ -2752,19 +2754,19 @@ public:
|
|||
{
|
||||
assert(m_gradientValid);
|
||||
|
||||
if (Input(SCALE)->IsGradientInitializedBy(this))
|
||||
Input(SCALE)->Gradient().AssignValuesOf(*m_dScale);
|
||||
if (this->template TypedInput<StatType>(SCALE)->IsGradientInitializedBy(this))
|
||||
this->template TypedInput<StatType>(SCALE)->Gradient().AssignValuesOf(*m_dScale);
|
||||
else
|
||||
Input(SCALE)->Gradient() += *m_dScale;
|
||||
this->template TypedInput<StatType>(SCALE)->Gradient() += *m_dScale;
|
||||
}
|
||||
else if (inputIndex == BIAS) // derivative with respect to the bias, precomputed during input derivative computation
|
||||
{
|
||||
assert(m_gradientValid);
|
||||
|
||||
if (Input(BIAS)->IsGradientInitializedBy(this))
|
||||
Input(BIAS)->Gradient().AssignValuesOf(*m_dBias);
|
||||
if (this->template TypedInput<StatType>(BIAS)->IsGradientInitializedBy(this))
|
||||
this->template TypedInput<StatType>(BIAS)->Gradient().AssignValuesOf(*m_dBias);
|
||||
else
|
||||
Input(BIAS)->Gradient() += *m_dBias;
|
||||
this->template TypedInput<StatType>(BIAS)->Gradient() += *m_dBias;
|
||||
}
|
||||
// No derivatives with respect to running mean and variance.
|
||||
}
|
||||
|
@ -2797,7 +2799,7 @@ public:
|
|||
// running statistics inputs must be learnable parameters, since we update them directly here
|
||||
for (size_t i = RUN_MEAN; i < GetNumInputs(); i++)
|
||||
//if (!Input(i)->Is<LearnableParameter<ElemType>>()) // somehow this does not compile on gcc (works on VS)
|
||||
if (!dynamic_cast<LearnableParameter<ElemType>*>(Input(i).get()))
|
||||
if (!dynamic_cast<LearnableParameter<StatType>*>(this->template TypedInput<StatType>(i).get()))
|
||||
InvalidArgument("%ls: Inputs [%d..%d] must be learnable parameters.", NodeDescription().c_str(), (int)RUN_MEAN, (int)GetNumInputs());
|
||||
|
||||
// infer dimensions of learnable parameters
|
||||
|
@ -2807,7 +2809,7 @@ public:
|
|||
#if 1 // Workaround for today's definition: Trigger on [0 x 1] and infer that 0 as the total # elements needed.
|
||||
for (size_t i = SCALE; i < RUN_COUNT; i++) // scale, bias, run_mean, and run_variance
|
||||
{
|
||||
auto paramLayout = Input(i)->GetSampleLayout();
|
||||
auto paramLayout = this->template TypedInput<StatType>(i)->GetSampleLayout();
|
||||
if (paramLayout.GetRank() == 2 && paramLayout[0] == 0 && paramLayout[1] == 1 && inputLayout.GetNumElements() > 0) // [0 x 1]
|
||||
{
|
||||
size_t total = m_spatial ? inputLayout.GetDims().back() : inputLayout.GetNumElements();
|
||||
|
@ -2841,10 +2843,11 @@ public:
|
|||
// check inputs
|
||||
for (size_t i = SCALE; i < RUN_COUNT; i++) // scale, bias, run_mean, and run_variance
|
||||
{
|
||||
if (Input(i)->HasMBLayout())
|
||||
auto inputPtr = this->template TypedInput<StatType>(i);
|
||||
if (inputPtr->HasMBLayout())
|
||||
InvalidArgument("%ls: Input[%d] has a dynamic axis. BatchNormalization parameters cannot have that.", NodeDescription().c_str(), (int)i);
|
||||
auto paramLayout = Input(i)->GetSampleLayout();
|
||||
if (paramLayout != Input(SCALE)->GetSampleLayout())
|
||||
auto paramLayout = inputPtr->GetSampleLayout();
|
||||
if (paramLayout != this->template TypedInput<StatType>(SCALE)->GetSampleLayout())
|
||||
InvalidArgument("%ls: Input[%d] has a layout different from Input[1]. All must be identical.", NodeDescription().c_str(), (int)i);
|
||||
#if 0 // BUGBUG: For this to work, parameter shapes must be correct (cf. comment above on inference).
|
||||
if (paramLayout.GetRank() > inputLayout.GetRank())
|
||||
|
@ -2857,9 +2860,9 @@ public:
|
|||
if (HasTiedRunCount()) // 0-th order statistics (count) (optional for backcompat with old code which didn't correctly share it)
|
||||
{
|
||||
// This must always be a [1] tensor. No inference allowed.
|
||||
size_t i = RUN_COUNT;
|
||||
if (Input(i)->HasMBLayout() || (Input(i)->GetSampleLayout().GetRank() > 1) || (Input(i)->GetSampleLayout().GetNumElements() != 1))
|
||||
InvalidArgument("%ls: Input[%d] must be a vector of 1 element without dynamic axis.", NodeDescription().c_str(), (int)i);
|
||||
auto inputPtr = this->template TypedInput<StatType>(RUN_COUNT);
|
||||
if (inputPtr->HasMBLayout() || (inputPtr->GetSampleLayout().GetRank() > 1) || (inputPtr->GetSampleLayout().GetNumElements() != 1))
|
||||
InvalidArgument("%ls: Input[RUN_COUNT] must be a vector of 1 element without dynamic axis.", NodeDescription().c_str());
|
||||
RunCount(); // cache the shared value into the local cache, for 0 checks
|
||||
}
|
||||
if (m_spatial && m_imageLayoutKind != CHW)
|
||||
|
@ -2902,7 +2905,7 @@ public:
|
|||
if (m_bnEng == nullptr)
|
||||
{
|
||||
auto shape = GetSampleLayout();
|
||||
m_bnEng = BatchNormEngine<ElemType>::Create(m_deviceId, shape, m_spatial, m_imageLayoutKind,
|
||||
m_bnEng = BatchNormEngine<ElemType, StatType>::Create(m_deviceId, shape, m_spatial, m_imageLayoutKind,
|
||||
m_useCntkEngine ? BatchNormEngineKind::Cntk : BatchNormEngineKind::CuDnn);
|
||||
}
|
||||
|
||||
|
@ -2916,26 +2919,26 @@ public:
|
|||
void RequestMatricesBeforeForwardProp(MatrixPool& matrixPool) override
|
||||
{
|
||||
Base::RequestMatricesBeforeForwardProp(matrixPool);
|
||||
RequestMatrixFromPool(m_savedMean, matrixPool);
|
||||
RequestMatrixFromPool(m_savedInvStdDev, matrixPool);
|
||||
this->template TypedRequestMatrixFromPool<StatType>(m_savedMean, matrixPool);
|
||||
this->template TypedRequestMatrixFromPool<StatType>(m_savedInvStdDev, matrixPool);
|
||||
}
|
||||
|
||||
void RequestMatricesBeforeBackprop(MatrixPool& matrixPool) override
|
||||
{
|
||||
Base::RequestMatricesBeforeBackprop(matrixPool);
|
||||
RequestMatrixFromPool(m_dDataDummy, matrixPool);
|
||||
RequestMatrixFromPool(m_dScale, matrixPool);
|
||||
RequestMatrixFromPool(m_dBias, matrixPool);
|
||||
this->template TypedRequestMatrixFromPool<StatType>(m_dScale, matrixPool);
|
||||
this->template TypedRequestMatrixFromPool<StatType>(m_dBias, matrixPool);
|
||||
}
|
||||
|
||||
void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool) override
|
||||
{
|
||||
Base::ReleaseMatricesAfterBackprop(matrixPool);
|
||||
ReleaseMatrixToPool(m_savedMean, matrixPool);
|
||||
ReleaseMatrixToPool(m_savedInvStdDev, matrixPool);
|
||||
this->template TypedReleaseMatrixToPool<StatType>(m_savedMean, matrixPool);
|
||||
this->template TypedReleaseMatrixToPool<StatType>(m_savedInvStdDev, matrixPool);
|
||||
ReleaseMatrixToPool(m_dDataDummy, matrixPool);
|
||||
ReleaseMatrixToPool(m_dScale, matrixPool);
|
||||
ReleaseMatrixToPool(m_dBias, matrixPool);
|
||||
this->template TypedReleaseMatrixToPool<StatType>(m_dScale, matrixPool);
|
||||
this->template TypedReleaseMatrixToPool<StatType>(m_dBias, matrixPool);
|
||||
}
|
||||
|
||||
void SetNormalizationTimeConstants(double normalizationTimeConstant, double prevNormalizationTimeConstant,
|
||||
|
@ -2970,8 +2973,8 @@ public:
|
|||
// Turn off the L1 and L2 regularization
|
||||
void DisableRegInBatchNormalization()
|
||||
{
|
||||
let scaleNode = dynamic_pointer_cast<LearnableParameter<ElemType>>(Input(SCALE));
|
||||
let biasNode = dynamic_pointer_cast<LearnableParameter<ElemType>>(Input(BIAS));
|
||||
let scaleNode = dynamic_pointer_cast<LearnableParameter<StatType>>(this->template TypedInput<StatType>(SCALE));
|
||||
let biasNode = dynamic_pointer_cast<LearnableParameter<StatType>>(this->template TypedInput<StatType>(BIAS));
|
||||
scaleNode->SetRegMultiplier(0.f);
|
||||
biasNode->SetRegMultiplier(0.f);
|
||||
}
|
||||
|
@ -3046,20 +3049,20 @@ private:
|
|||
// This value is not updated unless needed, so it may be out of date during most operation.
|
||||
// It will be updated at start (Validate()) and saving models, and any time the true value is needed.
|
||||
mutable size_t m_runCountUntied; // cached running sample count (mutable since it is a cache)
|
||||
Matrix<ElemType> m_one; // constant [1x1] matrix that contains a 1 (used for updating the shared count)
|
||||
Matrix<StatType> m_one; // constant [1x1] matrix that contains a 1 (used for updating the shared count)
|
||||
|
||||
// Interpolated actual mean/inverse stddev values. Pre-computed on forward pass, also used in gradient computation.
|
||||
shared_ptr<Matrix<ElemType>> m_savedMean;
|
||||
shared_ptr<Matrix<ElemType>> m_savedInvStdDev;
|
||||
shared_ptr<Matrix<StatType>> m_savedMean;
|
||||
shared_ptr<Matrix<StatType>> m_savedInvStdDev;
|
||||
// Temp buffer for scale and bias derivatives. Only used in BackpropTo(), carrying info from first call to subsequent calls.
|
||||
// Not used for blendFactor=1 in CNTK engine.
|
||||
shared_ptr<Matrix<ElemType>> m_dDataDummy;
|
||||
shared_ptr<Matrix<ElemType>> m_dScale;
|
||||
shared_ptr<Matrix<ElemType>> m_dBias;
|
||||
shared_ptr<Matrix<StatType>> m_dScale;
|
||||
shared_ptr<Matrix<StatType>> m_dBias;
|
||||
|
||||
bool m_gradientValid = false;
|
||||
|
||||
std::unique_ptr<BatchNormEngine<ElemType>> m_bnEng;
|
||||
std::unique_ptr<BatchNormEngine<ElemType, StatType>> m_bnEng;
|
||||
|
||||
bool m_convertRunningVariancePending;
|
||||
};
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|x64">
|
||||
<Configuration>Debug</Configuration>
|
||||
|
@ -33,13 +33,11 @@
|
|||
<PropertyGroup Condition="$(DebugBuild)" Label="Configuration">
|
||||
<ConfigurationType>DynamicLibrary</ConfigurationType>
|
||||
<UseDebugLibraries>true</UseDebugLibraries>
|
||||
<PlatformToolset>v140</PlatformToolset>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="$(ReleaseBuild)" Label="Configuration">
|
||||
<ConfigurationType>DynamicLibrary</ConfigurationType>
|
||||
<UseDebugLibraries>false</UseDebugLibraries>
|
||||
<PlatformToolset>v140</PlatformToolset>
|
||||
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|x64">
|
||||
<Configuration>Debug</Configuration>
|
||||
|
@ -33,14 +33,12 @@
|
|||
<PropertyGroup Condition="$(DebugBuild)" Label="Configuration">
|
||||
<ConfigurationType>DynamicLibrary</ConfigurationType>
|
||||
<UseDebugLibraries>true</UseDebugLibraries>
|
||||
<PlatformToolset>v140</PlatformToolset>
|
||||
<CLRSupport>true</CLRSupport>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="$(ReleaseBuild)" Label="Configuration">
|
||||
<ConfigurationType>DynamicLibrary</ConfigurationType>
|
||||
<UseDebugLibraries>false</UseDebugLibraries>
|
||||
<PlatformToolset>v140</PlatformToolset>
|
||||
<CLRSupport>true</CLRSupport>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
|
@ -64,7 +62,7 @@
|
|||
<Link>
|
||||
<AdditionalLibraryDirectories>$(OutDir)</AdditionalLibraryDirectories>
|
||||
<AdditionalDependencies>Cntk.Eval-$(CntkComponentVersion).lib;Cntk.Math-$(CntkComponentVersion).lib;Cntk.Common-$(CntkComponentVersion).lib;$(MSMPI_LIB64)msmpi.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||
<DelayLoadDLLs>Cntk.Eval-$(CntkComponentVersion).dll;Cntk.Math-$(CntkComponentVersion).dll</DelayLoadDLLs>
|
||||
<DelayLoadDLLs>Cntk.Eval-$(CntkComponentVersion).dll;Cntk.Math-$(CntkComponentVersion).dll;msmpi.dll</DelayLoadDLLs>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="$(DebugBuild)">
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|x64">
|
||||
<Configuration>Debug</Configuration>
|
||||
|
@ -33,13 +33,11 @@
|
|||
<PropertyGroup Condition="$(DebugBuild)" Label="Configuration">
|
||||
<ConfigurationType>DynamicLibrary</ConfigurationType>
|
||||
<UseDebugLibraries>true</UseDebugLibraries>
|
||||
<PlatformToolset>v140</PlatformToolset>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="$(ReleaseBuild)" Label="Configuration">
|
||||
<ConfigurationType>DynamicLibrary</ConfigurationType>
|
||||
<UseDebugLibraries>false</UseDebugLibraries>
|
||||
<PlatformToolset>v140</PlatformToolset>
|
||||
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
|
@ -64,7 +62,7 @@
|
|||
</ClCompile>
|
||||
<Link>
|
||||
<AdditionalLibraryDirectories>$(SolutionDir)$(Platform)\$(Configuration);$(OpenCvLibPath)</AdditionalLibraryDirectories>
|
||||
<AdditionalDependencies>Cntk.Common-$(CntkComponentVersion).lib;$(OpenCvLib);%(AdditionalDependencies)</AdditionalDependencies>
|
||||
<AdditionalDependencies>$(OpenCvLib);%(AdditionalDependencies)</AdditionalDependencies>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="$(DebugBuild)">
|
||||
|
|
|
@ -2,9 +2,7 @@
|
|||
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup>
|
||||
<ClCompile Include="ImageWriter.cpp" />
|
||||
<ClCompile Include="dllmain.cpp">
|
||||
<Filter>Misc</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="dllmain.cpp" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClInclude Include="ImageWriter.h" />
|
||||
|
|
|
@ -6,13 +6,13 @@
|
|||
#include "stdafx.h"
|
||||
#include "BatchNormalizationEngine.h"
|
||||
#include "CuDnnFactories.h"
|
||||
#include "Mkl2017DnnCommon.h"
|
||||
#include "MklDnnCommon.h"
|
||||
|
||||
namespace Microsoft { namespace MSR { namespace CNTK {
|
||||
|
||||
template <class ElemType>
|
||||
void BatchNormEngine<ElemType>::Forward(const Mat& in, const Mat& scale, const Mat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance,
|
||||
Mat& out, double epsilon, Mat& savedMean, Mat& savedInvStdDev)
|
||||
template <class InoutType, class StatType>
|
||||
void BatchNormEngine<InoutType, StatType>::Forward(const InoutMat& in, const StatMat& scale, const StatMat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, StatMat& runMean, StatMat& runVariance,
|
||||
InoutMat& out, double epsilon, StatMat& savedMean, StatMat& savedInvStdDev)
|
||||
{
|
||||
assert(in.GetNumRows() == m_inOutT.GetNumElements());
|
||||
assert(out.GetNumRows() == m_inOutT.GetNumElements());
|
||||
|
@ -63,9 +63,9 @@ void BatchNormEngine<ElemType>::Forward(const Mat& in, const Mat& scale, const M
|
|||
}
|
||||
}
|
||||
|
||||
template <class ElemType>
|
||||
void BatchNormEngine<ElemType>::Backward(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor,
|
||||
const Mat& savedMean, const Mat& savedInvStdDev, Mat& scaleGrad, Mat& biasGrad, bool accumulateDataGrad)
|
||||
template <class InoutType, class StatType>
|
||||
void BatchNormEngine<InoutType, StatType>::Backward(const InoutMat& in, const InoutMat& srcGrad, InoutMat& grad, const StatMat& scale, double blendFactor,
|
||||
const StatMat& savedMean, const StatMat& savedInvStdDev, StatMat& scaleGrad, StatMat& biasGrad, bool accumulateDataGrad)
|
||||
{
|
||||
assert(!savedMean.IsEmpty());
|
||||
assert(!savedInvStdDev.IsEmpty());
|
||||
|
@ -73,12 +73,13 @@ void BatchNormEngine<ElemType>::Backward(const Mat& in, const Mat& srcGrad, Mat&
|
|||
BackwardCore(in, srcGrad, grad, scale, blendFactor, savedMean, savedInvStdDev, scaleGrad, biasGrad, accumulateDataGrad);
|
||||
}
|
||||
|
||||
template <class ElemType>
|
||||
class CntkBatchNormEngine : public BatchNormEngine<ElemType>
|
||||
template <class InoutType, class StatType>
|
||||
class CntkBatchNormEngine : public BatchNormEngine<InoutType, StatType>
|
||||
{
|
||||
public:
|
||||
using Base = BatchNormEngine<ElemType>;
|
||||
using typename Base::Mat;
|
||||
using Base = BatchNormEngine<InoutType, StatType>;
|
||||
using typename Base::InoutMat;
|
||||
using typename Base::StatMat;
|
||||
|
||||
public:
|
||||
CntkBatchNormEngine(DEVICEID_TYPE deviceId, const TensorShape& inOutT,
|
||||
|
@ -99,28 +100,30 @@ protected:
|
|||
InvalidArgument("CNTK batch normalization supports only cudnn(CHW) layout.");
|
||||
}
|
||||
|
||||
void ForwardCore(const Mat& in, const Mat& scale, const Mat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance,
|
||||
Mat& out, double epsilon, Mat& savedMean, Mat& savedInvStdDev) override
|
||||
void ForwardCore(const InoutMat& in, const StatMat& scale, const StatMat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, StatMat& runMean, StatMat& runVariance,
|
||||
InoutMat& out, double epsilon, StatMat& savedMean, StatMat& savedInvStdDev) override
|
||||
{
|
||||
#ifdef USE_MKL2017DNN
|
||||
if (in.GetCurrentMatrixLocation() == CPU &&
|
||||
ForwardCoreMKL(in, scale, bias, inferenceOnly, expAvgFactor, runMean, runVariance, out, epsilon, savedMean, savedInvStdDev))
|
||||
std::is_same<InoutType, StatType>::value &&
|
||||
ForwardCoreMKL(*(const StatMat*)&in, scale, bias, inferenceOnly, expAvgFactor, runMean, runVariance, *(StatMat*)&out, epsilon, savedMean, savedInvStdDev))
|
||||
return;
|
||||
#endif
|
||||
|
||||
in.BatchNormalizationForward(scale, bias, inferenceOnly, expAvgFactor, blendFactor, runMean, runVariance, out, epsilon, savedMean, savedInvStdDev);
|
||||
}
|
||||
|
||||
void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, const Mat& savedMean, const Mat& savedInvStdDev,
|
||||
Mat& scaleGrad, Mat& biasGrad, bool accumulateDataGrad) override
|
||||
void BackwardCore(const InoutMat& in, const InoutMat& srcGrad, InoutMat& grad, const StatMat& scale, double blendFactor, const StatMat& savedMean, const StatMat& savedInvStdDev,
|
||||
StatMat& scaleGrad, StatMat& biasGrad, bool accumulateDataGrad) override
|
||||
{
|
||||
#ifdef USE_MKL2017DNN
|
||||
if (srcGrad.GetCurrentMatrixLocation() == CPU &&
|
||||
BackwardCoreMKL(in, srcGrad, grad, scale, savedMean, savedInvStdDev, scaleGrad, biasGrad, accumulateDataGrad))
|
||||
std::is_same<InoutType, StatType>::value &&
|
||||
BackwardCoreMKL(*(const StatMat*)&in, *(const StatMat*)&srcGrad, *(StatMat*)&grad, scale, savedMean, savedInvStdDev, scaleGrad, biasGrad, accumulateDataGrad))
|
||||
return;
|
||||
#endif
|
||||
if (!accumulateDataGrad)
|
||||
grad.SetValue((ElemType)0);
|
||||
grad.SetValue((InoutType)0);
|
||||
|
||||
srcGrad.BatchNormalizationBackward(in, grad, scale, blendFactor, savedMean, savedInvStdDev, scaleGrad, biasGrad);
|
||||
}
|
||||
|
@ -147,7 +150,7 @@ private:
|
|||
struct MKLScaleShiftAdapter
|
||||
{
|
||||
bool isInput;
|
||||
std::shared_ptr<Matrix<ElemType>> mat;
|
||||
std::shared_ptr<Matrix<StatType>> mat;
|
||||
dnnResourceType_t resourceType;
|
||||
size_t numChannels;
|
||||
|
||||
|
@ -155,19 +158,19 @@ private:
|
|||
{
|
||||
Clear();
|
||||
numChannels = n;
|
||||
mat = std::make_shared<Matrix<ElemType>>(numChannels, 2, CPUDEVICE);
|
||||
mat = std::make_shared<Matrix<StatType>>(numChannels, 2, CPUDEVICE);
|
||||
isInput = userToPrim;
|
||||
resourceType = rt;
|
||||
}
|
||||
|
||||
void PrepareForExecution(void* scale, void* bias, void* resources[dnnResourceNumber])
|
||||
{
|
||||
ElemType* buffer = mat->Data();
|
||||
StatType* buffer = mat->Data();
|
||||
resources[resourceType] = buffer;
|
||||
if (isInput)
|
||||
{
|
||||
memcpy(buffer, scale, sizeof(ElemType) * numChannels);
|
||||
memcpy(buffer + numChannels, bias, sizeof(ElemType) * numChannels);
|
||||
memcpy(buffer, scale, sizeof(StatType) * numChannels);
|
||||
memcpy(buffer + numChannels, bias, sizeof(StatType) * numChannels);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -176,9 +179,9 @@ private:
|
|||
if (isInput)
|
||||
RuntimeError("Cannot execute output ResourceAdapter for input");
|
||||
|
||||
ElemType* buffer = mat->Data();
|
||||
memcpy(scale, buffer, sizeof(ElemType) * numChannels);
|
||||
memcpy(bias, buffer + numChannels, sizeof(ElemType) * numChannels);
|
||||
StatType* buffer = mat->Data();
|
||||
memcpy(scale, buffer, sizeof(StatType) * numChannels);
|
||||
memcpy(bias, buffer + numChannels, sizeof(StatType) * numChannels);
|
||||
}
|
||||
|
||||
void Clear()
|
||||
|
@ -194,21 +197,21 @@ private:
|
|||
|
||||
struct PrimitiveContext
|
||||
{
|
||||
MKLDnnResourceAdapter<ElemType> input;
|
||||
MKLDnnResourceAdapter<ElemType> output;
|
||||
MKLDnnResourceAdapter<StatType> input;
|
||||
MKLDnnResourceAdapter<StatType> output;
|
||||
MKLScaleShiftAdapter scaleShift;
|
||||
std::shared_ptr<Mat> varianceMat; // variance matrix used for converting InvStdDev
|
||||
std::shared_ptr<StatMat> varianceMat; // variance matrix used for converting InvStdDev
|
||||
|
||||
dnnPrimitive_t primitive = nullptr;
|
||||
dnnPrimitiveAttributes_t attributes = nullptr;
|
||||
|
||||
void Clear()
|
||||
{
|
||||
if (primitive) { dnnDelete<ElemType>(primitive); primitive = nullptr; }
|
||||
if (primitive) { dnnDelete<StatType>(primitive); primitive = nullptr; }
|
||||
input.Clear();
|
||||
scaleShift.Clear();
|
||||
output.Clear();
|
||||
if (attributes) { dnnPrimitiveAttributesDestroy<ElemType>(attributes); attributes = nullptr; }
|
||||
if (attributes) { dnnPrimitiveAttributesDestroy<StatType>(attributes); attributes = nullptr; }
|
||||
}
|
||||
|
||||
~PrimitiveContext()
|
||||
|
@ -219,7 +222,7 @@ private:
|
|||
|
||||
TensorShape m_shape;
|
||||
size_t m_numSamples;
|
||||
ElemType m_epsilon;
|
||||
StatType m_epsilon;
|
||||
|
||||
public:
|
||||
MKLBatchNormalizationContext() :
|
||||
|
@ -233,12 +236,12 @@ private:
|
|||
return !!(m_contextFlags & (1 << contextIndex));
|
||||
}
|
||||
|
||||
void Prepare(const TensorShape& shape, bool spatial, size_t numSamples, ContextIndex contextIndex, ElemType epsilon = 0)
|
||||
void Prepare(const TensorShape& shape, bool spatial, size_t numSamples, ContextIndex contextIndex, StatType epsilon = 0)
|
||||
{
|
||||
int flag = (1 << contextIndex);
|
||||
if (contextIndex == ContextIndex_Backward)
|
||||
{
|
||||
epsilon = HasPreparedFor(ContextIndex_ForwardTrain) ? m_epsilon : (ElemType)DEFAULT_EPSILON;
|
||||
epsilon = HasPreparedFor(ContextIndex_ForwardTrain) ? m_epsilon : (StatType)DEFAULT_EPSILON;
|
||||
}
|
||||
|
||||
bool same = (shape == m_shape) && (numSamples == m_numSamples) && (epsilon == m_epsilon);
|
||||
|
@ -285,10 +288,10 @@ private:
|
|||
{
|
||||
case ContextIndex_ForwardInfer:
|
||||
case ContextIndex_ForwardTrain:
|
||||
CHECK_MKL(dnnLayoutCreate<ElemType>(<UserInput, inoutDim, inoutSizes, inoutStrides));
|
||||
CHECK_MKL(dnnLayoutCreate<ElemType>(<UserOutput, inoutDim, inoutSizes, inoutStrides));
|
||||
CHECK_MKL(dnnPrimitiveAttributesCreate<ElemType>(&ctx.attributes));
|
||||
CHECK_MKL(dnnBatchNormalizationCreateForward_v2<ElemType>(
|
||||
CHECK_MKL(dnnLayoutCreate<StatType>(<UserInput, inoutDim, inoutSizes, inoutStrides));
|
||||
CHECK_MKL(dnnLayoutCreate<StatType>(<UserOutput, inoutDim, inoutSizes, inoutStrides));
|
||||
CHECK_MKL(dnnPrimitiveAttributesCreate<StatType>(&ctx.attributes));
|
||||
CHECK_MKL(dnnBatchNormalizationCreateForward_v2<StatType>(
|
||||
&ctx.primitive,
|
||||
ctx.attributes,
|
||||
ltUserInput,
|
||||
|
@ -299,10 +302,10 @@ private:
|
|||
scaleShiftType = dnnResourceScaleShift;
|
||||
break;
|
||||
case ContextIndex_Backward:
|
||||
CHECK_MKL(dnnLayoutCreate<ElemType>(<UserInput, inoutDim, inoutSizes, inoutStrides));
|
||||
CHECK_MKL(dnnLayoutCreate<ElemType>(<UserOutput, inoutDim, inoutSizes, inoutStrides));
|
||||
CHECK_MKL(dnnPrimitiveAttributesCreate<ElemType>(&ctx.attributes));
|
||||
CHECK_MKL(dnnBatchNormalizationCreateBackward_v2<ElemType>(
|
||||
CHECK_MKL(dnnLayoutCreate<StatType>(<UserInput, inoutDim, inoutSizes, inoutStrides));
|
||||
CHECK_MKL(dnnLayoutCreate<StatType>(<UserOutput, inoutDim, inoutSizes, inoutStrides));
|
||||
CHECK_MKL(dnnPrimitiveAttributesCreate<StatType>(&ctx.attributes));
|
||||
CHECK_MKL(dnnBatchNormalizationCreateBackward_v2<StatType>(
|
||||
&ctx.primitive,
|
||||
ctx.attributes,
|
||||
ltUserInput,
|
||||
|
@ -311,16 +314,16 @@ private:
|
|||
inputType = dnnResourceDiffDst;
|
||||
outputType = dnnResourceDiffSrc;
|
||||
scaleShiftType = dnnResourceDiffScaleShift;
|
||||
ctx.varianceMat = std::make_shared<Mat>(numChannels, 1, CPUDEVICE);
|
||||
ctx.varianceMat = std::make_shared<StatMat>(numChannels, 1, CPUDEVICE);
|
||||
break;
|
||||
default:
|
||||
RuntimeError("Unexpected context type %d", (int)contextIndex);
|
||||
}
|
||||
|
||||
CHECK_MKL(dnnLayoutCreateFromPrimitive<ElemType>(<PrimInput, ctx.primitive, inputType));
|
||||
CHECK_MKL(dnnLayoutCreateFromPrimitive<StatType>(<PrimInput, ctx.primitive, inputType));
|
||||
ctx.input.Create(ltUserInput, ltPrimInput, inputType, true);
|
||||
|
||||
CHECK_MKL(dnnLayoutCreateFromPrimitive<ElemType>(<PrimOutput, ctx.primitive, outputType));
|
||||
CHECK_MKL(dnnLayoutCreateFromPrimitive<StatType>(<PrimOutput, ctx.primitive, outputType));
|
||||
ctx.output.Create(ltUserOutput, ltPrimOutput, outputType, false);
|
||||
|
||||
ctx.scaleShift.Create(scaleShiftType, contextIndex != ContextIndex_Backward, numChannels);
|
||||
|
@ -338,7 +341,7 @@ private:
|
|||
resources[dnnResourceMean] = runMean;
|
||||
resources[dnnResourceVariance] = runVariance;
|
||||
|
||||
CHECK_MKL(dnnExecute<ElemType>(ctx.primitive, resources));
|
||||
CHECK_MKL(dnnExecute<StatType>(ctx.primitive, resources));
|
||||
|
||||
ctx.output.ConvertOutput(output);
|
||||
}
|
||||
|
@ -352,16 +355,16 @@ private:
|
|||
ctx.output.PrepareForExecution(grad, resources);
|
||||
ctx.scaleShift.PrepareForExecution(scaleGrad, biasGrad, resources);
|
||||
|
||||
std::shared_ptr<Mat> scaleShiftMat;
|
||||
scaleShiftMat = std::make_shared<Mat>(ctx.scaleShift.numChannels, 2, CPUDEVICE);
|
||||
memcpy(scaleShiftMat->Data(), scale, ctx.scaleShift.numChannels * sizeof(ElemType));
|
||||
std::shared_ptr<StatMat> scaleShiftMat;
|
||||
scaleShiftMat = std::make_shared<StatMat>(ctx.scaleShift.numChannels, 2, CPUDEVICE);
|
||||
memcpy(scaleShiftMat->Data(), scale, ctx.scaleShift.numChannels * sizeof(StatType));
|
||||
resources[dnnResourceScaleShift] = scaleShiftMat->Data();
|
||||
|
||||
// convert from InvStdDev to variance
|
||||
for (size_t i = 0; i < ctx.scaleShift.numChannels; i++)
|
||||
{
|
||||
ElemType& v = ctx.varianceMat->Data()[i];
|
||||
ElemType& s = ((ElemType*)savedInvStdDev)[i];
|
||||
StatType& v = ctx.varianceMat->Data()[i];
|
||||
StatType& s = ((StatType*)savedInvStdDev)[i];
|
||||
v = (1 / (s * s) - m_epsilon);
|
||||
}
|
||||
|
||||
|
@ -369,7 +372,7 @@ private:
|
|||
resources[dnnResourceMean] = savedMean;
|
||||
resources[dnnResourceVariance] = ctx.varianceMat->Data();
|
||||
|
||||
CHECK_MKL(dnnExecute<ElemType>(ctx.primitive, resources));
|
||||
CHECK_MKL(dnnExecute<StatType>(ctx.primitive, resources));
|
||||
|
||||
ctx.output.ConvertOutput(grad);
|
||||
ctx.scaleShift.ConvertOutput(scaleGrad, biasGrad);
|
||||
|
@ -377,15 +380,15 @@ private:
|
|||
};
|
||||
|
||||
MKLBatchNormalizationContext m_mklContext;
|
||||
std::shared_ptr<Mat> m_dataGradWorkspace;
|
||||
std::shared_ptr<StatMat> m_dataGradWorkspace;
|
||||
|
||||
bool ForwardCoreMKL(const Mat& in, const Mat& scale, const Mat& bias, bool inferenceOnly, double expAvgFactor, Mat& runMean, Mat& runVariance,
|
||||
Mat& out, double epsilon, Mat& savedMean, Mat& savedInvStdDev)
|
||||
bool ForwardCoreMKL(const StatMat& in, const StatMat& scale, const StatMat& bias, bool inferenceOnly, double expAvgFactor, StatMat& runMean, StatMat& runVariance,
|
||||
StatMat& out, double epsilon, StatMat& savedMean, StatMat& savedInvStdDev)
|
||||
{
|
||||
ContextIndex contextIndex = inferenceOnly ?
|
||||
ContextIndex_ForwardInfer :
|
||||
ContextIndex_ForwardTrain;
|
||||
m_mklContext.Prepare(m_inOutT, m_spatial, in.GetNumCols(), contextIndex, (ElemType)epsilon);
|
||||
m_mklContext.Prepare(m_inOutT, m_spatial, in.GetNumCols(), contextIndex, (StatType)epsilon);
|
||||
|
||||
if (inferenceOnly)
|
||||
{
|
||||
|
@ -398,33 +401,33 @@ private:
|
|||
m_mklContext.Forward(in.Data(), out.Data(), scale.Data(), bias.Data(), savedMean.Data(), savedInvStdDev.Data(), contextIndex);
|
||||
|
||||
// update savedMean, savedInvStdDev
|
||||
ElemType OneMinusExpAvgFactor = (ElemType)(1.0 - expAvgFactor);
|
||||
cblas_axpby((MKL_INT)runMean.GetNumElements(), (ElemType)expAvgFactor, savedMean.Data(), OneMinusExpAvgFactor, runMean.Data());
|
||||
StatType OneMinusExpAvgFactor = (StatType)(1.0 - expAvgFactor);
|
||||
cblas_axpby((MKL_INT)runMean.GetNumElements(), (StatType)expAvgFactor, savedMean.Data(), OneMinusExpAvgFactor, runMean.Data());
|
||||
|
||||
// note savedInvStdDev currently hold variance of in.Data(), need to convert to InvStdDev and interpolate
|
||||
ElemType numReduced = (ElemType)(in.GetNumElements() / runVariance.GetNumElements());
|
||||
ElemType bcf = numReduced / (numReduced - 1);
|
||||
StatType numReduced = (StatType)(in.GetNumElements() / runVariance.GetNumElements());
|
||||
StatType bcf = numReduced / (numReduced - 1);
|
||||
for (size_t i = 0; i < runVariance.GetNumElements(); i++)
|
||||
{
|
||||
ElemType& v = runVariance.Data()[i];
|
||||
ElemType& s = savedInvStdDev.Data()[i];
|
||||
v = v * OneMinusExpAvgFactor + bcf * s * (ElemType)expAvgFactor;
|
||||
s = (ElemType)1 / sqrt(s + (ElemType)epsilon);
|
||||
StatType& v = runVariance.Data()[i];
|
||||
StatType& s = savedInvStdDev.Data()[i];
|
||||
v = v * OneMinusExpAvgFactor + bcf * s * (StatType)expAvgFactor;
|
||||
s = (StatType)1 / sqrt(s + (StatType)epsilon);
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool BackwardCoreMKL(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale,
|
||||
const Mat& savedMean, const Mat& savedInvStdDev, Mat& scaleGrad, Mat& biasGrad, bool accumulateDataGrad)
|
||||
bool BackwardCoreMKL(const StatMat& in, const StatMat& srcGrad, StatMat& grad, const StatMat& scale,
|
||||
const StatMat& savedMean, const StatMat& savedInvStdDev, StatMat& scaleGrad, StatMat& biasGrad, bool accumulateDataGrad)
|
||||
{
|
||||
m_mklContext.Prepare(m_inOutT, m_spatial, srcGrad.GetNumCols(), ContextIndex_Backward);
|
||||
|
||||
if (accumulateDataGrad)
|
||||
{
|
||||
if (!m_dataGradWorkspace)
|
||||
m_dataGradWorkspace = std::make_shared<Matrix<ElemType>>(0, 0, CPUDEVICE);
|
||||
m_dataGradWorkspace = std::make_shared<Matrix<StatType>>(0, 0, CPUDEVICE);
|
||||
|
||||
m_dataGradWorkspace->SetValue(grad);
|
||||
}
|
||||
|
@ -432,23 +435,24 @@ private:
|
|||
m_mklContext.Backward(in.Data(), srcGrad.Data(), grad.Data(), scale.Data(), savedMean.Data(), savedInvStdDev.Data(), scaleGrad.Data(), biasGrad.Data());
|
||||
|
||||
if (accumulateDataGrad)
|
||||
cblas_axpby((MKL_INT)grad.GetNumElements(), (ElemType)1.0, m_dataGradWorkspace->Data(), (ElemType)1.0, grad.Data());
|
||||
cblas_axpby((MKL_INT)grad.GetNumElements(), (StatType)1.0, m_dataGradWorkspace->Data(), (StatType)1.0, grad.Data());
|
||||
|
||||
return true;
|
||||
}
|
||||
#endif
|
||||
};
|
||||
|
||||
template class CntkBatchNormEngine<float>;
|
||||
template class CntkBatchNormEngine<double>;
|
||||
template class CntkBatchNormEngine<float, float>;
|
||||
template class CntkBatchNormEngine<double, double>;
|
||||
template class CntkBatchNormEngine<half, float>;
|
||||
|
||||
template <typename T> bool HasFlag(T src, T testFlag)
|
||||
{
|
||||
return ((int)src & (int)testFlag) != 0;
|
||||
}
|
||||
|
||||
template <class ElemType>
|
||||
std::unique_ptr<BatchNormEngine<ElemType>> BatchNormEngine<ElemType>::Create(DEVICEID_TYPE deviceId, const TensorShape& inOutT,
|
||||
template <class InoutType, class StatType>
|
||||
std::unique_ptr<BatchNormEngine<InoutType, StatType>> BatchNormEngine<InoutType, StatType>::Create(DEVICEID_TYPE deviceId, const TensorShape& inOutT,
|
||||
bool spatial, ImageLayoutKind imageLayout,
|
||||
BatchNormEngineKind enabledEngines)
|
||||
{
|
||||
|
@ -458,7 +462,7 @@ std::unique_ptr<BatchNormEngine<ElemType>> BatchNormEngine<ElemType>::Create(DEV
|
|||
if (GetMathLibTraceLevel() > 0)
|
||||
fprintf(stderr, "Using CNTK batch normalization engine.\n");
|
||||
|
||||
return std::make_unique<CntkBatchNormEngine<ElemType>>(deviceId, inOutT, spatial, imageLayout);
|
||||
return std::make_unique<CntkBatchNormEngine<InoutType, StatType>>(deviceId, inOutT, spatial, imageLayout);
|
||||
}
|
||||
|
||||
if (HasFlag(enabledEngines, BatchNormEngineKind::CuDnn))
|
||||
|
@ -466,13 +470,14 @@ std::unique_ptr<BatchNormEngine<ElemType>> BatchNormEngine<ElemType>::Create(DEV
|
|||
if (GetMathLibTraceLevel() > 0)
|
||||
fprintf(stderr, "Using cuDNN batch normalization engine.\n");
|
||||
|
||||
return CuDnnBatchNormEngineFactory<ElemType>::Create(deviceId, inOutT, spatial, imageLayout);
|
||||
return CuDnnBatchNormEngineFactory<InoutType, StatType>::Create(deviceId, inOutT, spatial, imageLayout);
|
||||
}
|
||||
|
||||
RuntimeError("Could not find appropriate batch normalization engine.");
|
||||
}
|
||||
|
||||
template class BatchNormEngine<float>;
|
||||
template class BatchNormEngine<double>;
|
||||
template class BatchNormEngine<float, float>;
|
||||
template class BatchNormEngine<double, double>;
|
||||
template class BatchNormEngine<half, float>;
|
||||
|
||||
}}}
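// Hedged usage sketch of the two-type engine instantiated above; the 64-channel TensorShape,
// CPU device id (-1) and CHW layout below are illustrative assumptions, not taken from this change.
namespace Microsoft { namespace MSR { namespace CNTK {
inline void CreateHalfBatchNormEngineSketch()
{
    auto engine = BatchNormEngine<half, float>::Create(
        /*deviceId=*/-1, TensorShape(64), /*spatial=*/true, ImageLayoutKind::CHW);
    // Forward() on this engine takes Matrix<half> for input/output and Matrix<float> for scale,
    // bias, running mean/variance, and the saved statistics.
    (void)engine;
}
}}}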
|
||||
|
|
|
@ -25,22 +25,23 @@ enum class BatchNormEngineKind
|
|||
#pragma warning(push)
|
||||
#pragma warning(disable : 4251)
|
||||
|
||||
template <class ElemType>
|
||||
template <class InoutType, class StatType = InoutType>
|
||||
class MATH_API BatchNormEngine
|
||||
{
|
||||
public:
|
||||
using Mat = Matrix<ElemType>;
|
||||
using InoutMat = Matrix<InoutType>;
|
||||
using StatMat = Matrix<StatType>;
|
||||
|
||||
public:
|
||||
virtual ~BatchNormEngine() {};
|
||||
|
||||
void Forward(const Mat& in, const Mat& scale, const Mat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance,
|
||||
Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev);
|
||||
void Forward(const InoutMat& in, const StatMat& scale, const StatMat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, StatMat& runMean, StatMat& runVariance,
|
||||
InoutMat& out, double epsilon, StatMat& saveMean, StatMat& saveInvStdDev);
|
||||
|
||||
void Backward(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, const Mat& saveMean, const Mat& saveInvStdDev,
|
||||
Mat& scaleGrad, Mat& biasGrad, bool accumulateDataGrad);
|
||||
void Backward(const InoutMat& in, const InoutMat& srcGrad, InoutMat& grad, const StatMat& scale, double blendFactor, const StatMat& saveMean, const StatMat& saveInvStdDev,
|
||||
StatMat& scaleGrad, StatMat& biasGrad, bool accumulateDataGrad);
|
||||
|
||||
static std::unique_ptr<BatchNormEngine<ElemType>> Create(DEVICEID_TYPE deviceId, const TensorShape& inOutT,
|
||||
static std::unique_ptr<BatchNormEngine<InoutType, StatType>> Create(DEVICEID_TYPE deviceId, const TensorShape& inOutT,
|
||||
bool spatial, ImageLayoutKind imageLayout,
|
||||
BatchNormEngineKind enabledEngines = BatchNormEngineKind::All);
|
||||
|
||||
|
@ -56,11 +57,11 @@ protected:
|
|||
virtual void EnsureCompatible() = 0;
|
||||
|
||||
// saveMean/saveInvStdDev return the actual mean/stddev used for normalization, except for blendFactor=1, these are unused and untouched
|
||||
virtual void ForwardCore(const Mat& in, const Mat& scale, const Mat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance,
|
||||
Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev) = 0;
|
||||
virtual void ForwardCore(const InoutMat& in, const StatMat& scale, const StatMat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, StatMat& runMean, StatMat& runVariance,
|
||||
InoutMat& out, double epsilon, StatMat& saveMean, StatMat& saveInvStdDev) = 0;
|
||||
|
||||
virtual void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, const Mat& saveMean, const Mat& saveInvStdDev,
|
||||
Mat& scaleGrad, Mat& biasGrad, bool accumulateDataGrad) = 0;
|
||||
virtual void BackwardCore(const InoutMat& in, const InoutMat& srcGrad, InoutMat& grad, const StatMat& scale, double blendFactor, const StatMat& saveMean, const StatMat& saveInvStdDev,
|
||||
StatMat& scaleGrad, StatMat& biasGrad, bool accumulateDataGrad) = 0;
|
||||
|
||||
protected:
|
||||
DEVICEID_TYPE m_deviceId;
|
||||
|
|
|
@ -14,6 +14,7 @@
|
|||
#include <ctime>
|
||||
#include <limits.h>
|
||||
#include "QuantizedOperations.h"
|
||||
#include "half.hpp"
|
||||
|
||||
//#include "GPUMatrix.h"
|
||||
//#include "CPUSparseMatrix.h"
|
||||
|
@ -108,7 +109,9 @@ public:
|
|||
const bool needAveMultiplier,
|
||||
const bool initialized);
|
||||
|
||||
void AdaDelta(CPUMatrix<ElemType>& gradients, CPUMatrix<ElemType>& functionValues, ElemType learningRate, ElemType rho, ElemType epsilon);
|
||||
template<typename GradType>
|
||||
void AdaDelta(CPUMatrix<GradType>& gradients, CPUMatrix<ElemType>& functionValues, ElemType learningRate, ElemType rho, ElemType epsilon);
|
||||
|
||||
void AdaDeltaFlushTimestamps(size_t cols, ElemType rho, int* timestamps, int currentTimestamp);
|
||||
|
||||
void Reshape(const size_t numRows, const size_t numCols);
|
||||
|
@ -417,10 +420,13 @@ public:
|
|||
void AveragePoolingBackward(const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIndices, const CPUMatrix<int>& indices,
|
||||
CPUMatrix<ElemType>& grad, const bool poolIncludePad, bool accumulateGradient) const;
|
||||
|
||||
void BatchNormalizationForward(const CPUMatrix<ElemType>& scale, const CPUMatrix<ElemType>& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, CPUMatrix<ElemType>& runMean, CPUMatrix<ElemType>& runVariance,
|
||||
CPUMatrix<ElemType>& out, double epsilon, CPUMatrix<ElemType>& saveMean, CPUMatrix<ElemType>& saveInvStdDev) const;
|
||||
void BatchNormalizationBackward(const CPUMatrix<ElemType>& in, CPUMatrix<ElemType>& grad, const CPUMatrix<ElemType>& scale, double blendFactor, const CPUMatrix<ElemType>& saveMean, const CPUMatrix<ElemType>& saveInvStdDev,
|
||||
CPUMatrix<ElemType>& scaleGrad, CPUMatrix<ElemType>& biasGrad) const;
|
||||
template<class StatType>
|
||||
void BatchNormalizationForward(const CPUMatrix<StatType>& scale, const CPUMatrix<StatType>& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, CPUMatrix<StatType>& runMean, CPUMatrix<StatType>& runVariance,
|
||||
CPUMatrix<ElemType>& out, double epsilon, CPUMatrix<StatType>& saveMean, CPUMatrix<StatType>& saveInvStdDev) const;
|
||||
|
||||
template<class StatType>
|
||||
void BatchNormalizationBackward(const CPUMatrix<ElemType>& in, CPUMatrix<ElemType>& grad, const CPUMatrix<StatType>& scale, double blendFactor, const CPUMatrix<StatType>& saveMean, const CPUMatrix<StatType>& saveInvStdDev,
|
||||
CPUMatrix<StatType>& scaleGrad, CPUMatrix<StatType>& biasGrad) const;
|
||||
|
||||
public:
|
||||
// These functions do not depend on <ElemType>, i.e. you can call them on any <ElemType>
|
||||
|
@ -577,5 +583,30 @@ private:
|
|||
|
||||
typedef CPUMatrix<float> CPUSingleMatrix;
|
||||
typedef CPUMatrix<double> CPUDoubleMatrix;
|
||||
typedef CPUMatrix<half> CPUHalfMatrix;
|
||||
|
||||
template<typename ElemType>
|
||||
void CPUMatrixTensorOpImpl(ElemType beta, const CPUMatrix<ElemType>& a, CPUMatrix<ElemType>& o, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
|
||||
const array<size_t, 2>& offsets,
|
||||
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
|
||||
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides);
|
||||
|
||||
template<typename ElemType>
|
||||
void CPUMatrixTensorOpImpl(ElemType beta, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, CPUMatrix<ElemType>& o, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
|
||||
const array<size_t, 3>& offsets,
|
||||
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 3>& regularStrides,
|
||||
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 3>& reducingStrides);
|
||||
|
||||
template<typename ElemType>
|
||||
void CPUMatrixTensorOpImpl(ElemType beta, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, const CPUMatrix<ElemType>& c, CPUMatrix<ElemType>& o, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
|
||||
const array<size_t, 4>& offsets,
|
||||
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 4>& regularStrides,
|
||||
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 4>& reducingStrides);
|
||||
|
||||
template<typename ElemType>
|
||||
void CPUMatrixTensorArgOpImpl(const CPUMatrix<ElemType>& a, CPUMatrix<ElemType>& o, ElementWiseOperator reductionOp,
|
||||
const array<size_t, 2>& offsets,
|
||||
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
|
||||
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides);
|
||||
|
||||
}}}
|
||||
|
|
|
@ -0,0 +1,132 @@
|
|||
//
|
||||
// Copyright (c) Microsoft. All rights reserved.
|
||||
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
|
||||
//
|
||||
#include "stdafx.h"
|
||||
#include "CPUMatrixImpl.h"
|
||||
|
||||
namespace Microsoft { namespace MSR { namespace CNTK {
|
||||
|
||||
// General conversion function with no performance optimization.
// This should only be used on the CPU half-precision path.
// For CPU inference performance, the user should convert the FP16 model to FP32 first, unless MKL supports half precision.
|
||||
template<typename SrcT, typename DstT>
|
||||
static void ConvertBuffer(DstT* dst, const SrcT* src, size_t count)
|
||||
{
|
||||
for (size_t i = 0; i < count; i++)
|
||||
{
|
||||
dst[i] = (DstT)src[i];
|
||||
}
|
||||
}
|
||||
|
||||
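// ConvertBuffer above is a plain element-by-element cast; in this file it is used with CNTK's
// half type from half.hpp. A stand-alone usage sketch with built-in types only (illustrative):
static void ConvertBufferUsageSketch()
{
    const float src[3] = {1.5f, -2.25f, 0.125f};
    double dst[3];
    ConvertBuffer<float, double>(dst, src, 3); // widen float -> double, element by element
}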
// specialization to convert from half to float for computation, and then store in half
|
||||
template <>
|
||||
void CPUMatrix<half>::MultiplyAndWeightedAdd(half alpha, const CPUMatrix<half>& a, const bool transposeA, const CPUMatrix<half>& b, const bool transposeB,
|
||||
half beta, CPUMatrix<half>& c, shared_ptr<QuantizedMultiplier<half>> pQuantizedMultiplier)
|
||||
{
|
||||
CPUMatrix<float> af(a.GetNumRows(), a.GetNumCols());
|
||||
CPUMatrix<float> bf(b.GetNumRows(), b.GetNumCols());
|
||||
CPUMatrix<float> cf(c.GetNumRows(), c.GetNumCols());
|
||||
|
||||
if (alpha != 0)
|
||||
{
|
||||
ConvertBuffer<half, float>(af.Data(), a.Data(), a.GetNumElements());
|
||||
ConvertBuffer<half, float>(bf.Data(), b.Data(), b.GetNumElements());
|
||||
}
|
||||
|
||||
if (beta != 0)
|
||||
{
|
||||
ConvertBuffer<half, float>(cf.Data(), c.Data(), c.GetNumElements());
|
||||
}
|
||||
|
||||
if (pQuantizedMultiplier)
|
||||
RuntimeError("Quantized matrix multiply not supported for Half");
|
||||
|
||||
CPUMatrix<float>::MultiplyAndWeightedAdd((float)alpha, af, transposeA, bf, transposeB, (float)beta, cf, nullptr);
|
||||
|
||||
ConvertBuffer<float, half>(c.Data(), cf.Data(), c.GetNumElements());
|
||||
}
|
||||
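// The specialization above implements half GEMM as widen-to-float, compute, narrow-back:
// c = alpha * op(a) * op(b) + beta * c with all arithmetic carried out in float. A naive
// stand-alone sketch of that pattern on plain column-major buffers (illustrative only; the
// real path calls CPUMatrix<float>::MultiplyAndWeightedAdd, i.e. the float BLAS backend):
template <typename T>
static void GemmWidenedSketch(float alpha, const T* A, const T* B, float beta, T* C,
                              size_t m, size_t k, size_t n)
{
    for (size_t j = 0; j < n; j++)         // C is m x n, column-major
        for (size_t i = 0; i < m; i++)
        {
            float acc = 0.0f;
            for (size_t p = 0; p < k; p++) // A is m x k, B is k x n
                acc += (float)A[p * m + i] * (float)B[j * k + p]; // widen the operands to float
            C[j * m + i] = (T)(alpha * acc + beta * (float)C[j * m + i]); // narrow the result
        }
}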
|
||||
// specializations that RuntimeError for now, because the OpenMP implementation only supports built-in types
|
||||
template <>
|
||||
void CPUMatrix<half>::AssignSoftmaxSum(const CPUMatrix<half>& softmax, CPUMatrix<half>& c)
|
||||
{
|
||||
RuntimeError("half AssignSoftmaxSum not supported.");
|
||||
}
|
||||
|
||||
template <>
|
||||
void CPUMatrix<half>::AssignNCEUnnormalizedEval(const CPUMatrix<half>& a,
|
||||
const CPUMatrix<half>& b, const CPUMatrix<half>& bias, CPUMatrix<half>& c)
|
||||
{
|
||||
RuntimeError("half AssignNCEUnnormalizedEval not supported.");
|
||||
}
|
||||
|
||||
template <>
|
||||
void CPUMatrix<half>::VectorSum(const CPUMatrix<half>& a, CPUMatrix<half>& c, const bool isColWise)
|
||||
{
|
||||
RuntimeError("half VectorSum not supported.");
|
||||
}
|
||||
|
||||
template <>
|
||||
void CPUMatrix<half>::VectorNorm1(CPUMatrix<half>& c, const bool isColWise) const
|
||||
{
|
||||
RuntimeError("half VectorNorm1 not supported.");
|
||||
}
|
||||
|
||||
template <>
|
||||
half CPUMatrix<half>::SumOfElements() const
|
||||
{
|
||||
RuntimeError("half SumOfElements not supported.");
|
||||
}
|
||||
|
||||
template <>
|
||||
half CPUMatrix<half>::MatrixNorm1() const
|
||||
{
|
||||
RuntimeError("half MatrixNorm1 not supported.");
|
||||
}
|
||||
|
||||
template <>
|
||||
half CPUMatrix<half>::FrobeniusNorm() const
|
||||
{
|
||||
RuntimeError("half FrobeniusNorm not supported.");
|
||||
}
|
||||
|
||||
template <>
|
||||
void CPUMatrix<half>::MaxPoolingBackward(const CPUMatrix<half>& out, const CPUMatrix<half>& in,
|
||||
const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIndices, const CPUMatrix<int>& indices,
|
||||
CPUMatrix<half>& grad, bool accumulateGradient) const
|
||||
{
|
||||
RuntimeError("half MaxPoolingBackward not supported.");
|
||||
}
|
||||
|
||||
template <>
|
||||
void CPUMatrix<half>::MaxROIPoolingBackward(const size_t numRois, const size_t numImg, const size_t channels, const size_t width, const size_t height,
|
||||
const size_t pooledWidth, const size_t pooledHeight, const CPUMatrix<half>& roiData, CPUMatrix<half>& grad,
|
||||
CPUMatrix<half>& argmax, double spatialScale) const
|
||||
{
|
||||
RuntimeError("half MaxROIPoolingBackward not supported.");
|
||||
}
|
||||
|
||||
template <>
|
||||
void CPUMatrix<half>::AveragePoolingBackward(const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIndices, const CPUMatrix<int>& indices, CPUMatrix<half>& grad, const bool poolIncludePad, bool accumulateGradient) const
|
||||
{
|
||||
RuntimeError("half AveragePoolingBackward not supported.");
|
||||
}
|
||||
|
||||
// explicit instantiations, due to CPUMatrix being too big and causing VS2015 cl crash.
|
||||
template class MATH_API CPUMatrix<half>;
|
||||
|
||||
// instantiate templated methods
|
||||
template void CPUMatrix<float>::AdaDelta(CPUMatrix<float>& gradients, CPUMatrix<float>& functionValues, float learningRate, float rho, float epsilon);
|
||||
template void CPUMatrix<double>::AdaDelta(CPUMatrix<double>& gradients, CPUMatrix<double>& functionValues, double learningRate, double rho, double epsilon);
|
||||
template void CPUMatrix<float>::AdaDelta(CPUMatrix<half>& gradients, CPUMatrix<float>& functionValues, float learningRate, float rho, float epsilon);
|
||||
|
||||
template void CPUMatrix<float>::BatchNormalizationForward(const CPUMatrix<float>& scale, const CPUMatrix<float>& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, CPUMatrix<float>& runMean, CPUMatrix<float>& runVariance, CPUMatrix<float>& out, double epsilon, CPUMatrix<float>& saveMean, CPUMatrix<float>& saveInvStdDev) const;
|
||||
template void CPUMatrix<double>::BatchNormalizationForward(const CPUMatrix<double>& scale, const CPUMatrix<double>& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, CPUMatrix<double>& runMean, CPUMatrix<double>& runVariance, CPUMatrix<double>& out, double epsilon, CPUMatrix<double>& saveMean, CPUMatrix<double>& saveInvStdDev) const;
|
||||
template void CPUMatrix<half>::BatchNormalizationForward(const CPUMatrix<float>& scale, const CPUMatrix<float>& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, CPUMatrix<float>& runMean, CPUMatrix<float>& runVariance, CPUMatrix<half>& out, double epsilon, CPUMatrix<float>& saveMean, CPUMatrix<float>& saveInvStdDev) const;
|
||||
|
||||
template void CPUMatrix<float>::BatchNormalizationBackward(const CPUMatrix<float>& in, CPUMatrix<float>& grad, const CPUMatrix<float>& scale, double blendFactor, const CPUMatrix<float>& saveMean, const CPUMatrix<float>& saveInvStdDev, CPUMatrix<float>& scaleGrad, CPUMatrix<float>& biasGrad) const;
|
||||
template void CPUMatrix<double>::BatchNormalizationBackward(const CPUMatrix<double>& in, CPUMatrix<double>& grad, const CPUMatrix<double>& scale, double blendFactor, const CPUMatrix<double>& saveMean, const CPUMatrix<double>& saveInvStdDev, CPUMatrix<double>& scaleGrad, CPUMatrix<double>& biasGrad) const;
|
||||
template void CPUMatrix<half>::BatchNormalizationBackward(const CPUMatrix<half>& in, CPUMatrix<half>& grad, const CPUMatrix<float>& scale, double blendFactor, const CPUMatrix<float>& saveMean, const CPUMatrix<float>& saveInvStdDev, CPUMatrix<float>& scaleGrad, CPUMatrix<float>& biasGrad) const;
|
||||
|
||||
}}}
|
File diff not shown because it is too large
|
@ -0,0 +1,30 @@
|
|||
#include "stdafx.h"
|
||||
#include "CPUMatrixTensorImpl.h"
|
||||
|
||||
namespace Microsoft { namespace MSR { namespace CNTK {
|
||||
|
||||
template
|
||||
void CPUMatrixTensorOpImpl(double beta, const CPUMatrix<double>& a, CPUMatrix<double>& o, double alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
|
||||
const array<size_t, 2>& offsets,
|
||||
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
|
||||
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides);
|
||||
|
||||
template
|
||||
void CPUMatrixTensorOpImpl(double beta, const CPUMatrix<double>& a, const CPUMatrix<double>& b, CPUMatrix<double>& o, double alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
|
||||
const array<size_t, 3>& offsets,
|
||||
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 3>& regularStrides,
|
||||
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 3>& reducingStrides);
|
||||
|
||||
template
|
||||
void CPUMatrixTensorOpImpl(double beta, const CPUMatrix<double>& a, const CPUMatrix<double>& b, const CPUMatrix<double>& c, CPUMatrix<double>& o, double alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
|
||||
const array<size_t, 4>& offsets,
|
||||
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 4>& regularStrides,
|
||||
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 4>& reducingStrides);
|
||||
|
||||
template
|
||||
void CPUMatrixTensorArgOpImpl(const CPUMatrix<double>& a, CPUMatrix<double>& o, ElementWiseOperator reductionOp,
|
||||
const array<size_t, 2>& offsets,
|
||||
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
|
||||
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides);
|
||||
|
||||
}}}
|
|
@ -0,0 +1,30 @@
|
|||
#include "stdafx.h"
|
||||
#include "CPUMatrixTensorImpl.h"
|
||||
|
||||
namespace Microsoft { namespace MSR { namespace CNTK {
|
||||
|
||||
template
|
||||
void CPUMatrixTensorOpImpl(float beta, const CPUMatrix<float>& a, CPUMatrix<float>& o, float alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
|
||||
const array<size_t, 2>& offsets,
|
||||
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
|
||||
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides);
|
||||
|
||||
template
|
||||
void CPUMatrixTensorOpImpl(float beta, const CPUMatrix<float>& a, const CPUMatrix<float>& b, CPUMatrix<float>& o, float alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
|
||||
const array<size_t, 3>& offsets,
|
||||
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 3>& regularStrides,
|
||||
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 3>& reducingStrides);
|
||||
|
||||
template
|
||||
void CPUMatrixTensorOpImpl(float beta, const CPUMatrix<float>& a, const CPUMatrix<float>& b, const CPUMatrix<float>& c, CPUMatrix<float>& o, float alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
|
||||
const array<size_t, 4>& offsets,
|
||||
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 4>& regularStrides,
|
||||
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 4>& reducingStrides);
|
||||
|
||||
template
|
||||
void CPUMatrixTensorArgOpImpl(const CPUMatrix<float>& a, CPUMatrix<float>& o, ElementWiseOperator reductionOp,
|
||||
const array<size_t, 2>& offsets,
|
||||
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
|
||||
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides);
|
||||
|
||||
}}}
|
|
@ -0,0 +1,30 @@
|
|||
#include "stdafx.h"
|
||||
#include "CPUMatrixTensorImpl.h"
|
||||
|
||||
namespace Microsoft { namespace MSR { namespace CNTK {
|
||||
|
||||
template
|
||||
void CPUMatrixTensorOpImpl(half beta, const CPUMatrix<half>& a, CPUMatrix<half>& o, half alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
|
||||
const array<size_t, 2>& offsets,
|
||||
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
|
||||
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides);
|
||||
|
||||
template
|
||||
void CPUMatrixTensorOpImpl(half beta, const CPUMatrix<half>& a, const CPUMatrix<half>& b, CPUMatrix<half>& o, half alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
|
||||
const array<size_t, 3>& offsets,
|
||||
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 3>& regularStrides,
|
||||
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 3>& reducingStrides);
|
||||
|
||||
template
|
||||
void CPUMatrixTensorOpImpl(half beta, const CPUMatrix<half>& a, const CPUMatrix<half>& b, const CPUMatrix<half>& c, CPUMatrix<half>& o, half alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
|
||||
const array<size_t, 4>& offsets,
|
||||
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 4>& regularStrides,
|
||||
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 4>& reducingStrides);
|
||||
|
||||
template
|
||||
void CPUMatrixTensorArgOpImpl(const CPUMatrix<half>& a, CPUMatrix<half>& o, ElementWiseOperator reductionOp,
|
||||
const array<size_t, 2>& offsets,
|
||||
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
|
||||
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides);
|
||||
|
||||
}}}
|
|
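// The three small translation units above explicitly instantiate the tensor-op templates for
// double, float and half, so the heavy CPUMatrixTensorImpl.h body is compiled once per element
// type in its own .cpp and each compile stays small. A generic sketch of the pattern, with
// hypothetical file and type names:

// widget.h -- declaration only; users of Widget<T> include just this.
template <typename T>
struct Widget
{
    T Twice(T v);
};

// widget_impl.h -- heavy definition, included only by the per-type .cpp files.
template <typename T>
T Widget<T>::Twice(T v)
{
    return v + v;
}

// widget_float.cpp -- one translation unit per element type.
template struct Widget<float>;

// widget_double.cpp
template struct Widget<double>;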
@ -0,0 +1,519 @@
|
|||
// Moved out of CPUMatrixImpl.h to keep the compiler from running out of heap and crashing
|
||||
|
||||
#include "CPUMatrix.h"
|
||||
#include "TensorOps.h"
|
||||
|
||||
namespace Microsoft { namespace MSR { namespace CNTK {
|
||||
|
||||
// =======================================================================
|
||||
// TensorView support
|
||||
// =======================================================================
|
||||
|
||||
// To save time, this makes extensive use of templates and macros.
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// function to compute the value for a given output location (perform reduction if needed)
|
||||
// -----------------------------------------------------------------------
|
||||
|
||||
// perform loop over reduction index m
|
||||
// This function is declared inside a wrapper struct to allow partial specialization (m = -1).
|
||||
template <class ElemType, typename OPFN, typename ReductionOp, size_t N, int m>
|
||||
struct TensorOpReduction
|
||||
{
|
||||
// reduction case (non-reduction case is specialized)
|
||||
static inline ElemType Loop(array<ElemType*, N> pointers, const OPFN& opfn, const ReductionOp& reductionOp,
|
||||
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides)
|
||||
{
|
||||
array<ptrdiff_t, N - 1> strides; // N-1 because last one is the result pointer, which is unused in reduction
|
||||
for (size_t i = 0; i < N - 1; i++) // N = a small constant, this will be unrolled
|
||||
strides[i] = reducingStrides[i][(size_t) m];
|
||||
|
||||
double aggregate = TensorOpReduction<ElemType, OPFN, ReductionOp, N, m - 1>::Loop(pointers, opfn, reductionOp, reducingOpDims, reducingStrides);
|
||||
for (size_t dim = reducingOpDims[(size_t)m] - 1; dim-- > 0;)
|
||||
{
|
||||
// advance the pointers
|
||||
for (size_t i = 0; i < N - 1; i++)
|
||||
pointers[i] += strides[i]; // note: last pointer (result) is unused and untouched here
|
||||
|
||||
// need to descend into one loop deeper
|
||||
aggregate = reductionOp(aggregate, TensorOpReduction<ElemType, OPFN, ReductionOp, N, m - 1>::Loop(pointers, opfn, reductionOp, reducingOpDims, reducingStrides));
|
||||
}
|
||||
// Actually it would be nicer to return double, but we keep ElemType so that tests don't return different numbers than the previous implementation.
|
||||
return static_cast<ElemType>(aggregate);
|
||||
}
|
||||
};
|
||||
|
||||
// perform loop over reduction index m
|
||||
// This is the specialized version for m = -1, which terminates the recursion.
|
||||
template <class ElemType, typename OPFN, typename ReductionOp, size_t N>
|
||||
struct TensorOpReduction<ElemType, OPFN, ReductionOp, N, -1>
|
||||
{
|
||||
static inline ElemType Loop(array<ElemType*, N> pointers, const OPFN& opfn, const ReductionOp& /*reductionOp*/,
|
||||
const SmallVector<size_t>&, const array<SmallVector<ptrdiff_t>, N>&)
|
||||
{
|
||||
return opfn(pointers); // finally we are doing some work!!!
|
||||
}
|
||||
};
|
||||
|
||||
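// TensorOpReduction above unrolls the reduction nest at compile time by recursing on the
// reduction index m and terminating in the m == -1 partial specialization. A toy stand-alone
// illustration of the same recursion-on-m trick (not CNTK code), summing a 2 x 2 x 2 cube:
template <int m>
struct ToyReduction
{
    static double Loop(const double* p, unsigned dim, unsigned stride)
    {
        double sum = 0;
        for (unsigned i = 0; i < dim; i++) // one compile-time-unrolled nesting level per m
            sum += ToyReduction<m - 1>::Loop(p + i * stride, dim, stride / dim);
        return sum;
    }
};

template <>
struct ToyReduction<-1> // terminates the recursion: nothing left to reduce, read the element
{
    static double Loop(const double* p, unsigned, unsigned) { return *p; }
};

inline double ToyReductionSketch()
{
    const double cube[8] = {1, 2, 3, 4, 5, 6, 7, 8}; // a 2 x 2 x 2 cube, stored contiguously
    return ToyReduction<2>::Loop(cube, 2, 4);        // sums all 8 elements -> 36
}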
// perform loop over reduction index m, while keeping track of the number of elements and their corresponding indices.
|
||||
// This function is declared inside a wrapper struct to allow partial specialization (m = -1).
|
||||
template <class ElemType, size_t N, int m>
|
||||
struct TensorArgOpReduction
|
||||
{
|
||||
static inline std::pair<ElemType, size_t> ReduceAll(array<ElemType*, N> pointers, const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides,
|
||||
ElementWiseOperator reductionOp)
|
||||
{
|
||||
size_t counter = 0;
|
||||
size_t index = 0;
|
||||
ElemType val = (ElemType)0;
|
||||
|
||||
switch (reducingOpDims.size())
|
||||
{
|
||||
case 3:
|
||||
val = TensorArgOpReduction<ElemType, N, 2>::Loop(pointers, reducingOpDims, reducingStrides, reductionOp, counter, index);
|
||||
break;
|
||||
case 2:
|
||||
val = TensorArgOpReduction<ElemType, N, 1>::Loop(pointers, reducingOpDims, reducingStrides, reductionOp, counter, index);
|
||||
break;
|
||||
case 1:
|
||||
val = TensorArgOpReduction<ElemType, N, 0>::Loop(pointers, reducingOpDims, reducingStrides, reductionOp, counter, index);
|
||||
break;
|
||||
case 0:
|
||||
val = TensorArgOpReduction<ElemType, N, -1>::Loop(pointers, reducingOpDims, reducingStrides, reductionOp, counter, index);
|
||||
break;
|
||||
default:
|
||||
LogicError("TensorOp: %d non-flattened input dimensions are not supported.", (int)reducingOpDims.size());
|
||||
}
|
||||
|
||||
return make_pair(val, index);
|
||||
}
|
||||
|
||||
// reduction case (non-reduction case is specialized)
|
||||
static inline ElemType Loop(array<ElemType*, N> pointers, const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides,
|
||||
ElementWiseOperator reductionOp, size_t& counter, size_t& index)
|
||||
{
|
||||
array<ptrdiff_t, N - 1> strides; // N-1 because last one is the result pointer, which is unused in reduction
|
||||
for (size_t i = 0; i < N - 1; i++) // N = a small constant, this will be unrolled
|
||||
strides[i] = reducingStrides[i][(size_t)m];
|
||||
|
||||
ElemType aggregate = TensorArgOpReduction<ElemType, N, m - 1>::Loop(pointers, reducingOpDims, reducingStrides, reductionOp, counter, index);
|
||||
for (size_t dim = reducingOpDims[(size_t)m] - 1; dim-- > 0;)
|
||||
{
|
||||
// advance the pointers
|
||||
for (size_t i = 0; i < N - 1; i++)
|
||||
pointers[i] += strides[i]; // note: last pointer (result) is unused and untouched here
|
||||
|
||||
ElemType val = TensorArgOpReduction<ElemType, N, m - 1>::Loop(pointers, reducingOpDims, reducingStrides, reductionOp, counter, index);
|
||||
|
||||
bool update = false;
|
||||
switch (reductionOp)
|
||||
{
|
||||
case ElementWiseOperator::opArgmin:
|
||||
update = (aggregate > val);
|
||||
break;
|
||||
case ElementWiseOperator::opArgmax:
|
||||
update = (aggregate < val);
|
||||
break;
|
||||
}
|
||||
|
||||
if (update)
|
||||
{
|
||||
aggregate = val;
|
||||
index = counter - 1;
|
||||
}
|
||||
}
|
||||
|
||||
return aggregate;
|
||||
}
|
||||
};
|
||||
|
||||
// perform loop over reduction index m
|
||||
// This is the specialized version for m = -1, which terminates the recursion.
|
||||
template <class ElemType, size_t N>
|
||||
struct TensorArgOpReduction<ElemType, N, -1>
|
||||
{
|
||||
static inline ElemType Loop(array<ElemType*, N> pointers,
|
||||
const SmallVector<size_t>&, const array<SmallVector<ptrdiff_t>, N>&, ElementWiseOperator /*reductionOp*/, size_t& counter, size_t& /*index*/)
|
||||
{
|
||||
counter++;
|
||||
return *pointers[0]; // finally we are doing some work!!!
|
||||
}
|
||||
};
|
||||
|
||||
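// TensorArgOpReduction tracks both the running extremum ('aggregate') and the flat index of the
// element that produced it. The same bookkeeping on a flat array, for the opArgmax case
// (illustrative sketch only):
static inline size_t ArgMaxSketch(const float* p, size_t n)
{
    float aggregate = p[0];
    size_t index = 0;
    for (size_t counter = 1; counter < n; counter++)
    {
        if (aggregate < p[counter]) // same comparison as the opArgmax branch above
        {
            aggregate = p[counter];
            index = counter;
        }
    }
    return index;
}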
// -----------------------------------------------------------------------
|
||||
// perform loop over regular index k for N-nary operations (N counting the output)
|
||||
// -----------------------------------------------------------------------
|
||||
|
||||
// perform loop over regular index k and reducing index m for N operands (counting the output)
|
||||
template <class ElemType, typename OPFN, typename ReductionOp, size_t N, bool vectorizable, int m, int k>
|
||||
struct TensorOpIteration
|
||||
{
|
||||
static inline void Loop(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, const OPFN& opfn, const ReductionOp& reductionOp,
|
||||
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, N>& regularStrides,
|
||||
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides)
|
||||
{
|
||||
// non-scalar case: still nested result loops left
|
||||
array<ptrdiff_t, N> strides;
|
||||
for (size_t i = 0; i < N; i++) // N = a small constant, this will be unrolled
|
||||
strides[i] = regularStrides[i][(size_t) k];
|
||||
for (size_t dim = regularOpDims[(size_t) k]; dim-- > 0;)
|
||||
{
|
||||
// need to descend into one loop deeper
|
||||
TensorOpIteration<ElemType, OPFN, ReductionOp, N, vectorizable, m, k - 1>::Loop(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
|
||||
// advance the pointers
|
||||
for (size_t i = 0; i < N; i++)
|
||||
pointers[i] += strides[i];
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Special version for innermost loop with strides all being 1 and no further reduction. Compiler can use SSE.
|
||||
// This is a very common case, e.g. adding vectors or computing the Sigmoid.
|
||||
template <class ElemType, typename OPFN, typename ReductionOp>
|
||||
struct TensorOpIteration<ElemType, OPFN, ReductionOp, 3, true /*vectorizable*/, -1 /*no reduction*/, 0 /*innermost loop*/>
|
||||
{
|
||||
static inline void Loop(ElemType beta, array<ElemType*, 3> pointers, ElemType alpha, const OPFN& opfn, const ReductionOp& reductionOp,
|
||||
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 3>& regularStrides,
|
||||
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 3>& reducingStrides)
|
||||
{
|
||||
ElemType* pa = pointers[0];
|
||||
ElemType* pb = pointers[1];
|
||||
ElemType* pc = pointers[2];
|
||||
size_t K = regularOpDims[0];
|
||||
// special-case beta and alpha to allow the compiler to short-circuit it
|
||||
if (beta != 0)
|
||||
#pragma omp parallel for
|
||||
for (int k = 0; k < (int) K; k++)
|
||||
TensorOpIteration<ElemType, OPFN, ReductionOp, 3, true /*vectorizable*/, -1 /*no reduction*/, -1 /*scalar*/>::Loop(beta, array<ElemType*, 3>{pa + k, pb + k, pc + k}, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
|
||||
else if (alpha != 1)
|
||||
#pragma omp parallel for
|
||||
for (int k = 0; k < (int) K; k++)
|
||||
TensorOpIteration<ElemType, OPFN, ReductionOp, 3, true /*vectorizable*/, -1 /*no reduction*/, -1 /*scalar*/>::Loop(0, array<ElemType*, 3>{pa + k, pb + k, pc + k}, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
|
||||
else
|
||||
#pragma omp parallel for
|
||||
for (int k = 0; k < (int) K; k++)
|
||||
TensorOpIteration<ElemType, OPFN, ReductionOp, 3, true /*vectorizable*/, -1 /*no reduction*/, -1 /*scalar*/>::Loop(0, array<ElemType*, 3>{pa + k, pb + k, pc + k}, 1, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
|
||||
// TODO: According to Amit, the VS compiler is not able to vectorize into lambdas. Solution: change the lambda to take an N, or to implement the loop inside (with 1 element by default).
|
||||
// TODO: The signedness of k (required for omp) causes an extra sign-extend.
|
||||
// TODO: OMP adds LOTS of overhead. Do we need a guard, a min size when to use it?
|
||||
}
|
||||
};
|
||||
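// The specialization above peels beta and alpha out of the innermost loop, so each of the three
// branches compiles to a simple loop the vectorizer can handle. The same idea in a minimal
// stand-alone form (illustrative, not the CNTK template):
template <typename F>
static void BinaryMapSketch(float beta, float alpha, const float* a, const float* b, float* out,
                            size_t n, F f)
{
    if (beta != 0)
        for (size_t i = 0; i < n; i++)
            out[i] = beta * out[i] + alpha * f(a[i], b[i]); // general case
    else if (alpha != 1)
        for (size_t i = 0; i < n; i++)
            out[i] = alpha * f(a[i], b[i]);                 // beta == 0: no read of out[]
    else
        for (size_t i = 0; i < n; i++)
            out[i] = f(a[i], b[i]);                         // beta == 0, alpha == 1: pure map
}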
// and unary
|
||||
template <class ElemType, typename OPFN, typename ReductionOp>
|
||||
struct TensorOpIteration<ElemType, OPFN, ReductionOp, 2, true /*vectorizable*/, -1 /*no reduction*/, 0 /*innermost loop*/>
|
||||
{
|
||||
static inline void Loop(ElemType beta, array<ElemType*, 2> pointers, ElemType alpha, const OPFN& opfn, const ReductionOp& reductionOp,
|
||||
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
|
||||
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides)
|
||||
{
|
||||
ElemType* pa = pointers[0];
|
||||
ElemType* pb = pointers[1];
|
||||
size_t K = regularOpDims[0];
|
||||
// special-case beta and alpha to allow the compiler to short-circuit it
|
||||
if (beta != 0)
|
||||
#pragma omp parallel for
|
||||
for (int k = 0; k < (int) K; k++)
|
||||
TensorOpIteration<ElemType, OPFN, ReductionOp, 2, true /*vectorizable*/, -1 /*no reduction*/, -1 /*scalar*/>::Loop(beta, array<ElemType*, 2>{pa + k, pb + k}, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
|
||||
else if (alpha != 1)
|
||||
#pragma omp parallel for
|
||||
for (int k = 0; k < (int) K; k++)
|
||||
TensorOpIteration<ElemType, OPFN, ReductionOp, 2, true /*vectorizable*/, -1 /*no reduction*/, -1 /*scalar*/>::Loop(0, array<ElemType*, 2>{pa + k, pb + k}, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
|
||||
else
|
||||
#pragma omp parallel for
|
||||
for (int k = 0; k < (int) K; k++)
|
||||
TensorOpIteration<ElemType, OPFN, ReductionOp, 2, true /*vectorizable*/, -1 /*no reduction*/, -1 /*scalar*/>::Loop(0, array<ElemType*, 2>{pa + k, pb + k}, 1, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
|
||||
}
|
||||
};
|
||||
|
||||
template <class ElemType, typename OPFN, typename ReductionOp, size_t N, bool vectorizable, int m>
|
||||
struct TensorOpIteration<ElemType, OPFN, ReductionOp, N, vectorizable, m, -1>
|
||||
{
|
||||
static inline void Loop(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, const OPFN& opfn, const ReductionOp& reductionOp,
|
||||
const SmallVector<size_t>&, const array<SmallVector<ptrdiff_t>, N>&,
|
||||
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides)
|
||||
{
|
||||
// we are at element level for the result: perform the op (there may still be reduction)
|
||||
ElemType val = TensorOpReduction<ElemType, OPFN, ReductionOp, N, m>::Loop(pointers, opfn, reductionOp, reducingOpDims, reducingStrides);
|
||||
// scale
|
||||
val *= alpha;
|
||||
// combine with previous value in target matrix, then write it out
|
||||
auto* pout = pointers.back();
|
||||
if (beta != 0)
|
||||
val += beta * *pout;
|
||||
// save
|
||||
*pout = val;
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
// perform loop over regular index k and reducing index m for N operands (counting the output), the difference
|
||||
// between TensorOpIteration and TensorArgOpIteration is that the latter stores the index of the result instead of
// the result itself. They aren't combined because of performance.
|
||||
template <class ElemType, size_t N, int k>
|
||||
struct TensorArgOpIteration
|
||||
{
|
||||
static inline void Loop(array<ElemType*, N> pointers,
|
||||
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, N>& regularStrides,
|
||||
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides, ElementWiseOperator reductionOp)
|
||||
{
|
||||
// non-scalar case: still nested result loops left
|
||||
array<ptrdiff_t, N> strides;
|
||||
for (size_t i = 0; i < N; i++) // N = a small constant, this will be unrolled
|
||||
strides[i] = regularStrides[i][(size_t)k];
|
||||
for (size_t dim = regularOpDims[(size_t)k]; dim-- > 0;)
|
||||
{
|
||||
// need to descend into one loop deeper
|
||||
TensorArgOpIteration<ElemType, N, k - 1>::Loop(pointers, regularOpDims, regularStrides, reducingOpDims, reducingStrides, reductionOp);
|
||||
// advance the pointers
|
||||
for (size_t i = 0; i < N; i++)
|
||||
pointers[i] += strides[i];
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <class ElemType, size_t N>
|
||||
struct TensorArgOpIteration<ElemType, N, -1>
|
||||
{
|
||||
static inline void Loop(array<ElemType*, N> pointers,
|
||||
const SmallVector<size_t>&, const array<SmallVector<ptrdiff_t>, N>&,
|
||||
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides, ElementWiseOperator reductionOp)
|
||||
{
|
||||
// we are at element level for the result: perform the op (there may still be reduction)
|
||||
auto val = TensorArgOpReduction<ElemType, N, 2>::ReduceAll(pointers, reducingOpDims, reducingStrides, reductionOp);
|
||||
|
||||
auto* pout = pointers.back();
|
||||
*pout = (ElemType)val.second;
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// map runtime parameters N to template parameters
|
||||
// -----------------------------------------------------------------------
|
||||
|
||||
// tensor operation with k+1 dimensions (-1 means scalar)
|
||||
template <class ElemType, typename OPFN, typename ReductionOp, size_t N, int k>
|
||||
static void TensorOpWithRegularLoop(ElemType beta, const array<ElemType*, N>& pointers, ElemType alpha, const OPFN& opfn, ReductionOp reductionOp,
|
||||
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, N>& regularStrides,
|
||||
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides)
|
||||
{
|
||||
size_t dims = reducingOpDims.size();
|
||||
switch (dims)
|
||||
{
|
||||
case 2:
|
||||
return TensorOpIteration<ElemType, OPFN, ReductionOp, N, false /*vectorizable*/, 1, k>::Loop(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
|
||||
case 1:
|
||||
return TensorOpIteration<ElemType, OPFN, ReductionOp, N, false /*vectorizable*/, 0, k>::Loop(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
|
||||
case 0:
|
||||
{
|
||||
// if all leading strides are 1, we can let the compiler do some unrolling
|
||||
bool leadingAllOne = true;
|
||||
for (size_t i = 0; i < N; i++)
|
||||
leadingAllOne &= k >= 0 && regularStrides[i][0] == 1;
|
||||
if (leadingAllOne) // special version that uses a hard-coded increment of 1 for all leading dimensions
|
||||
return TensorOpIteration<ElemType, OPFN, ReductionOp, N, true /*vectorizable*/, -1, k>::Loop(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
|
||||
else
|
||||
return TensorOpIteration<ElemType, OPFN, ReductionOp, N, false /*vectorizable*/, -1, k>::Loop(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
|
||||
}
|
||||
default:
|
||||
LogicError("TensorOp: %d non-flattened reduction dimensions are not supported.", (int) dims);
|
||||
}
|
||||
}
|
||||
|
||||
// tensor operation, generalized in number of arguments, operation already provided as a lambda
|
||||
// This function now expands into different k.
|
||||
template <class ElemType, typename OPFN, typename ReductionOp, size_t N>
|
||||
static void TensorOpWithFnAndReduction(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, const OPFN& opfn, const ReductionOp& reductionOp,
|
||||
const array<size_t, N>& offsets,
|
||||
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, N>& regularStrides,
|
||||
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides)
|
||||
{
|
||||
for (size_t i = 0; i < N; i++) // N = a small constant, this will be unrolled
|
||||
pointers[i] += offsets[i];
|
||||
size_t dims = regularOpDims.size();
|
||||
switch (dims)
|
||||
{
|
||||
// N.B. consider code size impact when adding more cases.
|
||||
case 5:
|
||||
return TensorOpWithRegularLoop<ElemType, OPFN, ReductionOp, N, 4>(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
|
||||
case 4:
|
||||
return TensorOpWithRegularLoop<ElemType, OPFN, ReductionOp, N, 3>(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
|
||||
case 3:
|
||||
return TensorOpWithRegularLoop<ElemType, OPFN, ReductionOp, N, 2>(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
|
||||
case 2:
|
||||
return TensorOpWithRegularLoop<ElemType, OPFN, ReductionOp, N, 1>(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
|
||||
case 1:
|
||||
return TensorOpWithRegularLoop<ElemType, OPFN, ReductionOp, N, 0>(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
|
||||
case 0:
|
||||
return TensorOpWithRegularLoop<ElemType, OPFN, ReductionOp, N, -1>(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
|
||||
default:
|
||||
LogicError("TensorOp: %d non-flattened input dimensions are not supported.", (int)dims);
|
||||
}
|
||||
}
|
||||
|
||||
// tensor operation, generalized in number of arguments, operation already provided as a lambda
|
||||
// This function now expands into different reductionOps
|
||||
template <class ElemType, typename OPFN, size_t N>
|
||||
static void TensorOpWithFn(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, const OPFN& opfn, ElementWiseOperator reductionOp,
|
||||
const array<size_t, N>& offsets,
|
||||
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, N>& regularStrides,
|
||||
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides)
|
||||
{
|
||||
// BUGBUG: Always using 'double' as the aggregator type, even for ElemType==float. Reason: otherwise some e2e tests would fail, as historically we
// used double as the sum aggregator. But:
// * for min and max reductions this is meaningless.
// * It is not consistent with what we do on GPU, where we aggregate on ElemType.
// * It costs performance.
// TODO: adapt e2e tests to run with an aggregator of type ElemType.
|
||||
#define CaseTensorOpWithFnAndReduction(oper) \
|
||||
case ElementWiseOperator::op##oper: \
|
||||
return TensorOpWithFnAndReduction(beta, pointers, alpha, opfn, [](double a, double b) \
|
||||
{ \
|
||||
return Op##oper(a, b); \
|
||||
}, \
|
||||
offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides)
|
||||
|
||||
switch (reductionOp)
|
||||
{
|
||||
CaseTensorOpWithFnAndReduction(Sum);
|
||||
CaseTensorOpWithFnAndReduction(LogSum);
|
||||
CaseTensorOpWithFnAndReduction(Min);
|
||||
CaseTensorOpWithFnAndReduction(Max);
|
||||
CaseTensorOpWithFnAndReduction(ElementwiseProduct);
|
||||
default:
|
||||
LogicError("Specified ElementWiseOperator op %d not supported as reduction operation.", (int)reductionOp);
|
||||
}
|
||||
}
|
||||
|
||||
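// The reduction lambdas above forward to the Op##oper helpers from TensorOps.h. For opLogSum the
// pairwise combine is the usual numerically stable log-add-exp; a hedged reference version (the
// in-tree OpLogSum may differ in details):
static inline double LogAddSketch(double a, double b)
{
    double hi = a > b ? a : b;
    double lo = a > b ? b : a;
    // log(exp(a) + exp(b)) without overflowing exp(); exp(lo - hi) <= 1 by construction.
    return hi + log1p(exp(lo - hi));
}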
// -----------------------------------------------------------------------
|
||||
// entry points from Matrix.cpp; also map op to a lambda
|
||||
// -----------------------------------------------------------------------
|
||||
|
||||
// perform unary operation 'op' on 'a', giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides
|
||||
// This maps 'op' to a lambda.
|
||||
template <class ElemType>
|
||||
void CPUMatrixTensorOpImpl(ElemType beta, const CPUMatrix<ElemType>& a, CPUMatrix<ElemType>& o, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
|
||||
const array<size_t, 2>& offsets,
|
||||
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
|
||||
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides)
|
||||
{
|
||||
if (reductionOp != ElementWiseOperator::opSum &&
|
||||
reductionOp != ElementWiseOperator::opLogSum &&
|
||||
reductionOp != ElementWiseOperator::opMin &&
|
||||
reductionOp != ElementWiseOperator::opMax &&
|
||||
reductionOp != ElementWiseOperator::opElementwiseProduct)
|
||||
InvalidArgument("TensorOp: Unary reduction operations other than opMax, opMin, opSum, and opLogSum are not implemented.");
|
||||
|
||||
// TODO: Change the lambda to take a pointer and a number of elements, so that we can pass it 1 or 4 elements, in order for it to SSE-vectorize.
|
||||
#define CaseUnaryTensorOp(oper) \
|
||||
case ElementWiseOperator::op##oper: \
|
||||
return TensorOpWithFn(beta, pointers, alpha, [](const array<ElemType*, 2>& pp) \
|
||||
{ \
|
||||
return Op##oper((*(pp[0]))); \
|
||||
}, \
|
||||
reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides)
|
||||
|
||||
array<ElemType*, 2> pointers = {a.Data(), o.Data()};
|
||||
switch (op)
|
||||
{
|
||||
ForAllUnaryOps(CaseUnaryTensorOp);
|
||||
default:
|
||||
LogicError("TensorOp: Unknown unary op code %d.", (int) op);
|
||||
}
|
||||
}
|
||||
|
||||
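// ForAllUnaryOps is an X-macro: it expands the CaseUnaryTensorOp(oper) argument once per unary op,
// generating one switch case per operation. A toy illustration of the pattern with a hypothetical
// op list (not CNTK's):
#define FOR_ALL_TOY_OPS(Macro) \
    Macro(Sqrt)                \
    Macro(Abs)                 \
    Macro(Negate)

enum class ToyOp { opSqrt, opAbs, opNegate };

static inline const char* ToyOpName(ToyOp op)
{
#define CaseToyOpName(oper) \
    case ToyOp::op##oper:   \
        return #oper;
    switch (op)
    {
        FOR_ALL_TOY_OPS(CaseToyOpName) // expands to one case label per op in the list
    default:
        return "unknown";
    }
#undef CaseToyOpName
}
#undef FOR_ALL_TOY_OPS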
// perform binary operation 'op' on a and b giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides
|
||||
// This maps 'op' to a lambda.
|
||||
template <class ElemType>
|
||||
void CPUMatrixTensorOpImpl(ElemType beta, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, CPUMatrix<ElemType>& o, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
|
||||
const array<size_t, 3>& offsets,
|
||||
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 3>& regularStrides,
|
||||
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 3>& reducingStrides)
|
||||
{
|
||||
if (reductionOp != ElementWiseOperator::opSum)
|
||||
InvalidArgument("TensorOp (binary): The only permitted binary reduction operation is opSum.");
|
||||
|
||||
#define CaseBinaryTensorOp(oper) \
|
||||
case ElementWiseOperator::op##oper: \
|
||||
return TensorOpWithFn(beta, pointers, alpha, [](const array<ElemType*, 3>& pp) \
|
||||
{ \
|
||||
return Op##oper((*(pp[0])), (*(pp[1]))); \
|
||||
}, \
|
||||
reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides)
|
||||
|
||||
array<ElemType*, 3> pointers = {a.Data(), b.Data(), o.Data()};
|
||||
switch (op)
|
||||
{
|
||||
ForAllBinaryOps(CaseBinaryTensorOp);
|
||||
default:
|
||||
LogicError("TensorOp: Unknown op binary code %d.", (int) op);
|
||||
}
|
||||
}
|
||||
|
||||
// perform ternary operation 'op' on a, b, and c, giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides
|
||||
// This maps 'op' to a lambda.
|
||||
template <class ElemType>
|
||||
void CPUMatrixTensorOpImpl(ElemType beta, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, const CPUMatrix<ElemType>& c, CPUMatrix<ElemType>& o, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
|
||||
const array<size_t, 4>& offsets,
|
||||
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 4>& regularStrides,
|
||||
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 4>& reducingStrides)
|
||||
{
|
||||
if (reductionOp != ElementWiseOperator::opSum)
|
||||
InvalidArgument("TensorOp: The only permitted ternary reduction operation is opSum.");
|
||||
|
||||
#define CaseTernaryTensorOp(oper) \
|
||||
case ElementWiseOperator::op##oper: \
|
||||
return TensorOpWithFn(beta, pointers, alpha, [](const array<ElemType*, 4>& pp) \
|
||||
{ \
|
||||
return Op##oper((*(pp[0])), (*(pp[1])), (*(pp[2]))); \
|
||||
}, \
|
||||
reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides)
|
||||
|
||||
array<ElemType*, 4> pointers = {a.Data(), b.Data(), c.Data(), o.Data()};
|
||||
switch (op)
|
||||
{
|
||||
ForAllTernaryOps(CaseTernaryTensorOp);
|
||||
default:
|
||||
LogicError("TensorOp: Unknown ternary op code %d.", (int) op);
|
||||
}
|
||||
}
|
||||
|
||||
template <class ElemType>
|
||||
void CPUMatrixTensorArgOpImpl(const CPUMatrix<ElemType>& a, CPUMatrix<ElemType>& o, ElementWiseOperator reductionOp,
|
||||
const array<size_t, 2>& offsets,
|
||||
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
|
||||
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides)
|
||||
{
|
||||
if (reductionOp != ElementWiseOperator::opArgmin &&
|
||||
reductionOp != ElementWiseOperator::opArgmax)
|
||||
InvalidArgument("TensorOp: Arg reduction operations other than opArgmax, and opArgmin are not implemented.");
|
||||
|
||||
if (o.GetNumElements() == 1)
|
||||
{
|
||||
o.Data()[0] = (ElemType) a.ArgOp(reductionOp);
|
||||
}
|
||||
else
|
||||
{
|
||||
const size_t N = 2;
|
||||
array<ElemType*, N> pointers = { a.Data(), o.Data() };
|
||||
for (size_t i = 0; i < N; i++)
|
||||
pointers[i] += offsets[i];
|
||||
|
||||
switch (regularOpDims.size())
|
||||
{
|
||||
case 2:
|
||||
TensorArgOpIteration<ElemType, N, 1>::Loop(pointers, regularOpDims, regularStrides, reducingOpDims, reducingStrides, reductionOp);
|
||||
break;
|
||||
case 1:
|
||||
TensorArgOpIteration<ElemType, N, 0>::Loop(pointers, regularOpDims, regularStrides, reducingOpDims, reducingStrides, reductionOp);
|
||||
break;
|
||||
case 0:
|
||||
TensorArgOpIteration<ElemType, N, -1>::Loop(pointers, regularOpDims, regularStrides, reducingOpDims, reducingStrides, reductionOp);
|
||||
break;
|
||||
default:
|
||||
LogicError("TensorOp: %d non-flattened input dimensions are not supported.", (int)regularOpDims.size());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}}}
|
|
@ -20,6 +20,7 @@
|
|||
#ifdef LEAKDETECT
|
||||
#include <vld.h>
|
||||
#endif
|
||||
#include "half.hpp"
|
||||
|
||||
#pragma warning(disable : 4127) // conditional expression is constant; "if (sizeof(ElemType)==sizeof(float))" triggers this
|
||||
|
||||
|
@ -531,7 +532,7 @@ void CPUSparseMatrix<ElemType>::Print(const char* matrixName, ptrdiff_t /*rowSta
|
|||
fprintf(stderr, "\n");
|
||||
j++;
|
||||
}
|
||||
fprintf(stderr, "%d:%.f ", unCompressedIndex[i], dataBuffer[i]);
|
||||
fprintf(stderr, "%d:%.f ", unCompressedIndex[i], (double)dataBuffer[i]);
|
||||
}
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
@ -1423,7 +1424,8 @@ ElemType CPUSparseMatrix<ElemType>::Adagrad(CPUMatrix<ElemType>& c, const bool n
|
|||
}
|
||||
|
||||
template <class ElemType>
|
||||
void CPUSparseMatrix<ElemType>::AdaDelta(CPUMatrix<ElemType>& c, CPUMatrix<ElemType>& functionValues, ElemType learningRate, ElemType rho, ElemType epsilon, int* timestamps, int currentTimestamp)
|
||||
template <class AccumType>
|
||||
void CPUSparseMatrix<ElemType>::AdaDelta(CPUMatrix<AccumType>& c, CPUMatrix<AccumType>& functionValues, AccumType learningRate, AccumType rho, AccumType epsilon, int* timestamps, int currentTimestamp)
|
||||
{
|
||||
size_t numColsNeeded = 2 * GetNumCols();
|
||||
|
||||
|
@ -1441,9 +1443,9 @@ void CPUSparseMatrix<ElemType>::AdaDelta(CPUMatrix<ElemType>& c, CPUMatrix<ElemT
|
|||
|
||||
size_t n = GetNumElements();
|
||||
ElemType* grad = Data();
|
||||
ElemType* smoothAda = c.Data();
|
||||
ElemType* smoothX2 = c.Data() + n;
|
||||
ElemType* val = functionValues.Data();
|
||||
AccumType* smoothAda = c.Data();
|
||||
AccumType* smoothX2 = c.Data() + n;
|
||||
AccumType* val = functionValues.Data();
|
||||
auto rows = GetNumRows();
|
||||
|
||||
#pragma omp parallel for
|
||||
|
@ -1459,10 +1461,10 @@ void CPUSparseMatrix<ElemType>::AdaDelta(CPUMatrix<ElemType>& c, CPUMatrix<ElemT
|
|||
{
|
||||
size_t denseIndex = columnOffset + row;
|
||||
ElemType g = grad[blockOffset + row];
|
||||
ElemType adaSqr = rho * decay * smoothAda[denseIndex] + (1 - rho) * g * g;
|
||||
AccumType adaSqr = rho * decay * smoothAda[denseIndex] + (1 - rho) * g * g;
|
||||
smoothAda[denseIndex] = adaSqr;
|
||||
ElemType x2 = decay * smoothX2[denseIndex];
|
||||
ElemType deltaX = -sqrt(x2 + epsilon) / sqrt(adaSqr + epsilon) * g;
|
||||
AccumType x2 = decay * smoothX2[denseIndex];
|
||||
AccumType deltaX = -sqrt(x2 + epsilon) / sqrt(adaSqr + epsilon) * g;
|
||||
smoothX2[denseIndex] = rho * x2 + (1 - rho) * deltaX * deltaX;
|
||||
val[denseIndex] += learningRate * deltaX;
|
||||
}
|
||||
|
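// For reference, the plain AdaDelta step that the sparse loop above applies per stored column,
// written as a stand-alone sketch with a lower-precision gradient type (GradType) and
// full-precision state (AccumType), mirroring the new template parameter. Hypothetical helper;
// it ignores the timestamp-based decay the sparse version applies to lazily updated columns.
template <typename GradType, typename AccumType>
static void AdaDeltaStepSketch(const GradType* grad, AccumType* smoothAda, AccumType* smoothX2,
                               AccumType* val, size_t n,
                               AccumType learningRate, AccumType rho, AccumType epsilon)
{
    for (size_t i = 0; i < n; i++)
    {
        AccumType g = (AccumType)grad[i];                              // widen the gradient
        AccumType adaSqr = rho * smoothAda[i] + (1 - rho) * g * g;     // running E[g^2]
        smoothAda[i] = adaSqr;
        AccumType deltaX = -sqrt(smoothX2[i] + epsilon) / sqrt(adaSqr + epsilon) * g;
        smoothX2[i] = rho * smoothX2[i] + (1 - rho) * deltaX * deltaX; // running E[dx^2]
        val[i] += learningRate * deltaX;
    }
}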
@ -1708,6 +1710,18 @@ ElemType CPUSparseMatrix<ElemType>::SumOfElements() const
|
|||
return sum;
|
||||
}
|
||||
|
||||
// specializations that RuntimeError for now, because the OpenMP implementation only supports built-in types
|
||||
template <>
|
||||
half CPUSparseMatrix<half>::FrobeniusNorm() const
|
||||
{
|
||||
RuntimeError("half FrobeniusNorm not supported.");
|
||||
}
|
||||
template <>
|
||||
half CPUSparseMatrix<half>::SumOfElements() const
|
||||
{
|
||||
RuntimeError("half SumOfElements not supported.");
|
||||
}
|
||||
|
||||
template <typename ElemType>
|
||||
MATH_API File& operator>>(File& stream, CPUSparseMatrix<ElemType>& us)
|
||||
{
|
||||
|
@ -1763,8 +1777,54 @@ MATH_API File& operator>>(File& stream, CPUSparseMatrix<ElemType>& us)
|
|||
template MATH_API File& operator>>(File& stream, CPUSparseMatrix<float>& us);
|
||||
template MATH_API File& operator>>(File& stream, CPUSparseMatrix<double>& us);
|
||||
|
||||
template <typename ElemType>
|
||||
MATH_API File& operator<<(File& stream, const CPUSparseMatrix<ElemType>& us)
|
||||
{
|
||||
if (us.GetFormat() != matrixFormatSparseCSC && us.GetFormat() != matrixFormatSparseCSR)
|
||||
NOT_IMPLEMENTED;
|
||||
|
||||
stream.PutMarker(fileMarkerBeginSection, std::wstring(L"BMAT"));
|
||||
stream << sizeof(ElemType);
|
||||
stream << std::wstring(L"nnmatrix"); // Note this is needed for compatibility, and could potentially be an empty string
|
||||
|
||||
size_t nz = us.NzCount(), numRows = us.GetNumRows(), numCols = us.GetNumCols(); // initialize before streaming them out below
|
||||
size_t compressedSize = us.SecondaryIndexCount();
|
||||
int format = us.GetFormat();
|
||||
|
||||
stream << format << nz << numCols << numRows;
|
||||
|
||||
if (nz > 0)
|
||||
{
|
||||
ElemType* dataBuffer = us.NzValues();
|
||||
CPUSPARSE_INDEX_TYPE* unCompressedIndex = us.MajorIndexLocation();
|
||||
CPUSPARSE_INDEX_TYPE* compressedIndex = us.SecondaryIndexLocation();
|
||||
|
||||
for (size_t i = 0; i < nz; ++i)
|
||||
{
|
||||
stream << dataBuffer[i];
|
||||
}
|
||||
for (size_t i = 0; i < nz; ++i)
|
||||
{
|
||||
stream << unCompressedIndex[i];
|
||||
}
|
||||
for (size_t i = 0; i < compressedSize; ++i)
|
||||
{
|
||||
stream << compressedIndex[i];
|
||||
}
|
||||
}
|
||||
stream.PutMarker(fileMarkerEndSection, std::wstring(L"EMAT"));
|
||||
|
||||
return stream;
|
||||
}
|
||||
|
||||
template class CPUSparseMatrix<float>;
|
||||
template class CPUSparseMatrix<double>;
|
||||
template class CPUSparseMatrix<half>;
|
||||
|
||||
// instantiate learner methods
|
||||
template void CPUSparseMatrix<float>::AdaDelta(CPUMatrix<float>& c, CPUMatrix<float>& functionValues, float learningRate, float rho, float epsilon, int* timestamps, int currentTimestamp);
|
||||
template void CPUSparseMatrix<double>::AdaDelta(CPUMatrix<double>& c, CPUMatrix<double>& functionValues, double learningRate, double rho, double epsilon, int* timestamps, int currentTimestamp);
|
||||
template void CPUSparseMatrix<half>::AdaDelta(CPUMatrix<float>& c, CPUMatrix<float>& functionValues, float learningRate, float rho, float epsilon, int* timestamps, int currentTimestamp);
|
||||
|
||||
// We use Matrix<char> as the backing store for QuantizedMatrix
|
||||
// Let's explicitly instantiate the methods we need for that purpose
|
||||
|
|
|
@ -232,7 +232,9 @@ public:
|
|||
public:
|
||||
void NormalGrad(CPUMatrix<ElemType>& c, const ElemType momentum, ElemType unitGainFactor);
|
||||
ElemType Adagrad(CPUMatrix<ElemType>& c, const bool needAveMultiplier);
|
||||
void AdaDelta(CPUMatrix<ElemType>& c, CPUMatrix<ElemType>& functionValues, ElemType learningRate, ElemType rho, ElemType epsilon, int* timestamps, int currentTimestamp);
|
||||
|
||||
template<typename AccumType>
|
||||
void AdaDelta(CPUMatrix<AccumType>& c, CPUMatrix<AccumType>& functionValues, AccumType learningRate, AccumType rho, AccumType epsilon, int* timestamps, int currentTimestamp);
|
||||
|
||||
public:
|
||||
CPUSparseMatrix<ElemType>& InplaceTruncateTop(const ElemType threshold);
|
||||
|
|
|
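The AdaDelta overload declared above takes its accumulator matrices in a separate AccumType, which is what lets the earlier CPUSparseMatrix<half> instantiation keep its smoothed state and parameters in CPUMatrix<float> while the sparse gradient stays in half. A hedged host-side sketch of that widening idea (names and the scalar loop are illustrative, not CNTK's implementation):

```cuda
#include <cmath>
#include <cstdio>
#include <vector>

// Gradient values arrive in a narrow GradType; the AdaDelta state and the
// parameters are kept and updated in a wider AccumType.
template <typename GradType, typename AccumType>
void AdaDeltaStep(std::vector<AccumType>& weights,
                  const std::vector<GradType>& grads,
                  std::vector<AccumType>& avgSqGrad,
                  std::vector<AccumType>& avgSqDelta,
                  AccumType learningRate, AccumType rho, AccumType epsilon)
{
    for (size_t i = 0; i < weights.size(); ++i)
    {
        AccumType g = static_cast<AccumType>(grads[i]);         // widen before accumulating
        avgSqGrad[i] = rho * avgSqGrad[i] + (1 - rho) * g * g;
        AccumType delta = -std::sqrt((avgSqDelta[i] + epsilon) / (avgSqGrad[i] + epsilon)) * g;
        avgSqDelta[i] = rho * avgSqDelta[i] + (1 - rho) * delta * delta;
        weights[i] += learningRate * delta;
    }
}

int main()
{
    std::vector<double> weights    = {0.5, -0.25};   // wide state
    std::vector<float>  grads      = {0.1f, -0.3f};  // narrow gradient (stand-in for half)
    std::vector<double> avgSqGrad  = {0.0, 0.0};
    std::vector<double> avgSqDelta = {0.0, 0.0};
    AdaDeltaStep(weights, grads, avgSqGrad, avgSqDelta, 1.0, 0.95, 1e-6);
    std::printf("w0=%f w1=%f\n", weights[0], weights[1]);
    return 0;
}
```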
@ -1,23 +1,11 @@
|
|||
//
|
||||
// Copyright (c) Microsoft. All rights reserved.
|
||||
// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
|
||||
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#pragma warning(push)
|
||||
#pragma warning(disable : 4100) // 'identifier': unreferenced formal parameter
|
||||
#pragma warning(disable : 4127) // conditional expression is constant
|
||||
#pragma warning(disable : 4201) // nonstandard extension used: nameless struct/union
|
||||
#pragma warning(disable : 4458) // declaration of 'identifier' hides class member
|
||||
#pragma warning(disable : 4515) // 'namespace': namespace uses itself
|
||||
#endif
|
||||
#include <cub/cub.cuh>
|
||||
#ifdef _MSC_VER
|
||||
#pragma warning(pop)
|
||||
#endif
|
||||
|
||||
namespace Microsoft { namespace MSR { namespace CNTK {
|
||||
|
||||
size_t RoundUpToMultiple(size_t n, size_t blockSize)
|
||||
|
@ -40,7 +28,7 @@ cudaError_t GetLastCudaError()
|
|||
#endif
|
||||
return cudaSuccess;
|
||||
}
|
||||
|
||||
/*
|
||||
template <int U, typename T>
|
||||
__device__ __forceinline__ void LoadValues(const T* src, T dst[U])
|
||||
{
|
||||
|
@ -48,9 +36,17 @@ __device__ __forceinline__ void LoadValues(const T* src, T dst[U])
|
|||
for (int i = 0; i < U; i++)
|
||||
dst[i] = src[i];
|
||||
}
|
||||
*/
|
||||
template <int U, typename T1, typename T2>
|
||||
__device__ __forceinline__ void LoadValues(const T1* src, T2 dst[U])
|
||||
{
|
||||
#pragma unroll
|
||||
for (int i = 0; i < U; i++)
|
||||
dst[i] = (T2)src[i];
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ __forceinline__ void LoadValues<2, float>(const float* src, float dst[2])
|
||||
__device__ __forceinline__ void LoadValues<2, float, float>(const float* src, float dst[2])
|
||||
{
|
||||
// src must be aligned at 8 bytes boundary.
|
||||
assert(reinterpret_cast<uintptr_t>(src) % (sizeof(dst)) == 0);
|
||||
|
@ -60,7 +56,7 @@ __device__ __forceinline__ void LoadValues<2, float>(const float* src, float dst
|
|||
}
|
||||
|
||||
template <>
|
||||
__device__ __forceinline__ void LoadValues<4, float>(const float* src, float dst[4])
|
||||
__device__ __forceinline__ void LoadValues<4, float, float>(const float* src, float dst[4])
|
||||
{
|
||||
// src must be aligned at 16 bytes boundary.
|
||||
assert(reinterpret_cast<uintptr_t>(src) % (sizeof(dst)) == 0);
|
||||
|
@ -73,7 +69,7 @@ __device__ __forceinline__ void LoadValues<4, float>(const float* src, float dst
|
|||
dst[2] = v.z;
|
||||
dst[3] = v.w;
|
||||
}
|
||||
|
||||
/*
|
||||
template <int U, typename T>
|
||||
__device__ __forceinline__ void StoreValues(const T src[U], T* dst)
|
||||
{
|
||||
|
@ -81,9 +77,17 @@ __device__ __forceinline__ void StoreValues(const T src[U], T* dst)
|
|||
for (int i = 0; i < U; i++)
|
||||
dst[i] = src[i];
|
||||
}
|
||||
*/
|
||||
template <int U, typename T1, typename T2>
|
||||
__device__ __forceinline__ void StoreValues(const T1 src[U], T2* dst)
|
||||
{
|
||||
#pragma unroll
|
||||
for (int i = 0; i < U; i++)
|
||||
dst[i] = (T2)src[i];
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ __forceinline__ void StoreValues<2, float>(const float src[2], float* dst)
|
||||
__device__ __forceinline__ void StoreValues<2, float, float>(const float src[2], float* dst)
|
||||
{
|
||||
// dst must be aligned at 8 bytes boundary.
|
||||
assert(reinterpret_cast<uintptr_t>(dst) % (sizeof(src)) == 0);
|
||||
|
@ -94,7 +98,7 @@ __device__ __forceinline__ void StoreValues<2, float>(const float src[2], float*
|
|||
}
|
||||
|
||||
template <>
|
||||
__device__ __forceinline__ void StoreValues<4, float>(const float src[4], float* dst)
|
||||
__device__ __forceinline__ void StoreValues<4, float, float>(const float src[4], float* dst)
|
||||
{
|
||||
// dst must be aligned at 16 bytes boundary.
|
||||
assert(reinterpret_cast<uintptr_t>(dst) % (sizeof(src)) == 0);
|
||||
|
@ -107,12 +111,16 @@ __device__ __forceinline__ void StoreValues<4, float>(const float src[4], float*
|
|||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ __forceinline__ T Shuffle(T input, int srcLane)
|
||||
__device__ __forceinline__ T Shuffle(T input, int srcLane, unsigned int mask)
|
||||
{
|
||||
#ifdef __CUDA_ARCH__
|
||||
// shfl is supported only on Kepler+
|
||||
static_assert(__CUDA_ARCH__ >= 300, "CNTK supports only the Kepler GPU architecture or newer.");
|
||||
#if CUDA_VERSION >= 9000
|
||||
return cub::ShuffleIndex(input, srcLane, CUB_PTX_WARP_THREADS, mask); // Need cub > 1.7.0
|
||||
#else
|
||||
return cub::ShuffleIndex(input, srcLane);
|
||||
#endif
|
||||
#else
|
||||
assert(false);
|
||||
return input; // keep compiler happy
|
||||
|
@ -136,6 +144,15 @@ namespace Operations
|
|||
assert(::isfinite(a) && a > 0);
|
||||
return rsqrt(a);
|
||||
}
|
||||
|
||||
__device__ half RSqrt(half a)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 600
|
||||
return hrsqrt(a);
|
||||
#else
|
||||
return __float2half(rsqrtf(__half2float(a)));
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
// This function is used to select correct unroll factor.
|
||||
|
@ -151,6 +168,17 @@ void Call(size_t vectorSize, Targs... args)
|
|||
Func<1>::template Call<T>(args...);
|
||||
}
|
||||
|
||||
template <template <int> class Func, typename T1, typename T2, typename ...Targs>
|
||||
void Call2(size_t vectorSize, Targs... args)
|
||||
{
|
||||
if ((vectorSize % 4) == 0)
|
||||
Func<4>::template Call<T1, T2>(args...);
|
||||
else if ((vectorSize % 2) == 0)
|
||||
Func<2>::template Call<T1, T2>(args...);
|
||||
else
|
||||
Func<1>::template Call<T1, T2>(args...);
|
||||
}
|
||||
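Call2 mirrors Call but forwards two element types, so kernels that read ElemType data while writing StatType statistics can still be dispatched on the largest unroll factor that divides the vector size. A compilable toy version of the same dispatch pattern (PrintUnroll is a made-up functor used only for illustration):

```cuda
#include <cstdio>

// Pick an unroll factor from the vector size at run time and forward to a
// functor specialized on that factor, exactly as the dispatcher above does.
template <int U>
struct PrintUnroll
{
    template <typename T1, typename T2>
    static void Call()
    {
        std::printf("dispatched with unroll factor %d\n", U);
    }
};

template <template <int> class Func, typename T1, typename T2, typename... Targs>
void Call2(size_t vectorSize, Targs... args)
{
    if ((vectorSize % 4) == 0)
        Func<4>::template Call<T1, T2>(args...);
    else if ((vectorSize % 2) == 0)
        Func<2>::template Call<T1, T2>(args...);
    else
        Func<1>::template Call<T1, T2>(args...);
}

int main()
{
    Call2<PrintUnroll, float, float>(128); // unroll 4
    Call2<PrintUnroll, float, float>(6);   // unroll 2
    Call2<PrintUnroll, float, float>(7);   // unroll 1
    return 0;
}
```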
|
||||
//--------------------------------------------------------------------
|
||||
// Mean and variance computation
|
||||
//--------------------------------------------------------------------
|
||||
|
@ -186,15 +214,16 @@ void Call(size_t vectorSize, Targs... args)
|
|||
// 0 < blendFactor < 1 - blend new running mean/var with averaged mean/var of the current minibatch, e.g.,
|
||||
// new xMean = (1 - blendFactor) * actual batch mean + blendFactor * new runMean
|
||||
// blendFactor == 0 - use actual batch mean/var
|
||||
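A small numeric illustration of the two factors described above, with made-up values: expAvgFactor folds the minibatch statistic into the running value, and blendFactor then interpolates between the running value and the batch value that is actually used for normalization.

```cuda
#include <cstdio>

int main()
{
    double batchMean = 0.80, runMean = 0.50;
    double expAvgFactor = 0.1, blendFactor = 0.25;

    runMean = expAvgFactor * batchMean + (1 - expAvgFactor) * runMean;        // 0.53
    double usedMean = blendFactor * runMean + (1 - blendFactor) * batchMean;  // 0.7325

    std::printf("updated running mean = %.4f, mean used for normalization = %.4f\n",
                runMean, usedMean);
    return 0;
}
```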
template <int BlockDimX, int BlockDimY, int U, typename ElemType>
|
||||
template <int BlockDimX, int BlockDimY, int U, typename ElemType, typename StatType>
|
||||
__global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize,
|
||||
const ElemType* x, // (in) input data
|
||||
double expAvgFactor, // TODO why not ElemType? same for the other parameters, functions?
|
||||
double blendFactor,
|
||||
ElemType* runMean, ElemType* runVariance, // (in/out) running mean/variance, gets updated with current minibatch
|
||||
StatType* runMean, StatType* runVariance, // (in/out) running mean/variance, gets updated with current minibatch
|
||||
double epsilon,
|
||||
ElemType* xMean, ElemType* xInvStdDev) // (out) this minibatch's mean and inverse stddev
|
||||
StatType* xMean, StatType* xInvStdDev) // (out) this minibatch's mean and inverse stddev
|
||||
{
|
||||
typedef typename TypeSelector<ElemType>::comp_t comp_t;
|
||||
static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
|
||||
static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32).");
|
||||
assert((vectorSize % U) == 0);
|
||||
|
@ -219,9 +248,9 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize,
|
|||
|
||||
// first estimate mean over all data for this thread
|
||||
int n = 0;
|
||||
ElemType mean[U]; // this thread's part of the mean vector (stored as a normalized mean also during accumulation)
|
||||
ElemType m2[U]; // likewise for variance
|
||||
ElemType im2[U]; // and inverse stddev
|
||||
comp_t mean[U]; // this thread's part of the mean vector (stored as a normalized mean also during accumulation)
|
||||
comp_t m2[U]; // likewise for variance
|
||||
comp_t im2[U]; // and inverse stddev
|
||||
#pragma unroll
|
||||
for (int k = 0; k < U; k++)
|
||||
{
|
||||
|
@ -235,13 +264,13 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize,
|
|||
for (; icolSrc < batchSize; icolSrc += BlockDimY)
|
||||
{
|
||||
n++;
|
||||
ElemType curVal[U];
|
||||
comp_t curVal[U];
|
||||
LoadValues<U>(psrc, curVal);
|
||||
// No need for separate unrolling, SASS looks good.
|
||||
#pragma unroll
|
||||
for (int k = 0; k < U; k++)
|
||||
{
|
||||
ElemType d = curVal[k] - mean[k];
|
||||
comp_t d = curVal[k] - mean[k];
|
||||
// REVIEW alexeyk: we enabled fast CUDA math in CNTK so division below will be approximate, is this a problem?
|
||||
// Using precise math slows down the code by about 40%.
|
||||
mean[k] += d / n; // mean_n = [mean_{n-1} * (n-1) + curVal] / n = mean_{n-1} *n/n - mean_{n-1} / n + curVal / n
|
||||
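The update above is Welford's online mean/variance recurrence, and the warp and shared-memory reductions later in the kernel merge partial (n, mean, M2) triples with the Chan et al. combining formula. A plain host-side sketch of both steps (illustrative, not the CUDA code itself):

```cuda
#include <cstdio>
#include <vector>

struct Moments { long long n = 0; double mean = 0, m2 = 0; };

// One Welford update: mean_n = mean_{n-1} + (x - mean_{n-1}) / n.
void Add(Moments& s, double x)
{
    s.n++;
    double d = x - s.mean;
    s.mean += d / s.n;
    s.m2 += d * (x - s.mean);      // running sum of squared deviations
}

// Merge two partial accumulators; same form as the warp-reduction step above.
Moments Merge(Moments a, const Moments& b)
{
    if (b.n == 0) return a;
    long long nsum = a.n + b.n;
    double d = b.mean - a.mean;
    double dScaled = d * b.n / nsum;
    a.mean += dScaled;
    a.m2 += b.m2 + d * a.n * dScaled;
    a.n = nsum;
    return a;
}

int main()
{
    std::vector<double> data = {1, 2, 3, 4, 5, 6, 7, 8};
    Moments left, right, all;
    for (size_t i = 0; i < data.size(); ++i)
    {
        Add(i < 4 ? left : right, data[i]);
        Add(all, data[i]);
    }
    Moments merged = Merge(left, right);
    std::printf("merged mean %.3f var %.3f, direct mean %.3f var %.3f\n",
                merged.mean, merged.m2 / (merged.n - 1),
                all.mean, all.m2 / (all.n - 1));
    return 0;
}
```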
|
@ -253,6 +282,12 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize,
|
|||
// now reduce minibatch mean/variance across threads
|
||||
const int tid = threadIdx.y * BlockDimX + threadIdx.x;
|
||||
const int laneId = tid & 0x1f;
|
||||
|
||||
unsigned int mask;
|
||||
#if CUDA_VERSION >= 9000
|
||||
mask = __ballot_sync(0xffffffff, n);
|
||||
#endif
|
||||
|
||||
// First, reduce within warp using shuffle.
|
||||
if (n > 0)
|
||||
{
|
||||
|
@ -260,16 +295,16 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize,
|
|||
for (int i = 1; i < CUB_PTX_WARP_THREADS / BlockDimX; i *= 2)
|
||||
{
|
||||
int srcLane = laneId + BlockDimX * i;
|
||||
int n2 = Shuffle(n, srcLane);
|
||||
int n2 = Shuffle(n, srcLane, mask);
|
||||
int nsum = n + n2;
|
||||
ElemType d[U];
|
||||
comp_t d[U];
|
||||
#pragma unroll
|
||||
for (int k = 0; k < U; k++)
|
||||
{
|
||||
d[k] = Shuffle(mean[k], srcLane) - mean[k];
|
||||
ElemType dScaled = d[k] * n2 / nsum;
|
||||
d[k] = Shuffle(mean[k], srcLane, mask) - mean[k];
|
||||
comp_t dScaled = d[k] * n2 / nsum;
|
||||
mean[k] += dScaled;
|
||||
m2[k] += Shuffle(m2[k], srcLane) + d[k] * n * dScaled;
|
||||
m2[k] += Shuffle(m2[k], srcLane, mask) + d[k] * n * dScaled;
|
||||
}
|
||||
n = nsum;
|
||||
}
|
||||
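The mask computed with __ballot_sync is what makes these shuffles legal on CUDA 9, where warp intrinsics must name the participating lanes explicitly (cub::ShuffleIndex wraps the *_sync variants when CUDA_VERSION >= 9000). A self-contained toy kernel showing the same mask-then-shuffle pattern; it simply sums 32 values within one warp:

```cuda
#include <cstdio>
#include <cuda_runtime.h>

__global__ void kWarpSum(const float* in, float* out)
{
    int lane = threadIdx.x & 31;
    float v = in[lane];
    unsigned mask = __ballot_sync(0xffffffff, 1);  // every lane votes "active"
    for (int offset = 16; offset > 0; offset >>= 1)
        v += __shfl_down_sync(mask, v, offset);    // synchronized shuffle, CUDA 9+
    if (lane == 0)
        *out = v;
}

int main()
{
    float h_in[32], h_out = 0;
    for (int i = 0; i < 32; i++)
        h_in[i] = 1.0f;                            // expected sum: 32
    float *d_in, *d_out;
    cudaMalloc(&d_in, sizeof(h_in));
    cudaMalloc(&d_out, sizeof(float));
    cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);
    kWarpSum<<<1, 32>>>(d_in, d_out);
    cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
    std::printf("warp sum = %f\n", h_out);
    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}
```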
|
@ -278,8 +313,8 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize,
|
|||
// Storage for each warp in a thread block. First warp ("accumulator") holds
|
||||
// final results so it does not need shared memory.
|
||||
const int cwarp = BlockDimX * BlockDimY / CUB_PTX_WARP_THREADS;
|
||||
__shared__ ElemType meanRes[BlockDimX * U][cwarp - 1];
|
||||
__shared__ ElemType m2Res[BlockDimX * U][cwarp - 1];
|
||||
__shared__ comp_t meanRes[BlockDimX * U][cwarp - 1];
|
||||
__shared__ comp_t m2Res[BlockDimX * U][cwarp - 1];
|
||||
__shared__ int nRes[cwarp - 1];
|
||||
|
||||
// Each warp (except warp0) will write accumulated results to shared memory.
|
||||
|
@ -309,12 +344,12 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize,
|
|||
{
|
||||
int n2 = nRes[i];
|
||||
int nsum = n + n2;
|
||||
ElemType d[U];
|
||||
comp_t d[U];
|
||||
#pragma unroll
|
||||
for (int k = 0; k < U; k++)
|
||||
{
|
||||
d[k] = meanRes[threadIdx.x * U + k][i] - mean[k];
|
||||
ElemType dScaled = d[k] * n2 / nsum;
|
||||
comp_t dScaled = d[k] * n2 / nsum;
|
||||
mean[k] += dScaled;
|
||||
m2[k] += m2Res[threadIdx.x * U + k][i] + d[k] * n * dScaled;
|
||||
}
|
||||
|
@ -322,8 +357,8 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize,
|
|||
}
|
||||
|
||||
size_t idxDstBase = (blockIdx.x * BlockDimX + threadIdx.x) * U;
|
||||
ElemType run[U];
|
||||
ElemType x[U];
|
||||
comp_t run[U];
|
||||
comp_t x[U];
|
||||
|
||||
// Compute running mean and batch mean.
|
||||
LoadValues<U>(runMean + idxDstBase, run);
|
||||
|
@ -344,14 +379,14 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize,
|
|||
for (int k = 0; k < U; k++)
|
||||
{
|
||||
// Compute batch inverse standard deviation and variance
|
||||
ElemType runVariance = batchSize == 1 ? 0 : m2[k] / (batchSize - 1);
|
||||
comp_t runVariance = batchSize == 1 ? 0 : m2[k] / (batchSize - 1);
|
||||
// Average
|
||||
run[k] = expAvgFactor * runVariance + (1.0 - expAvgFactor) * run[k];
|
||||
// Blend
|
||||
im2[k] = Operations::RSqrt(static_cast<ElemType>(m2[k] / batchSize + epsilon));
|
||||
im2[k] = Operations::RSqrt(static_cast<comp_t>(m2[k] / batchSize + epsilon));
|
||||
if (blendFactor != 0)
|
||||
{
|
||||
ElemType runInvStdDev = Operations::RSqrt(static_cast<ElemType>(run[k] + epsilon));
|
||||
comp_t runInvStdDev = Operations::RSqrt(static_cast<comp_t>(run[k] + epsilon));
|
||||
im2[k] = blendFactor * runInvStdDev + (1.0 - blendFactor) * im2[k];
|
||||
}
|
||||
}
|
||||
|
@ -363,7 +398,7 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize,
|
|||
else if (threadIdx.y == 0)
|
||||
{
|
||||
size_t idxDstBase = (blockIdx.x * BlockDimX + threadIdx.x) * U;
|
||||
ElemType run[U];
|
||||
comp_t run[U];
|
||||
|
||||
// Copy mean
|
||||
LoadValues<U>(runMean + idxDstBase, run);
|
||||
|
@ -373,7 +408,7 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize,
|
|||
LoadValues<U>(runVariance + idxDstBase, run);
|
||||
#pragma unroll
|
||||
for (int k = 0; k < U; k++)
|
||||
run[k] = Operations::RSqrt(static_cast<ElemType>(run[k] + epsilon));
|
||||
run[k] = Operations::RSqrt(static_cast<comp_t>(run[k] + epsilon));
|
||||
StoreValues<U>(run, xInvStdDev + idxDstBase);
|
||||
}
|
||||
}
|
||||
|
@ -381,12 +416,13 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize,
|
|||
// This kernel is very similar to kComputeBatchMeanAndInvStdDev except it reduces not just over N (minibatch)
|
||||
// but also W and H dimensions.
|
||||
// REVIEW alexeyk: is it possible to combine this and previous kernel into a single kernel without hurting performance/readability much?
|
||||
template <int BlockDimX, int BlockDimY, int U, typename ElemType>
|
||||
template <int BlockDimX, int BlockDimY, int U, typename ElemType, typename StatType>
|
||||
__global__ void kComputeSpatialBatchMeanAndInvStdDev(int vectorSize, int spatialSize, int batchSize, const ElemType* x,
|
||||
double expAvgFactor, double blendFactor,
|
||||
ElemType* runMean, ElemType* runVariance,
|
||||
double epsilon, ElemType* xMean, ElemType* xInvStdDev)
|
||||
StatType* runMean, StatType* runVariance,
|
||||
double epsilon, StatType* xMean, StatType* xInvStdDev)
|
||||
{
|
||||
typedef typename TypeSelector<ElemType>::comp_t comp_t;
|
||||
static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
|
||||
static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32).");
|
||||
assert(blockDim.x == BlockDimX);
|
||||
|
@ -410,8 +446,8 @@ __global__ void kComputeSpatialBatchMeanAndInvStdDev(int vectorSize, int spatial
|
|||
int irowSrcLim = (blockIdx.x + 1) * spatialSize;
|
||||
|
||||
int n = 0;
|
||||
ElemType mean[U];
|
||||
ElemType m2[U];
|
||||
comp_t mean[U];
|
||||
comp_t m2[U];
|
||||
#pragma unroll
|
||||
for (int k = 0; k < U; k++)
|
||||
{
|
||||
|
@ -429,13 +465,13 @@ __global__ void kComputeSpatialBatchMeanAndInvStdDev(int vectorSize, int spatial
|
|||
for (int irowSrc = irowSrcBase; irowSrc < irowSrcLim; irowSrc += BlockDimX * U, psrc += BlockDimX * U)
|
||||
{
|
||||
n++;
|
||||
ElemType curVal[U];
|
||||
comp_t curVal[U];
|
||||
LoadValues<U>(psrc, curVal);
|
||||
// No need for separate unrolling, SASS looks good.
|
||||
#pragma unroll
|
||||
for (int k = 0; k < U; k++)
|
||||
{
|
||||
ElemType d = curVal[k] - mean[k];
|
||||
comp_t d = curVal[k] - mean[k];
|
||||
// REVIEW alexeyk: we enabled fast CUDA math in CNTK so division below will be approximate, is this a problem?
|
||||
// Using precise math slows down the code by about 40%.
|
||||
mean[k] += d / n;
|
||||
|
@ -448,22 +484,26 @@ __global__ void kComputeSpatialBatchMeanAndInvStdDev(int vectorSize, int spatial
|
|||
const int tid = threadIdx.y * BlockDimX + threadIdx.x;
|
||||
const int laneId = tid & 0x1f;
|
||||
// First, reduce within warp using shuffle.
|
||||
unsigned int mask;
|
||||
#if CUDA_VERSION >= 9000
|
||||
mask = __ballot_sync(0xffffffff, n);
|
||||
#endif
|
||||
if (n > 0)
|
||||
{
|
||||
#pragma unroll
|
||||
for (int i = 1; i < CUB_PTX_WARP_THREADS; i *= 2)
|
||||
{
|
||||
int srcLane = laneId + i;
|
||||
int n2 = Shuffle(n, srcLane);
|
||||
int n2 = Shuffle(n, srcLane, mask);
|
||||
int nsum = n + n2;
|
||||
ElemType d[U];
|
||||
comp_t d[U];
|
||||
#pragma unroll
|
||||
for (int k = 0; k < U; k++)
|
||||
{
|
||||
d[k] = Shuffle(mean[k], srcLane) - mean[k];
|
||||
ElemType dScaled = d[k] * n2 / nsum;
|
||||
d[k] = Shuffle(mean[k], srcLane, mask) - mean[k];
|
||||
comp_t dScaled = d[k] * n2 / nsum;
|
||||
mean[k] += dScaled;
|
||||
m2[k] += Shuffle(m2[k], srcLane) + d[k] * n * dScaled;
|
||||
m2[k] += Shuffle(m2[k], srcLane, mask) + d[k] * n * dScaled;
|
||||
}
|
||||
n = nsum;
|
||||
}
|
||||
|
@ -472,8 +512,8 @@ __global__ void kComputeSpatialBatchMeanAndInvStdDev(int vectorSize, int spatial
|
|||
// Storage for each warp in a thread block. First warp ("accumulator") holds
|
||||
// final results so it does not need shared memory.
|
||||
const int cwarp = BlockDimX * BlockDimY / CUB_PTX_WARP_THREADS;
|
||||
__shared__ ElemType meanRes[U][cwarp - 1];
|
||||
__shared__ ElemType m2Res[U][cwarp - 1];
|
||||
__shared__ comp_t meanRes[U][cwarp - 1];
|
||||
__shared__ comp_t m2Res[U][cwarp - 1];
|
||||
__shared__ int nRes[cwarp - 1];
|
||||
|
||||
// Each warp (except warp0) will write accumulated results to shared memory.
|
||||
|
@ -499,12 +539,12 @@ __global__ void kComputeSpatialBatchMeanAndInvStdDev(int vectorSize, int spatial
|
|||
{
|
||||
int n2 = nRes[i];
|
||||
int nsum = n + n2;
|
||||
ElemType d[U];
|
||||
comp_t d[U];
|
||||
#pragma unroll
|
||||
for (int k = 0; k < U; k++)
|
||||
{
|
||||
d[k] = meanRes[k][i] - mean[k];
|
||||
ElemType dScaled = d[k] * n2 / nsum;
|
||||
comp_t dScaled = d[k] * n2 / nsum;
|
||||
mean[k] += dScaled;
|
||||
m2[k] += m2Res[k][i] + d[k] * n * dScaled;
|
||||
}
|
||||
|
@ -515,8 +555,8 @@ __global__ void kComputeSpatialBatchMeanAndInvStdDev(int vectorSize, int spatial
|
|||
#pragma unroll
|
||||
for (int k = 1; k < U; k++)
|
||||
{
|
||||
ElemType d = mean[k] - mean[0];
|
||||
ElemType dScaled = d * n / (n + k * n);
|
||||
comp_t d = mean[k] - mean[0];
|
||||
comp_t dScaled = d * n / (n + k * n);
|
||||
mean[0] += dScaled;
|
||||
m2[0] += m2[k] + d * k * n * dScaled;
|
||||
}
|
||||
|
@ -525,12 +565,12 @@ __global__ void kComputeSpatialBatchMeanAndInvStdDev(int vectorSize, int spatial
|
|||
runMean[blockIdx.x] = expAvgFactor * mean[0] + (1.0 - expAvgFactor) * runMean[blockIdx.x];
|
||||
xMean[blockIdx.x] = blendFactor * runMean[blockIdx.x] + (1.0 - blendFactor) * mean[0];
|
||||
|
||||
ElemType runV = batchSize * spatialSize == 1 ? 0 : m2[0] / (batchSize * spatialSize - 1);
|
||||
comp_t runV = batchSize * spatialSize == 1 ? 0 : m2[0] / (batchSize * spatialSize - 1);
|
||||
runVariance[blockIdx.x] = expAvgFactor * runV + (1.0 - expAvgFactor) * runVariance[blockIdx.x];
|
||||
xInvStdDev[blockIdx.x] = Operations::RSqrt(static_cast<ElemType>(m2[0] / (batchSize * spatialSize) + epsilon));
|
||||
xInvStdDev[blockIdx.x] = Operations::RSqrt(static_cast<comp_t>(m2[0] / (batchSize * spatialSize) + (comp_t)epsilon));
|
||||
if (blendFactor != 0)
|
||||
{
|
||||
ElemType runInvStdDev = Operations::RSqrt(static_cast<ElemType>(runVariance[blockIdx.x] + epsilon));
|
||||
comp_t runInvStdDev = Operations::RSqrt(static_cast<comp_t>((comp_t)runVariance[blockIdx.x] + (comp_t)epsilon));
|
||||
xInvStdDev[blockIdx.x] = blendFactor * runInvStdDev + (1.0 - blendFactor) * xInvStdDev[blockIdx.x];
|
||||
}
|
||||
}
|
||||
|
@ -538,7 +578,7 @@ __global__ void kComputeSpatialBatchMeanAndInvStdDev(int vectorSize, int spatial
|
|||
else if (threadIdx.y == 0 && threadIdx.x == 0)
|
||||
{
|
||||
xMean[blockIdx.x] = runMean[blockIdx.x];
|
||||
xInvStdDev[blockIdx.x] = Operations::RSqrt(static_cast<ElemType>(runVariance[blockIdx.x] + epsilon));
|
||||
xInvStdDev[blockIdx.x] = Operations::RSqrt(static_cast<comp_t>((comp_t)runVariance[blockIdx.x] + (comp_t)epsilon));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -547,14 +587,14 @@ __global__ void kComputeSpatialBatchMeanAndInvStdDev(int vectorSize, int spatial
|
|||
template <int U>
|
||||
struct ComputeBatchMeanAndInvStdDev
|
||||
{
|
||||
template <typename ElemType>
|
||||
template <typename ElemType, typename StatType>
|
||||
static void Call(size_t vectorSize, size_t batchSize,
|
||||
const ElemType* x, // (in) input data
|
||||
double expAvgFactor,
|
||||
double blendFactor,
|
||||
ElemType* runMean, ElemType* runVariance, // (in/out) running mean/variance, gets updated with current minibatch
|
||||
StatType* runMean, StatType* runVariance, // (in/out) running mean/variance, gets updated with current minibatch
|
||||
double epsilon,
|
||||
ElemType* xMean, ElemType* xInvStdDev, // (out) actual interpolated mean/stddev that are used to normalize. Returned since needed in backprop.
|
||||
StatType* xMean, StatType* xInvStdDev, // (out) actual interpolated mean/stddev that are used to normalize. Returned since needed in backprop.
|
||||
cudaStream_t stream)
|
||||
{
|
||||
assert((vectorSize % U) == 0);
|
||||
|
@ -565,7 +605,7 @@ struct ComputeBatchMeanAndInvStdDev
|
|||
auto bdim = dim3(BlockDimX, BlockDimY);
|
||||
// Create grid with only one block in y(batch)-dimension as kernel uses striding.
|
||||
auto gdim = dim3(static_cast<unsigned int>(RoundUpToMultiple(vectorSize, BlockDimX * U)));
|
||||
kComputeBatchMeanAndInvStdDev<BlockDimX, BlockDimY, U><<<gdim, bdim, 0, stream>>>(
|
||||
kComputeBatchMeanAndInvStdDev<BlockDimX, BlockDimY, U, ElemType, StatType><<<gdim, bdim, 0, stream>>>(
|
||||
static_cast<int>(vectorSize), static_cast<int>(batchSize),
|
||||
x, expAvgFactor, blendFactor, runMean, runVariance, epsilon, xMean, xInvStdDev);
|
||||
}
|
||||
|
@ -574,10 +614,10 @@ struct ComputeBatchMeanAndInvStdDev
|
|||
template <int U>
|
||||
struct ComputeSpatialBatchMeanAndInvStdDev
|
||||
{
|
||||
template <typename ElemType>
|
||||
template <typename ElemType, typename StatType>
|
||||
static void Call(size_t vectorSize, size_t spatialSize, size_t batchSize, const ElemType* x,
|
||||
double expAvgFactor, double blendFactor, ElemType* runMean, ElemType* runVariance,
|
||||
double epsilon, ElemType* xMean, ElemType* xInvStdDev, cudaStream_t stream)
|
||||
double expAvgFactor, double blendFactor, StatType* runMean, StatType* runVariance,
|
||||
double epsilon, StatType* xMean, StatType* xInvStdDev, cudaStream_t stream)
|
||||
{
|
||||
assert((vectorSize % spatialSize) == 0);
|
||||
assert((spatialSize % U) == 0);
|
||||
|
@ -589,7 +629,7 @@ struct ComputeSpatialBatchMeanAndInvStdDev
|
|||
// Create grid with only one block in y(batch)-dimension as kernel uses striding.
|
||||
// Each thread block processes a single whole feature map independently (i.e. reduces over W, H and N dimensions).
|
||||
auto gdim = dim3(static_cast<unsigned int>(vectorSize / spatialSize));
|
||||
kComputeSpatialBatchMeanAndInvStdDev<BlockDimX, BlockDimY, U><<<gdim, bdim, 0, stream>>>(
|
||||
kComputeSpatialBatchMeanAndInvStdDev<BlockDimX, BlockDimY, U, ElemType, StatType><<<gdim, bdim, 0, stream>>>(
|
||||
static_cast<int>(vectorSize), static_cast<int>(spatialSize), static_cast<int>(batchSize),
|
||||
x, expAvgFactor, blendFactor, runMean, runVariance, epsilon, xMean, xInvStdDev);
|
||||
}
|
||||
|
@ -603,14 +643,15 @@ struct ComputeSpatialBatchMeanAndInvStdDev
|
|||
// or Cx1x1 in convolutional case.
|
||||
//--------------------------------------------------------------------
|
||||
|
||||
template <int BlockDimX, int BlockDimY, bool Spatial, bool NormalizeRunningStats, int U, typename ElemType>
|
||||
template <int BlockDimX, int BlockDimY, bool Spatial, bool NormalizeRunningStats, int U, typename ElemType, typename StatType>
|
||||
__global__ void kNormalizeBatchTraining(int vectorSize, int spatialSize, int batchSize,
|
||||
double epsilon,
|
||||
const ElemType* x, ElemType* y,
|
||||
const ElemType* bnScale, const ElemType* bnBias,
|
||||
const ElemType* runningMean, const ElemType* runningVariance,
|
||||
const ElemType* batchMean, ElemType* batchInvStdDev)
|
||||
const StatType* bnScale, const StatType* bnBias,
|
||||
const StatType* runningMean, const StatType* runningVariance,
|
||||
const StatType* batchMean, StatType* batchInvStdDev)
|
||||
{
|
||||
typedef typename TypeSelector<ElemType>::comp_t comp_t;
|
||||
static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
|
||||
static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32).");
|
||||
assert(blockDim.x == BlockDimX);
|
||||
|
@ -627,10 +668,10 @@ __global__ void kNormalizeBatchTraining(int vectorSize, int spatialSize, int bat
|
|||
return;
|
||||
assert(irowBase + U <= vectorSize);
|
||||
|
||||
__shared__ ElemType meanS[BlockDimX * U];
|
||||
__shared__ ElemType invStdDevS[BlockDimX * U];
|
||||
__shared__ ElemType scaleS[BlockDimX * U];
|
||||
__shared__ ElemType biasS[BlockDimX * U];
|
||||
__shared__ comp_t meanS[BlockDimX * U];
|
||||
__shared__ comp_t invStdDevS[BlockDimX * U];
|
||||
__shared__ comp_t scaleS[BlockDimX * U];
|
||||
__shared__ comp_t biasS[BlockDimX * U];
|
||||
int offs = threadIdx.x * U;
|
||||
|
||||
// REVIEW alexeyk: optimize smem usage, reduce transaction count (is it worth it?).
|
||||
|
@ -644,8 +685,8 @@ __global__ void kNormalizeBatchTraining(int vectorSize, int spatialSize, int bat
|
|||
int imap = (irowBase + k) / spatialSize;
|
||||
meanS[offs + k] = NormalizeRunningStats ? runningMean[imap] : batchMean[imap];
|
||||
invStdDevS[offs + k] = NormalizeRunningStats
|
||||
? Operations::RSqrt(static_cast<ElemType>(runningVariance[imap] + epsilon))
|
||||
: batchInvStdDev[imap];
|
||||
? Operations::RSqrt(static_cast<comp_t>((comp_t)runningVariance[imap] + (comp_t)epsilon))
|
||||
: (comp_t)batchInvStdDev[imap];
|
||||
scaleS[offs + k] = bnScale[imap];
|
||||
biasS[offs + k] = bnBias[imap];
|
||||
}
|
||||
|
@ -657,18 +698,18 @@ __global__ void kNormalizeBatchTraining(int vectorSize, int spatialSize, int bat
|
|||
for (int k = 0; k < U; k++)
|
||||
{
|
||||
invStdDevS[offs + k] = NormalizeRunningStats
|
||||
? Operations::RSqrt(static_cast<ElemType>(runningVariance[irowBase + k] + epsilon))
|
||||
: batchInvStdDev[irowBase + k];
|
||||
? Operations::RSqrt(static_cast<comp_t>((comp_t)runningVariance[irowBase + k] + (comp_t)epsilon))
|
||||
: (comp_t)batchInvStdDev[irowBase + k];
|
||||
}
|
||||
LoadValues<U>(bnScale + irowBase, scaleS + offs);
|
||||
LoadValues<U>(bnBias + irowBase, biasS + offs);
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
ElemType mean[U];
|
||||
ElemType invStdDev[U];
|
||||
ElemType scale[U];
|
||||
ElemType bias[U];
|
||||
comp_t mean[U];
|
||||
comp_t invStdDev[U];
|
||||
comp_t scale[U];
|
||||
comp_t bias[U];
|
||||
LoadValues<U>(meanS + offs, mean);
|
||||
LoadValues<U>(invStdDevS + offs, invStdDev);
|
||||
LoadValues<U>(scaleS + offs, scale);
|
||||
|
@ -681,7 +722,7 @@ __global__ void kNormalizeBatchTraining(int vectorSize, int spatialSize, int bat
|
|||
size_t stride = static_cast<size_t>(gridDim.y * BlockDimY) * vectorSize;
|
||||
for (; icol < batchSize; icol += gridDim.y * BlockDimY, psrc += stride, pdst += stride)
|
||||
{
|
||||
ElemType val[U];
|
||||
comp_t val[U];
|
||||
LoadValues<U>(psrc, val);
|
||||
#pragma unroll
|
||||
for (int k = 0; k < U; k++)
|
||||
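The loop body elided by the diff applies the usual batch-normalization transform per element, y = bnScale * (x - mean) * invStdDev + bnBias, with the mean and invStdDev taken from either the running or the batch statistics as selected above. A tiny numeric illustration with made-up values:

```cuda
#include <cmath>
#include <cstdio>

int main()
{
    float x = 2.0f, mean = 1.5f, variance = 4.0f, epsilon = 1e-5f;
    float scale = 0.8f, bias = 0.1f;

    float invStdDev = 1.0f / std::sqrt(variance + epsilon);
    float y = scale * (x - mean) * invStdDev + bias;
    std::printf("normalized value: %f\n", y);  // roughly 0.8 * 0.5 / 2 + 0.1 = 0.3
    return 0;
}
```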
|
@ -695,13 +736,13 @@ __global__ void kNormalizeBatchTraining(int vectorSize, int spatialSize, int bat
|
|||
template <int U>
|
||||
struct NormalizeBatchTraining
|
||||
{
|
||||
template <typename ElemType>
|
||||
template <typename ElemType, typename StatType>
|
||||
static void Call(size_t vectorSize, size_t spatialSize, size_t batchSize, bool spatial,
|
||||
bool normalizeRunningStats, double epsilon,
|
||||
const ElemType* x, ElemType* y, // (in, out) data to normalize -> normalized data
|
||||
const ElemType* bnScale, const ElemType* bnBias, // (in) scale/bias to denormalize with
|
||||
const ElemType* runningMean, const ElemType* runningVariance, // (in) running mean/variance
|
||||
const ElemType* batchMean, ElemType* batchInvStdDev, // (in) batch mean/stddev to normalize with
|
||||
const StatType* bnScale, const StatType* bnBias, // (in) scale/bias to denormalize with
|
||||
const StatType* runningMean, const StatType* runningVariance, // (in) running mean/variance
|
||||
const StatType* batchMean, StatType* batchInvStdDev, // (in) batch mean/stddev to normalize with
|
||||
cudaStream_t stream)
|
||||
{
|
||||
assert((vectorSize % U) == 0);
|
||||
|
@ -715,14 +756,14 @@ struct NormalizeBatchTraining
|
|||
if (spatial)
|
||||
{
|
||||
if (normalizeRunningStats)
|
||||
kNormalizeBatchTraining<BlockDimX, BlockDimY, true, true, U><<<gdim, bdim, 0, stream>>>(
|
||||
kNormalizeBatchTraining<BlockDimX, BlockDimY, true, true, U, ElemType, StatType><<<gdim, bdim, 0, stream>>>(
|
||||
(int)vectorSize, (int)spatialSize, (int)batchSize,
|
||||
epsilon,
|
||||
x, y, bnScale, bnBias,
|
||||
runningMean, runningVariance,
|
||||
batchMean, batchInvStdDev);
|
||||
else
|
||||
kNormalizeBatchTraining<BlockDimX, BlockDimY, true, false, U><<<gdim, bdim, 0, stream>>>(
|
||||
kNormalizeBatchTraining<BlockDimX, BlockDimY, true, false, U, ElemType, StatType><<<gdim, bdim, 0, stream>>>(
|
||||
(int)vectorSize, (int)spatialSize, (int)batchSize,
|
||||
epsilon,
|
||||
x, y, bnScale, bnBias,
|
||||
|
@ -732,14 +773,14 @@ struct NormalizeBatchTraining
|
|||
else
|
||||
{
|
||||
if (normalizeRunningStats)
|
||||
kNormalizeBatchTraining<BlockDimX, BlockDimY, false, true, U><<<gdim, bdim, 0, stream>>>(
|
||||
kNormalizeBatchTraining<BlockDimX, BlockDimY, false, true, U, ElemType, StatType><<<gdim, bdim, 0, stream>>>(
|
||||
(int)vectorSize, (int)spatialSize, (int)batchSize,
|
||||
epsilon,
|
||||
x, y, bnScale, bnBias,
|
||||
runningMean, runningVariance,
|
||||
batchMean, batchInvStdDev);
|
||||
else
|
||||
kNormalizeBatchTraining<BlockDimX, BlockDimY, false, false, U><<<gdim, bdim, 0, stream>>>(
|
||||
kNormalizeBatchTraining<BlockDimX, BlockDimY, false, false, U, ElemType, StatType><<<gdim, bdim, 0, stream>>>(
|
||||
(int)vectorSize, (int)spatialSize, (int)batchSize,
|
||||
epsilon,
|
||||
x, y, bnScale, bnBias,
|
||||
|
@ -757,10 +798,11 @@ struct NormalizeBatchTraining
|
|||
// All tensor dimensions and assumptions are the same as in case of forward propagation.
|
||||
//--------------------------------------------------------------------
|
||||
|
||||
template <int BlockDimX, int BlockDimY, int U, typename ElemType>
|
||||
__global__ void kComputeScaleAndBiasGradients(int vectorSize, int batchSize, const ElemType* x, const ElemType* dy, ElemType* dScale, ElemType* dBias,
|
||||
const ElemType* savedMean, const ElemType* savedInvStdDev)
|
||||
template <int BlockDimX, int BlockDimY, int U, typename ElemType, typename StatType>
|
||||
__global__ void kComputeScaleAndBiasGradients(int vectorSize, int batchSize, const ElemType* x, const ElemType* dy, StatType* dScale, StatType* dBias,
|
||||
const StatType* savedMean, const StatType* savedInvStdDev)
|
||||
{
|
||||
typedef typename TypeSelector<ElemType>::comp_t comp_t;
|
||||
static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
|
||||
static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32).");
|
||||
static_assert(((BlockDimY - 1) & BlockDimY) == 0, "BlockDimY must be a power of 2.");
|
||||
|
@ -777,10 +819,10 @@ __global__ void kComputeScaleAndBiasGradients(int vectorSize, int batchSize, con
|
|||
return;
|
||||
assert(irowSrcBase + U <= vectorSize);
|
||||
|
||||
ElemType mean[U];
|
||||
ElemType invStdDev[U];
|
||||
__shared__ ElemType meanS[BlockDimX * U];
|
||||
__shared__ ElemType invStdDevS[BlockDimX * U];
|
||||
comp_t mean[U];
|
||||
comp_t invStdDev[U];
|
||||
__shared__ comp_t meanS[BlockDimX * U];
|
||||
__shared__ comp_t invStdDevS[BlockDimX * U];
|
||||
// Read mean and inv std dev.
|
||||
if (threadIdx.y == 0)
|
||||
{
|
||||
|
@ -796,8 +838,8 @@ __global__ void kComputeScaleAndBiasGradients(int vectorSize, int batchSize, con
|
|||
LoadValues<U>(&invStdDevS[threadIdx.x * U], invStdDev);
|
||||
}
|
||||
|
||||
ElemType ds[U];
|
||||
ElemType db[U];
|
||||
comp_t ds[U];
|
||||
comp_t db[U];
|
||||
#pragma unroll
|
||||
for (int k = 0; k < U; k++)
|
||||
{
|
||||
|
@ -813,21 +855,21 @@ __global__ void kComputeScaleAndBiasGradients(int vectorSize, int batchSize, con
|
|||
// Stride over all vectors in the batch.
|
||||
for (; icolSrc < batchSize; icolSrc += BlockDimY, px += stride, pdy += stride)
|
||||
{
|
||||
ElemType curX[U];
|
||||
ElemType curdY[U];
|
||||
comp_t curX[U];
|
||||
comp_t curdY[U];
|
||||
LoadValues<U>(px, curX);
|
||||
LoadValues<U>(pdy, curdY);
|
||||
#pragma unroll
|
||||
for (int k = 0; k < U; k++)
|
||||
{
|
||||
ds[k] += pdy[k] * (curX[k] - mean[k]) * invStdDev[k];
|
||||
db[k] += pdy[k];
|
||||
ds[k] += (comp_t)pdy[k] * (curX[k] - mean[k]) * invStdDev[k];
|
||||
db[k] += (comp_t)pdy[k];
|
||||
}
|
||||
}
|
||||
|
||||
// Final reduction.
|
||||
__shared__ ElemType dsS[BlockDimY][BlockDimX * U];
|
||||
__shared__ ElemType dbS[BlockDimY][BlockDimX * U];
|
||||
__shared__ comp_t dsS[BlockDimY][BlockDimX * U];
|
||||
__shared__ comp_t dbS[BlockDimY][BlockDimX * U];
|
||||
StoreValues<U>(ds, &dsS[threadIdx.y][threadIdx.x * U]);
|
||||
StoreValues<U>(db, &dbS[threadIdx.y][threadIdx.x * U]);
|
||||
__syncthreads();
|
||||
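Per channel, the accumulation above computes dScale = sum_i dy_i * xHat_i and dBias = sum_i dy_i, where xHat_i = (x_i - mean) * invStdDev. A toy single-feature check with made-up numbers:

```cuda
#include <cstdio>
#include <vector>

int main()
{
    std::vector<float> x  = {1.0f, 2.0f, 3.0f, 4.0f};
    std::vector<float> dy = {0.1f, -0.2f, 0.3f, 0.4f};
    float mean = 2.5f, invStdDev = 0.5f;

    float dScale = 0, dBias = 0;
    for (size_t i = 0; i < x.size(); ++i)
    {
        float xHat = (x[i] - mean) * invStdDev;  // normalized input
        dScale += dy[i] * xHat;
        dBias  += dy[i];
    }
    std::printf("dScale = %f, dBias = %f\n", dScale, dBias);  // 0.35 and 0.6
    return 0;
}
```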
|
@ -861,10 +903,11 @@ __global__ void kComputeScaleAndBiasGradients(int vectorSize, int batchSize, con
|
|||
}
|
||||
}
|
||||
|
||||
template <int BlockDimX, int BlockDimY, int U, typename ElemType>
|
||||
template <int BlockDimX, int BlockDimY, int U, typename ElemType, typename StatType>
|
||||
__global__ void kComputeSpatialScaleAndBiasGradients(int vectorSize, int spatialSize, int batchSize, const ElemType* x, const ElemType* dy,
|
||||
ElemType* dScale, ElemType* dBias, const ElemType* savedMean, const ElemType* savedInvStdDev)
|
||||
StatType* dScale, StatType* dBias, const StatType* savedMean, const StatType* savedInvStdDev)
|
||||
{
|
||||
typedef typename TypeSelector<ElemType>::comp_t comp_t;
|
||||
static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
|
||||
static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32).");
|
||||
assert(blockDim.x == BlockDimX);
|
||||
|
@ -881,10 +924,10 @@ __global__ void kComputeSpatialScaleAndBiasGradients(int vectorSize, int spatial
|
|||
assert(irowBase + U <= vectorSize);
|
||||
int irowLim = (blockIdx.x + 1) * spatialSize;
|
||||
|
||||
ElemType mean;
|
||||
ElemType invStdDev;
|
||||
__shared__ ElemType meanS;
|
||||
__shared__ ElemType invStdDevS;
|
||||
comp_t mean;
|
||||
comp_t invStdDev;
|
||||
__shared__ comp_t meanS;
|
||||
__shared__ comp_t invStdDevS;
|
||||
const int tid = threadIdx.y * BlockDimX + threadIdx.x;
|
||||
// Read mean and inv std dev.
|
||||
if (tid == 0)
|
||||
|
@ -899,8 +942,8 @@ __global__ void kComputeSpatialScaleAndBiasGradients(int vectorSize, int spatial
|
|||
invStdDev = invStdDevS;
|
||||
}
|
||||
|
||||
ElemType ds[U];
|
||||
ElemType db[U];
|
||||
comp_t ds[U];
|
||||
comp_t db[U];
|
||||
#pragma unroll
|
||||
for (int k = 0; k < U; k++)
|
||||
{
|
||||
|
@ -921,25 +964,25 @@ __global__ void kComputeSpatialScaleAndBiasGradients(int vectorSize, int spatial
|
|||
// Stride over all values in feature map (W and H dimensions).
|
||||
for (int irow = irowBase; irow < irowLim; irow += BlockDimX * U, px += BlockDimX * U, pdy += BlockDimX * U)
|
||||
{
|
||||
ElemType curX[U];
|
||||
ElemType curdY[U];
|
||||
comp_t curX[U];
|
||||
comp_t curdY[U];
|
||||
LoadValues<U>(px, curX);
|
||||
LoadValues<U>(pdy, curdY);
|
||||
#pragma unroll
|
||||
for (int k = 0; k < U; k++)
|
||||
{
|
||||
ds[k] += pdy[k] * (curX[k] - mean) * invStdDev;
|
||||
db[k] += pdy[k];
|
||||
ds[k] += (comp_t)pdy[k] * (curX[k] - mean) * invStdDev;
|
||||
db[k] += (comp_t)pdy[k];
|
||||
}
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
using BlockReduce = cub::BlockReduce<ElemType, BlockDimX, cub::BLOCK_REDUCE_WARP_REDUCTIONS, BlockDimY>;
|
||||
using BlockReduce = cub::BlockReduce<comp_t, BlockDimX, cub::BLOCK_REDUCE_WARP_REDUCTIONS, BlockDimY>;
|
||||
// Note: must use separate temp storages for each reduction.
|
||||
__shared__ typename BlockReduce::TempStorage tmp1;
|
||||
ElemType dsRes = BlockReduce(tmp1).Sum(ds);
|
||||
comp_t dsRes = BlockReduce(tmp1).Sum(ds);
|
||||
__shared__ typename BlockReduce::TempStorage tmp2;
|
||||
ElemType dbRes = BlockReduce(tmp2).Sum(db);
|
||||
comp_t dbRes = BlockReduce(tmp2).Sum(db);
|
||||
if (tid == 0)
|
||||
{
|
||||
dScale[blockIdx.x] = dsRes;
|
||||
|
@ -950,9 +993,9 @@ __global__ void kComputeSpatialScaleAndBiasGradients(int vectorSize, int spatial
|
|||
template <int U>
|
||||
struct ComputeScaleAndBiasGradients
|
||||
{
|
||||
template <typename ElemType>
|
||||
template <typename ElemType, typename StatType>
|
||||
static void Call(size_t vectorSize, size_t batchSize, const ElemType* x, const ElemType* dy,
|
||||
ElemType* dScale, ElemType* dBias, const ElemType* savedMean, const ElemType* savedInvStdDev, cudaStream_t stream)
|
||||
StatType* dScale, StatType* dBias, const StatType* savedMean, const StatType* savedInvStdDev, cudaStream_t stream)
|
||||
{
|
||||
assert((vectorSize % U) == 0);
|
||||
assert(batchSize >= 1);
|
||||
|
@ -961,7 +1004,7 @@ struct ComputeScaleAndBiasGradients
|
|||
auto bdim = dim3(BlockDimX, BlockDimY);
|
||||
// Create a grid that uses striding in the y-dimension to cover the whole minibatch.
|
||||
auto gdim = dim3(static_cast<unsigned int>(RoundUpToMultiple(vectorSize, BlockDimX * U)));
|
||||
kComputeScaleAndBiasGradients<BlockDimX, BlockDimY, U><<<gdim, bdim, 0, stream>>>(
|
||||
kComputeScaleAndBiasGradients<BlockDimX, BlockDimY, U, ElemType, StatType><<<gdim, bdim, 0, stream>>>(
|
||||
static_cast<int>(vectorSize), static_cast<int>(batchSize), x, dy, dScale, dBias, savedMean, savedInvStdDev);
|
||||
}
|
||||
};
|
||||
|
@ -969,9 +1012,9 @@ struct ComputeScaleAndBiasGradients
|
|||
template <int U>
|
||||
struct ComputeSpatialScaleAndBiasGradients
|
||||
{
|
||||
template <typename ElemType>
|
||||
template <typename ElemType, typename StatType>
|
||||
static void Call(size_t vectorSize, size_t spatialSize, size_t batchSize, const ElemType* x, const ElemType* dy,
|
||||
ElemType* dScale, ElemType* dBias, const ElemType* savedMean, const ElemType* savedInvStdDev, cudaStream_t stream)
|
||||
StatType* dScale, StatType* dBias, const StatType* savedMean, const StatType* savedInvStdDev, cudaStream_t stream)
|
||||
{
|
||||
assert((spatialSize % U) == 0);
|
||||
assert((vectorSize % spatialSize) == 0);
|
||||
|
@ -982,17 +1025,18 @@ struct ComputeSpatialScaleAndBiasGradients
|
|||
auto bdim = dim3(BlockDimX, BlockDimY);
|
||||
// Create a grid that uses striding in the y-dimension to cover the whole minibatch.
|
||||
auto gdim = dim3(static_cast<unsigned int>(vectorSize / spatialSize));
|
||||
kComputeSpatialScaleAndBiasGradients<BlockDimX, BlockDimY, U><<<gdim, bdim, 0, stream>>>(
|
||||
kComputeSpatialScaleAndBiasGradients<BlockDimX, BlockDimY, U, ElemType, StatType><<<gdim, bdim, 0, stream>>>(
|
||||
static_cast<int>(vectorSize), static_cast<int>(spatialSize), static_cast<int>(batchSize), x, dy, dScale, dBias, savedMean, savedInvStdDev);
|
||||
}
|
||||
};
|
||||
|
||||
// mbStatsWeight is the weight with which current MB's stats were used (0 means not at all, locked model).
|
||||
template <int BlockDimX, int BlockDimY, bool Spatial, int U, typename ElemType>
|
||||
template <int BlockDimX, int BlockDimY, bool Spatial, int U, typename ElemType, typename StatType>
|
||||
__global__ void kBackpropagateBatchNormGradients(int vectorSize, int spatialSize, int batchSize, const ElemType* x, const ElemType* dy, ElemType* dx,
|
||||
const ElemType* bnScale, ElemType mbStatsWeight, const ElemType* dScale, const ElemType* dBias,
|
||||
const ElemType* savedMean, const ElemType* savedInvStdDev)
|
||||
const StatType* bnScale, StatType mbStatsWeight, const StatType* dScale, const StatType* dBias,
|
||||
const StatType* savedMean, const StatType* savedInvStdDev)
|
||||
{
|
||||
typedef typename TypeSelector<ElemType>::comp_t comp_t;
|
||||
static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
|
||||
static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32).");
|
||||
assert(blockDim.x == BlockDimX);
|
||||
|
@ -1008,11 +1052,11 @@ __global__ void kBackpropagateBatchNormGradients(int vectorSize, int spatialSize
|
|||
if (irowBase >= vectorSize)
|
||||
return;
|
||||
assert(irowBase + U <= vectorSize);
|
||||
ElemType scale[U];
|
||||
ElemType ds[U];
|
||||
ElemType db[U];
|
||||
ElemType mean[U];
|
||||
ElemType invStdDev[U];
|
||||
comp_t scale[U];
|
||||
comp_t ds[U];
|
||||
comp_t db[U];
|
||||
comp_t mean[U];
|
||||
comp_t invStdDev[U];
|
||||
// REVIEW alexeyk: here we're wasting some bandwidth but this might be ok as it's a one-timer.
|
||||
if (Spatial)
|
||||
{
|
||||
|
@ -1044,9 +1088,9 @@ __global__ void kBackpropagateBatchNormGradients(int vectorSize, int spatialSize
|
|||
size_t stride = static_cast<size_t>(gridDim.y * BlockDimY) * vectorSize;
|
||||
for (; icol < batchSize; icol += gridDim.y * BlockDimY, px += stride, pdy += stride, pdx += stride)
|
||||
{
|
||||
ElemType xCur[U];
|
||||
ElemType dyCur[U];
|
||||
ElemType dxCur[U];
|
||||
comp_t xCur[U];
|
||||
comp_t dyCur[U];
|
||||
comp_t dxCur[U];
|
||||
LoadValues<U>(px, xCur);
|
||||
LoadValues<U>(pdy, dyCur);
|
||||
LoadValues<U>(pdx, dxCur);
|
||||
|
@ -1060,12 +1104,12 @@ __global__ void kBackpropagateBatchNormGradients(int vectorSize, int spatialSize
|
|||
// dBias = Reduce(dy)
|
||||
// dScale = Reduce(dy * xHat)
|
||||
// Simplifying this a bit more, we get the formula below.
|
||||
ElemType val[U];
|
||||
comp_t val[U];
|
||||
int m = Spatial ? batchSize * spatialSize : batchSize;
|
||||
#pragma unroll
|
||||
for (int k = 0; k < U; k++)
|
||||
{
|
||||
ElemType xNorm = (xCur[k] - mean[k]) * invStdDev[k]; // xHat
|
||||
comp_t xNorm = (xCur[k] - mean[k]) * invStdDev[k]; // xHat
|
||||
// scale * invStdDev * (
|
||||
// dL/dyi
|
||||
// - mbStatsWeight * (xHat * dL/dScale + dL/dBias) / m
|
||||
|
@ -1073,7 +1117,7 @@ __global__ void kBackpropagateBatchNormGradients(int vectorSize, int spatialSize
|
|||
val[k] = dxCur[k] // (adding to gradient)
|
||||
+ (scale[k] * invStdDev[k]) * (
|
||||
dyCur[k]
|
||||
- mbStatsWeight * (xNorm * ds[k] + db[k]) / m);
|
||||
- (comp_t)mbStatsWeight * (xNorm * ds[k] + db[k]) / m);
|
||||
}
|
||||
StoreValues<U>(val, pdx);
|
||||
}
|
||||
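A toy single-element check of the simplified data-gradient formula used above; the dScale/dBias values are assumed to come from a reduction like the one sketched earlier, and all numbers are made up:

```cuda
#include <cstdio>

// dx += scale * invStdDev * (dy - mbStatsWeight * (xHat * dScale + dBias) / m)
int main()
{
    float scale = 2.0f, invStdDev = 0.5f, mbStatsWeight = 1.0f;
    float dy = 0.1f, xHat = -0.75f, dScale = 0.35f, dBias = 0.6f;
    int m = 4;

    float dxPrev = 0.0f;  // gradient already accumulated at this position
    float dx = dxPrev + scale * invStdDev * (dy - mbStatsWeight * (xHat * dScale + dBias) / m);
    std::printf("dx = %f\n", dx);  // (2*0.5) * (0.1 - (-0.2625 + 0.6) / 4) = 0.015625
    return 0;
}
```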
|
@ -1082,10 +1126,10 @@ __global__ void kBackpropagateBatchNormGradients(int vectorSize, int spatialSize
|
|||
template <int U>
|
||||
struct BackpropagateBatchNormGradients
|
||||
{
|
||||
template <typename ElemType>
|
||||
template <typename ElemType, typename StatType>
|
||||
static void Call(size_t vectorSize, size_t spatialSize, size_t batchSize, bool spatial, const ElemType* x, const ElemType* dy, ElemType* dx,
|
||||
const ElemType* bnScale, ElemType mbStatsWeight, const ElemType* dScale,
|
||||
const ElemType* dBias, const ElemType* savedMean, const ElemType* savedInvStdDev, cudaStream_t stream)
|
||||
const StatType* bnScale, StatType mbStatsWeight, const StatType* dScale,
|
||||
const StatType* dBias, const StatType* savedMean, const StatType* savedInvStdDev, cudaStream_t stream)
|
||||
{
|
||||
assert((vectorSize % U) == 0);
|
||||
assert(batchSize >= 1);
|
||||
|
@ -1096,7 +1140,7 @@ struct BackpropagateBatchNormGradients
|
|||
static_cast<unsigned int>(RoundUpToMultiple(batchSize, BlockDimY)));
|
||||
if (spatial)
|
||||
{
|
||||
kBackpropagateBatchNormGradients<BlockDimX, BlockDimY, true/*spatial*/, U><<<gdim, bdim, 0, stream>>>(
|
||||
kBackpropagateBatchNormGradients<BlockDimX, BlockDimY, true/*spatial*/, U, ElemType, StatType><<<gdim, bdim, 0, stream>>>(
|
||||
static_cast<int>(vectorSize), static_cast<int>(spatialSize), static_cast<int>(batchSize), x, dy, dx, bnScale, mbStatsWeight, dScale, dBias, savedMean, savedInvStdDev);
|
||||
}
|
||||
else
|
||||
|
|
|
@ -51,6 +51,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
MATH_API void SetMathLibTraceLevel(int traceLevel);
|
||||
MATH_API int GetMathLibTraceLevel();
|
||||
|
||||
inline bool IsGpu(DEVICEID_TYPE deviceId)
|
||||
{
|
||||
return deviceId > CPUDEVICE;
|
||||
}
|
||||
|
||||
class MATH_API TracingGPUMemoryAllocator
|
||||
{
|
||||
private:
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
//
|
||||
// Copyright (c) Microsoft. All rights reserved.
|
||||
// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
|
||||
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
|
||||
//
|
||||
|
||||
|
@ -8,6 +9,7 @@
|
|||
#include <cuda_runtime.h>
|
||||
#include <device_launch_parameters.h>
|
||||
#include <math_constants.h>
|
||||
#include "half.hpp"
|
||||
|
||||
namespace Microsoft { namespace MSR { namespace CNTK {
|
||||
|
||||
|
@ -28,6 +30,7 @@ __global__ void kConvolutionForward(int batchSize, const ElemType* __restrict__
|
|||
const ElemType* __restrict__ src, int srcVecSize,
|
||||
ElemType* dst, int dstVecSize)
|
||||
{
|
||||
typedef typename TypeSelector<ElemType>::comp_t comp_t;
|
||||
int row = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (row >= dstVecSize)
|
||||
return;
|
||||
|
@ -41,7 +44,7 @@ __global__ void kConvolutionForward(int batchSize, const ElemType* __restrict__
|
|||
int ivBase = mpRowIwht[row];
|
||||
assert(0 <= colBase && colBase < srcVecSize);
|
||||
|
||||
ElemType sum = 0;
|
||||
comp_t sum = 0;
|
||||
int i0 = mpRowRun[row];
|
||||
int skip = runs[i0++];
|
||||
int size = runs[i0++];
|
||||
|
@ -52,7 +55,7 @@ __global__ void kConvolutionForward(int batchSize, const ElemType* __restrict__
|
|||
continue;
|
||||
int dcol = runs[i0 + i];
|
||||
assert(0 <= colBase + dcol && colBase + dcol < srcVecSize);
|
||||
sum += kernel[ivBase + skip + i] * src[colBase + dcol];
|
||||
sum += (comp_t)kernel[ivBase + skip + i] * (comp_t)src[colBase + dcol];
|
||||
}
|
||||
dst[row] = sum;
|
||||
|
||||
|
@ -68,6 +71,7 @@ __global__ void kConvolutionBackwardData(int batchSize, const ElemType* __restri
|
|||
const ElemType* __restrict__ srcGrad, int srcVecSize,
|
||||
ElemType* grad, int dstVecSize)
|
||||
{
|
||||
typedef typename TypeSelector<ElemType>::comp_t comp_t;
|
||||
int row = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (row >= srcVecSize)
|
||||
return;
|
||||
|
@ -81,7 +85,7 @@ __global__ void kConvolutionBackwardData(int batchSize, const ElemType* __restri
|
|||
int ivBase = mpRowIwht[row];
|
||||
assert(0 <= colBase && colBase < dstVecSize);
|
||||
|
||||
ElemType g = srcGrad[row];
|
||||
comp_t g = srcGrad[row];
|
||||
int i0 = mpRowRun[row];
|
||||
int skip = runs[i0++];
|
||||
int size = runs[i0++];
|
||||
|
@ -92,7 +96,7 @@ __global__ void kConvolutionBackwardData(int batchSize, const ElemType* __restri
|
|||
continue;
|
||||
int dcol = runs[i0 + i];
|
||||
assert(0 <= colBase + dcol && colBase + dcol < dstVecSize);
|
||||
atomicAdd(&grad[colBase + dcol], g * kernel[ivBase + skip + i]);
|
||||
atomicAdd(&grad[colBase + dcol], (ElemType)((comp_t)g * (comp_t)kernel[ivBase + skip + i]));
|
||||
}
|
||||
|
||||
srcGrad += blockDim.y * srcVecSize;
|
||||
|
@ -108,6 +112,7 @@ __global__ void kConvolutionBackwardKernel(int batchSize, int inVecSize, int out
|
|||
const ElemType* __restrict__ srcGrad,
|
||||
ElemType* kernelGrad)
|
||||
{
|
||||
typedef typename TypeSelector<ElemType>::comp_t comp_t;
|
||||
int row = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (row >= outVecSize)
|
||||
return;
|
||||
|
@ -121,7 +126,7 @@ __global__ void kConvolutionBackwardKernel(int batchSize, int inVecSize, int out
|
|||
int ivBase = mpRowIwht[row];
|
||||
assert(0 <= colBase && colBase < inVecSize);
|
||||
|
||||
ElemType g = srcGrad[row];
|
||||
comp_t g = srcGrad[row];
|
||||
int i0 = mpRowRun[row];
|
||||
int skip = runs[i0++];
|
||||
int size = runs[i0++];
|
||||
|
@ -132,7 +137,7 @@ __global__ void kConvolutionBackwardKernel(int batchSize, int inVecSize, int out
|
|||
continue;
|
||||
int dcol = runs[i0 + i];
|
||||
assert(0 <= colBase + dcol && colBase + dcol < inVecSize);
|
||||
atomicAdd(&kernelGrad[ivBase + skip + i], g * in[colBase + dcol]);
|
||||
atomicAdd(&kernelGrad[ivBase + skip + i], (ElemType)((comp_t)g * (comp_t)in[colBase + dcol]));
|
||||
}
|
||||
|
||||
in += blockDim.y * inVecSize;
|
||||
|
@ -245,6 +250,7 @@ __global__ void kMaxROIPoolingForward(const int totalIterations,
|
|||
const int pooledWidth, const int pooledHeight, const ElemType* src,
|
||||
const ElemType* roiData, ElemType* dst, ElemType* argmax, double spatialScale)
|
||||
{
|
||||
typedef typename TypeSelector<ElemType>::comp_t comp_t;
|
||||
// index loops over all totalRois*c*pooledHeight*pooledWidth output locations.
|
||||
for (int index = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
index < (totalIterations); index += blockDim.x * gridDim.x)
|
||||
|
@ -269,8 +275,8 @@ __global__ void kMaxROIPoolingForward(const int totalIterations,
|
|||
int roiWidth = max(roiEndW - roiStartW + 1, (int)1);
|
||||
int roiHeight = max(roiEndH - roiStartH + 1, (int)1);
|
||||
|
||||
ElemType winH = (ElemType)roiHeight / (ElemType)pooledHeight;
|
||||
ElemType winW = (ElemType)roiWidth / (ElemType)pooledWidth;
|
||||
comp_t winH = (comp_t)roiHeight / (comp_t)pooledHeight;
|
||||
comp_t winW = (comp_t)roiWidth / (comp_t)pooledWidth;
|
||||
|
||||
// compute window for this output location.
|
||||
int hstart = (int)(ph * winH);
|
||||
|
@ -286,7 +292,7 @@ __global__ void kMaxROIPoolingForward(const int totalIterations,
|
|||
|
||||
bool isempty = (hend <= hstart) || (wend <= wstart);
|
||||
// Define an empty pooling region to be zero
|
||||
ElemType maxval = isempty ? (ElemType)0 : -CUDART_INF_F;
|
||||
comp_t maxval = isempty ? (comp_t)0 : (comp_t)-CUDART_INF_F;
|
||||
int maxidx = -1;
|
||||
|
||||
int imgIdx = n / numROIs;
|
||||
|
@ -319,6 +325,7 @@ __global__ void kMaxROIPoolingBackward(const int totalIterations,
|
|||
const int pooledWidth, const int pooledHeight, const ElemType* pooledGrad,
|
||||
const ElemType* roiData, ElemType* grad, const ElemType* argmax, double spatialScale)
|
||||
{
|
||||
typedef typename TypeSelector<ElemType>::comp_t comp_t;
|
||||
// index loops over all input locations (locations in the original input tensor).
|
||||
for (int index = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
index < (totalIterations); index += blockDim.x * gridDim.x)
|
||||
|
@ -334,7 +341,7 @@ __global__ void kMaxROIPoolingBackward(const int totalIterations,
|
|||
int roiMin = n * numROIs;
|
||||
int roiMax = (n + 1) * numROIs;
|
||||
|
||||
ElemType gradient = 0;
|
||||
comp_t gradient = 0;
|
||||
for (int roiN = roiMin; roiN < roiMax; roiN++)
|
||||
{
|
||||
// each ROI is 4 elements: (x, y, w, h)
|
||||
|
@ -355,8 +362,8 @@ __global__ void kMaxROIPoolingBackward(const int totalIterations,
|
|||
if (!inROI)
|
||||
continue;
|
||||
|
||||
ElemType winH = (ElemType)roiHeight / (ElemType)pooledHeight;
|
||||
ElemType winW = (ElemType)roiWidth / (ElemType)pooledWidth;
|
||||
comp_t winH = (comp_t)roiHeight / (comp_t)pooledHeight;
|
||||
comp_t winW = (comp_t)roiWidth / (comp_t)pooledWidth;
|
||||
|
||||
// what pooled nodes in the output for this ROI could have pooled this input location?
|
||||
// we use int here since the computation can yield a negative result
|
||||
|
@ -381,13 +388,13 @@ __global__ void kMaxROIPoolingBackward(const int totalIterations,
|
|||
{
|
||||
if ((int)offsetArgmax[ph * pooledWidth + pw] == (h * width + w))
|
||||
{
|
||||
gradient += offsetPoolGrad[ph * pooledWidth + pw];
|
||||
gradient += (comp_t)offsetPoolGrad[ph * pooledWidth + pw];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
atomicAdd(&grad[index], gradient);
|
||||
atomicAdd(&grad[index], (ElemType)gradient);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -443,6 +450,7 @@ __global__ void kAveragePoolingForward(int batchSize, const int* mpRowCol, const
|
|||
const ElemType* __restrict__ src, int srcVecSize,
|
||||
ElemType* dst, int dstVecSize)
|
||||
{
|
||||
typedef typename TypeSelector<ElemType>::comp_t comp_t;
|
||||
int row = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (row >= dstVecSize)
|
||||
return;
|
||||
|
@ -457,14 +465,14 @@ __global__ void kAveragePoolingForward(int batchSize, const int* mpRowCol, const
|
|||
|
||||
int i0 = mpRowIndices[row];
|
||||
int size = indices[i0++];
|
||||
ElemType sum = 0;
|
||||
comp_t sum = 0;
|
||||
for (int i = 0; i < size; i++)
|
||||
{
|
||||
int dcol = indices[i0 + i];
|
||||
assert(0 <= colBase + dcol && colBase + dcol < srcVecSize);
|
||||
sum += src[colBase + dcol];
|
||||
sum += (comp_t)src[colBase + dcol];
|
||||
}
|
||||
dst[row] = sum / size;
|
||||
dst[row] = sum / (comp_t)size;
|
||||
|
||||
src += blockDim.y * srcVecSize;
|
||||
dst += blockDim.y * dstVecSize;
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
#include "stdafx.h"
|
||||
#include "ConvolutionEngine.h"
|
||||
#include "CuDnnFactories.h"
|
||||
#include "Mkl2017DnnCommon.h"
|
||||
#include "MklDnnCommon.h"
|
||||
|
||||
namespace Microsoft { namespace MSR { namespace CNTK {
|
||||
|
||||
|
@ -239,12 +239,6 @@ protected:
|
|||
out.MaxUnpooling(m_mpRowCol, *m_mpRowIndices, *m_indices, poolIn, in);
|
||||
}
|
||||
|
||||
protected:
|
||||
static bool IsGpu(DEVICEID_TYPE deviceId)
|
||||
{
|
||||
return deviceId >= 0;
|
||||
}
|
||||
|
||||
protected:
|
||||
using IntMatPtr = std::unique_ptr<Matrix<int>>;
|
||||
// IMP NOTE: Make sure that in the declaration below m_isConvGeometryComputed is declared
|
||||
|
@ -579,8 +573,6 @@ public:
|
|||
protected:
|
||||
using typename Base::IntMatPtr;
|
||||
|
||||
using Base::IsGpu;
|
||||
|
||||
using Base::m_geometry;
|
||||
using Base::m_deviceId;
|
||||
using Base::m_imageLayout;
|
||||
|
@ -955,6 +947,11 @@ protected:
|
|||
|
||||
bool Supported(const ConvolveGeometry* geometry, bool forward)
|
||||
{
|
||||
//TODO: test code for linking with mkldnn.dll, will extend to support dilated convolution with MKL-DNN later
|
||||
mkldnn_primitive_attr_t attr;
|
||||
mkldnn_primitive_attr_create(&attr);
|
||||
mkldnn_primitive_attr_destroy(attr);
|
||||
|
||||
//MKL2017 does not support asymmetric padding yet
|
||||
if (geometry->IsAsymmetricPadding()) return false;
|
||||
|
||||
|
@ -1177,7 +1174,41 @@ std::unique_ptr<ConvolutionEngine<ElemType>> ConvolutionEngine<ElemType>::Create
|
|||
return std::make_unique<ReferenceConvolutionEngine<ElemType>>(geometry, deviceId, imageLayout, maxTempMemSizeInSamples, poolKind, poolIncludePad);
|
||||
}
|
||||
|
||||
// only GPU supports fp16 convolution
|
||||
template <>
|
||||
std::unique_ptr<ConvolutionEngine<half>> ConvolutionEngine<half>::Create(ConvolveGeometryPtr geometry, DEVICEID_TYPE deviceId,
|
||||
ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples, PoolKind poolKind,
|
||||
ConvolutionEngineKind enabledEngines, std::wstring logPrefix,
|
||||
bool forceDeterministicAlgorithms, bool poolIncludePad,
|
||||
bool inputHasFreeDimension)
|
||||
{
|
||||
if (!logPrefix.empty())
|
||||
logPrefix += L": ";
|
||||
|
||||
auto isEnabled = [=](ConvolutionEngineKind eng) { return ((int)enabledEngines & (int)eng) != 0; };
|
||||
// Note: in some cases we do not throw an exception even if parameters do not match, as Create
|
||||
// can be called from places like MEL with default parameters and never be used.
|
||||
// The check will be done later in the engine's EnsureCompatible call if the engine is actually used.
|
||||
auto engStr = (std::string)(*geometry);
|
||||
|
||||
// Check if we can use cuDNN engine. Do not need to validate tensors as ConvolveGeometry has already done that.
|
||||
if (isEnabled(ConvolutionEngineKind::CuDnn) &&
|
||||
CuDnnConvolutionEngineFactory<half>::IsSupported(deviceId, geometry, poolKind))
|
||||
{
|
||||
if (GetMathLibTraceLevel() > 0)
|
||||
fprintf(stderr, "%lsusing cuDNN convolution engine for geometry: %s.\n", logPrefix.c_str(), engStr.c_str());
|
||||
|
||||
return CuDnnConvolutionEngineFactory<half>::Create(geometry, deviceId, imageLayout, maxTempMemSizeInSamples, poolKind,
|
||||
forceDeterministicAlgorithms, poolIncludePad, inputHasFreeDimension);
|
||||
}
|
||||
|
||||
RuntimeError("FP16 convolution is only supported via cuDNN.");
|
||||
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
template class ConvolutionEngine<float>;
|
||||
template class ConvolutionEngine<double>;
|
||||
template class ConvolutionEngine<half>;
|
||||
|
||||
}}}
|
||||
|
|
|
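
Note: the specialization above deliberately has no CPU or reference fallback; for half, only the cuDNN path is ever returned. A condensed, compilable sketch of that dispatch pattern, with simplified types standing in for the real engine classes:

    #include <memory>
    #include <stdexcept>

    struct Engine { virtual ~Engine() {} };
    struct CuDnnEngine : Engine {};
    struct ReferenceEngine : Engine {};
    struct HalfTag {};                                            // stands in for CNTK's half type

    template <typename ElemType>
    std::unique_ptr<Engine> CreateEngine(bool cudnnSupported)
    {
        if (cudnnSupported)
            return std::unique_ptr<Engine>(new CuDnnEngine());
        return std::unique_ptr<Engine>(new ReferenceEngine());    // float/double may fall back
    }

    template <>
    std::unique_ptr<Engine> CreateEngine<HalfTag>(bool cudnnSupported)
    {
        if (cudnnSupported)
            return std::unique_ptr<Engine>(new CuDnnEngine());
        throw std::runtime_error("FP16 convolution is only supported via cuDNN."); // no fallback for half
    }
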
@@ -11,20 +11,21 @@

 namespace Microsoft { namespace MSR { namespace CNTK {

-template <class ElemType>
-class CuDnnBatchNormEngine : public BatchNormEngine<ElemType>
+template <class InoutType, class StatType>
+class CuDnnBatchNormEngine : public BatchNormEngine<InoutType, StatType>
 {
 public:
-    using Base = BatchNormEngine<ElemType>;
-    using typename Base::Mat;
+    using Base = BatchNormEngine<InoutType, StatType>;
+    using typename Base::InoutMat;
+    using typename Base::StatMat;

 public:
     CuDnnBatchNormEngine(DEVICEID_TYPE deviceId, const TensorShape& inOutT,
                          bool spatial, ImageLayoutKind imageLayout)
         : Base(deviceId, inOutT, spatial, imageLayout),
           m_cudnn(CuDnn::Instance()),
-          m_inOutCuDnnT(GetInOutTensor(inOutT), CuDnnTensor::GetDataType<ElemType>()),
-          m_scaleBiasCuDnnT(GetScaleBiasTensor(inOutT, spatial), CuDnnTensor::GetDataType<ElemType>()),
+          m_inOutCuDnnT(GetInOutTensor(inOutT), CuDnnTensor::GetDataType<InoutType>()),
+          m_scaleBiasCuDnnT(GetScaleBiasTensor(inOutT, spatial), CuDnnTensor::GetDataType<StatType>()),
           m_cudnnEpsilon(CUDNN_BN_MIN_EPSILON)
     {
     }

@@ -43,8 +44,8 @@ protected:
             InvalidArgument("cuDNN batch normalization supports tensors of max 4 dimensions.");
     }

-    void ForwardCore(const Mat& in, const Mat& scale, const Mat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance,
-                     Mat& out, double epsilon, Mat& savedMean, Mat& savedInvStdDev) override
+    void ForwardCore(const InoutMat& in, const StatMat& scale, const StatMat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, StatMat& runMean, StatMat& runVariance,
+                     InoutMat& out, double epsilon, StatMat& savedMean, StatMat& savedInvStdDev) override
     {
         // TODO batchSize == 1

@@ -53,7 +54,8 @@ protected:
             InvalidArgument("cuDNN batch normalization engine currently supports blendTimeConstant of 0 or 1 only.");

         m_inOutCuDnnT.UpdateBatchSize(in.GetNumCols());
-        cudnnBatchNormMode_t mode = m_spatial ? CUDNN_BATCHNORM_SPATIAL : CUDNN_BATCHNORM_PER_ACTIVATION;
+        cudnnBatchNormMode_t mode = m_spatial ? CUDNN_BATCHNORM_SPATIAL_PERSISTENT : CUDNN_BATCHNORM_PER_ACTIVATION;
+        if (inferenceOnly) mode = m_spatial ? CUDNN_BATCHNORM_SPATIAL : CUDNN_BATCHNORM_PER_ACTIVATION;
         // cuDNN will fail with BAD_PARAM if epsilon < CUDNN_BN_MIN_EPSILON.
         m_cudnnEpsilon = max(epsilon, CUDNN_BN_MIN_EPSILON);
         if (inferenceOnly)

@@ -75,23 +77,26 @@ protected:
         }
     }

-    void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, const Mat& savedMean, const Mat& savedInvStdDev,
-                      Mat& scaleGrad, Mat& biasGrad, bool accumulateDataGrad) override
+    void BackwardCore(const InoutMat& in, const InoutMat& srcGrad, InoutMat& grad, const StatMat& scale, double blendFactor, const StatMat& savedMean, const StatMat& savedInvStdDev,
+                      StatMat& scaleGrad, StatMat& biasGrad, bool accumulateDataGrad) override
     {
         UNUSED(blendFactor); // BUGBUG: It should be used.
         m_inOutCuDnnT.UpdateBatchSize(srcGrad.GetNumCols());
-        cudnnBatchNormMode_t mode = m_spatial ? CUDNN_BATCHNORM_SPATIAL : CUDNN_BATCHNORM_PER_ACTIVATION;
+        cudnnBatchNormMode_t mode = m_spatial ? CUDNN_BATCHNORM_SPATIAL_PERSISTENT : CUDNN_BATCHNORM_PER_ACTIVATION;
         // REVIEW alexeyk: change betaParamDiff to 1 and update CNTK BN engine.
         CUDNN_CALL(cudnnBatchNormalizationBackward(*m_cudnn, mode, &C::One, accumulateDataGrad ? &C::One : &C::Zero, &C::One, &C::Zero, m_inOutCuDnnT, ptr(in), m_inOutCuDnnT, ptr(srcGrad), m_inOutCuDnnT, ptr(grad),
                                                    m_scaleBiasCuDnnT, ptr(scale), ptr(scaleGrad), ptr(biasGrad), m_cudnnEpsilon, ptr(savedMean), ptr(savedInvStdDev)));
     }

 private:
-    static ElemType* ptr(Mat& src)
+    template<typename ElemType>
+    static ElemType* ptr(Matrix<ElemType>& src)
     {
         return src.Data();
     }
-    static const ElemType* ptr(const Mat& src)
+
+    template<typename ElemType>
+    static const ElemType* ptr(const Matrix<ElemType>& src)
     {
         return src.Data();
     }

@@ -123,7 +128,7 @@ private:
     }

 private:
-    using C = Consts<ElemType>;
+    using C = Consts<StatType>;

     CuDnn::ptr_t m_cudnn;
     CuDnnTensor m_inOutCuDnnT;

@@ -131,18 +136,20 @@ private:
     double m_cudnnEpsilon;
 };

-template class CuDnnBatchNormEngine<float>;
-template class CuDnnBatchNormEngine<double>;
+template class CuDnnBatchNormEngine<float, float>;
+template class CuDnnBatchNormEngine<double, double>;
+template class CuDnnBatchNormEngine<half, float>;

-template <typename ElemType>
-std::unique_ptr<BatchNormEngine<ElemType>> CuDnnBatchNormEngineFactory<ElemType>::Create(DEVICEID_TYPE deviceId, const TensorShape& inOutT,
+template <typename InoutType, typename StatType>
+std::unique_ptr<BatchNormEngine<InoutType, StatType>> CuDnnBatchNormEngineFactory<InoutType, StatType>::Create(DEVICEID_TYPE deviceId, const TensorShape& inOutT,
                                                                                          bool spatial, ImageLayoutKind imageLayout)
 {
-    return std::make_unique<CuDnnBatchNormEngine<ElemType>>(deviceId, inOutT, spatial, imageLayout);
+    return std::make_unique<CuDnnBatchNormEngine<InoutType, StatType>>(deviceId, inOutT, spatial, imageLayout);
 }

-template class CuDnnBatchNormEngineFactory<float>;
-template class CuDnnBatchNormEngineFactory<double>;
+template class CuDnnBatchNormEngineFactory<float, float>;
+template class CuDnnBatchNormEngineFactory<double, double>;
+template class CuDnnBatchNormEngineFactory<half, float>;

 CudaTimer::~CudaTimer()
 {
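
Note: the batch-norm engine is now parameterized on two types, so the activations (InoutType) can be half while scale, bias and the running statistics (StatType) stay in float, matching the <half, float> instantiations added above. A simplified, compilable sketch of the two-type pattern (types and names are illustrative, not CNTK's):

    #include <vector>

    struct HalfTag { float v; HalfTag(float f = 0) : v(f) {} operator float() const { return v; } };

    template <typename InoutType, typename StatType>
    struct BatchNormSketch
    {
        // Statistics are kept in the wider StatType even when activations are 16-bit.
        std::vector<StatType> mean, invStd;

        void Forward(const std::vector<InoutType>& in, std::vector<InoutType>& out,
                     const std::vector<StatType>& scale, const std::vector<StatType>& bias) const
        {
            // Per-channel handling omitted: every vector (including mean/invStd) is assumed to have in.size() elements.
            out.resize(in.size());
            for (size_t i = 0; i < in.size(); i++)
                out[i] = (InoutType)(((StatType)in[i] - mean[i]) * invStd[i] * scale[i] + bias[i]);
        }
    };

    // Instantiations mirroring the ones added in the diff:
    template struct BatchNormSketch<float, float>;
    template struct BatchNormSketch<double, double>;
    template struct BatchNormSketch<HalfTag, float>;   // FP16 data, FP32 statistics
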
@@ -6,6 +6,7 @@
 #include "stdafx.h"
 #include "GPUMatrix.h"
 #include "CuDnnCommon.h"
+#include "half.hpp"

 namespace Microsoft { namespace MSR { namespace CNTK {
 #ifndef CPUONLY

@@ -23,6 +24,10 @@ const float Consts<float>::Zero = 0;
 template <>
 const double Consts<double>::Zero = 0;

+const float Consts<half>::Zero = 0;
+const float Consts<half>::One = 1;
+
+
 CuDnnTensor::CuDnnTensor()
     : m_tensor(nullptr)
 {

@@ -85,12 +90,15 @@ cudnnDataType_t CuDnnTensor::GetDataType()
         return CUDNN_DATA_FLOAT;
     else if (typeid(ElemType) == typeid(double))
         return CUDNN_DATA_DOUBLE;
+    else if (typeid(ElemType) == typeid(half))
+        return CUDNN_DATA_HALF;
     else
         InvalidArgument("cuDNN engine currently supports only single and double precision data types.");
 }

 template cudnnDataType_t CuDnnTensor::GetDataType<float>();
 template cudnnDataType_t CuDnnTensor::GetDataType<double>();
+template cudnnDataType_t CuDnnTensor::GetDataType<half>();

 CuDnn::ptr_t CuDnn::Instance()
 {
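
Note: Consts<half> above is specialized to hold float rather than half values. This follows cuDNN's convention that the alpha/beta scaling scalars accompany the compute type: they are passed as float for half and float tensors and as double for double tensors. A small sketch of that rule (names are illustrative):

    // Illustrative only: choose the scalar type handed to cuDNN as alpha/beta.
    struct HalfTag {};

    template <typename ElemType> struct ScaleType          { typedef ElemType type; }; // float -> float, double -> double
    template <>                  struct ScaleType<HalfTag> { typedef float    type; }; // half  -> float

    template <typename ElemType>
    const void* OnePtr()
    {
        static const typename ScaleType<ElemType>::type one = 1;
        return &one;   // what would be passed as cuDNN's alpha/beta parameter
    }
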
@@ -12,6 +12,7 @@
 #error CNTK requires the NVIDIA cuDNN library 5.0 or higher to build, cf. https://docs.microsoft.com/en-us/cognitive-toolkit/Setup-CNTK-on-Windows#cudnn or https://docs.microsoft.com/en-us/cognitive-toolkit/Setup-CNTK-on-Linux#cudnn for installation instructions.
 #endif
 #include <memory>
+#include "half.hpp"

 namespace Microsoft { namespace MSR { namespace CNTK {


@@ -51,4 +52,11 @@ struct Consts
     static const ElemType One;
 };

+template <>
+struct Consts<half>
+{
+    static const float Zero;
+    static const float One;
+};
+
 } } }
@@ -1,4 +1,3 @@
-
 //
 // Copyright (c) Microsoft. All rights reserved.
 // Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.

@@ -11,6 +10,10 @@
 #include <typeinfo>
 #include <typeindex>
 #include "CuDnnCommon.h"
+#include "half.hpp"

+// We want tensor core to be enabled in order to get(v7)/find tensor core results. But if an algo without tensor core is faster, the only way to force the faster algo is to turn it off. Since re-tuning can happen quite often in CNTK, it gets bad if we don't do it carefully. It also requires a move to get_v7 and we can't test until we can run fp16.
+// For now, let's keep it simple and enable tensor core all the time for fp16.
+
 template <>
 const char* CudaErrString<cudnnStatus_t>(cudnnStatus_t x)

@@ -26,11 +29,6 @@ const char* CudaErrString<cudnnStatus_t>(cudnnStatus_t x)

 namespace Microsoft { namespace MSR { namespace CNTK {

-static bool IsGpu(DEVICEID_TYPE deviceId)
-{
-    return deviceId >= 0;
-}
-
 class CuDnnKernel
 {
 public:

@@ -54,6 +52,9 @@ public:
         // Set map count(aka K) dimension.
         dims[0] = (int)mapCount;
         dims[1] = (int)filt[filt_size - 1];
+        int numElems = 1;
+        for (int i = 0; i < (int)dim_size; i++) numElems *= dims[i];
+        m_isOdd = (numElems % 2 == 1);
         CUDNN_CALL(cudnnSetFilterNdDescriptor(m_kernel, dataType, FILTER_FORMAT, (int)dim_size, dims.data()));
     }


@@ -71,10 +72,16 @@ public:
         return m_kernel;
     }

+    bool isOdd()
+    {
+        return m_isOdd;
+    }
+
     DISABLE_COPY_AND_MOVE(CuDnnKernel);

 private:
     cudnnFilterDescriptor_t m_kernel;
+    bool m_isOdd;
 };

 class CuDnnConv

@@ -102,7 +109,10 @@ public:
         }
         CUDNN_CALL(cudnnSetConvolutionNdDescriptor(m_conv, (int)dim_size, pad.data(),
                                                    stride.data(), dilation.data(),
-                                                   CUDNN_CROSS_CORRELATION, dataType));
+                                                   CUDNN_CROSS_CORRELATION, dataType == CUDNN_DATA_HALF ? CUDNN_DATA_FLOAT : dataType));
+        // allow tensor core for fp16 by default
+        if (dataType == CUDNN_DATA_HALF)
+            CUDNN_CALL(cudnnSetConvolutionMathType(m_conv, CUDNN_TENSOR_OP_MATH));
     }

     ~CuDnnConv()
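
Note: two FP16-related settings are made on the convolution descriptor above, the compute type is kept at CUDNN_DATA_FLOAT even for CUDNN_DATA_HALF data, and tensor-core math is opted into. A stand-alone sketch of the same setup using the simpler 2-D descriptor API (cuDNN 7 assumed; error handling omitted; not CNTK code):

    #include <cudnn.h>

    cudnnConvolutionDescriptor_t MakeFp16ConvDesc(int padH, int padW, int strideH, int strideW)
    {
        cudnnConvolutionDescriptor_t conv;
        cudnnCreateConvolutionDescriptor(&conv);
        // For half inputs, request accumulation in float rather than half.
        cudnnSetConvolution2dDescriptor(conv, padH, padW, strideH, strideW,
                                        /*dilationH=*/1, /*dilationW=*/1,
                                        CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT);
        // Opt in to tensor cores; cuDNN falls back to regular math where they do not apply.
        cudnnSetConvolutionMathType(conv, CUDNN_TENSOR_OP_MATH);
        return conv;
    }
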
@@ -306,6 +316,8 @@ protected:
             return err;
         };
         FindBestAlgo(batchSize, m_fwdAlgo, workspaceSizeFinder, deterministicFinder, finder, staticFinder, workspace);
+        if (m_dataType == CUDNN_DATA_HALF) CUDNN_CALL(cudnnSetConvolutionMathType(*m_conv, m_fwdAlgo.AlgoMathType));
+        else CUDNN_CALL(cudnnSetConvolutionMathType(*m_conv, CUDNN_DEFAULT_MATH));
         // Perform forward convolution operation.
         CUDNN_CALL(cudnnConvolutionForward(*m_cudnn, &C::One, m_inT, ptr(in), *m_kernelT, ptr(kernel), *m_conv, m_fwdAlgo.selectedAlgo, ptr(workspace), workspace.BufferSize(), &C::Zero, m_outT, ptr(out)));
     }

@@ -369,6 +381,8 @@ protected:
         };
         FindBestAlgo(batchSize, m_backDataAlgo, workspaceSizeFinder, deterministicFinder, finder, staticFinder, workspace);
         // Compute gradients with respect to the output tensor (data).
+        if (m_dataType == CUDNN_DATA_HALF) CUDNN_CALL(cudnnSetConvolutionMathType(*m_conv, m_backDataAlgo.AlgoMathType));
+        else CUDNN_CALL(cudnnSetConvolutionMathType(*m_conv, CUDNN_DEFAULT_MATH));
         CUDNN_CALL(cudnnConvolutionBackwardData(*m_cudnn, &C::One, *m_kernelT, ptr(kernel), m_outT, ptr(srcGrad), *m_conv, m_backDataAlgo.selectedAlgo, ptr(workspace), workspace.BufferSize(), accumulateGradient ? &C::One : &C::Zero, m_inT, ptr(grad)));
     }


@@ -396,6 +410,15 @@ protected:
         {
             if (!noMem)
                 return cudnnGetConvolutionBackwardFilterAlgorithm(*m_cudnn, m_inT, m_outT, *m_conv, *m_kernelT, CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, workspace.BufferSize(), &algo);
+            // special case for half/odd filter
+            if (m_kernelT->isOdd() && m_dataType == CUDNN_DATA_HALF)
+            {
+                size_t tmpSize = 0;
+                algo = (cudnnConvolutionBwdFilterAlgo_t) 1;
+                auto err = cudnnGetConvolutionBackwardFilterWorkspaceSize(*m_cudnn, m_inT, m_outT, *m_conv, *m_kernelT, algo, &tmpSize);
+                workspace.Resize((tmpSize + sizeof(ElemType) - 1) / sizeof(ElemType), 1);
+                return err;
+            }
             return cudnnGetConvolutionBackwardFilterAlgorithm(*m_cudnn, m_inT, m_outT, *m_conv, *m_kernelT, CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE, 0, &algo);
         };
         // find deterministic algorithm

@@ -431,6 +454,8 @@ protected:
         };
         FindBestAlgo(batchSize, m_backFiltAlgo, workspaceSizeFinder, deterministicFinder, finder, staticFinder, workspace);
         // Compute gradients with respect to the output tensor (data).
+        if (m_dataType == CUDNN_DATA_HALF) CUDNN_CALL(cudnnSetConvolutionMathType(*m_conv, m_backFiltAlgo.AlgoMathType));
+        else CUDNN_CALL(cudnnSetConvolutionMathType(*m_conv, CUDNN_DEFAULT_MATH));
         CUDNN_CALL(cudnnConvolutionBackwardFilter(*m_cudnn, &C::One, m_inT, ptr(in), m_outT, ptr(srcGrad), *m_conv, m_backFiltAlgo.selectedAlgo, ptr(workspace), workspace.BufferSize(), accumulateGradient ? &C::One : &C::Zero, *m_kernelT, ptr(kernelGrad)));
     }

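
Note: in the half/odd-filter special case above, the workspace matrix is sized in elements, so the byte count reported by cuDNN is rounded up to whole ElemType slots. A small sketch of that conversion:

    #include <cstddef>

    // Ceiling division from a byte count to a count of ElemType-sized slots,
    // mirroring the (tmpSize + sizeof(ElemType) - 1) / sizeof(ElemType) expression above.
    template <typename ElemType>
    std::size_t BytesToElements(std::size_t bytes)
    {
        return (bytes + sizeof(ElemType) - 1) / sizeof(ElemType);
    }

    // Example: BytesToElements<float>(10) == 3, i.e. 12 bytes of workspace are reserved.
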
@@ -556,6 +581,7 @@ private:
             assert(calgo > 0);
             auto res = algoPerf; // first returned algorithm is the fastest
             algo.RecordAlgoBatchSizeWorkspaceSize(true, (*res).algo, batchSize, (*res).memory);
+            algo.AlgoMathType = (*res).mathType;
             algo.autotuningState = AutotuningState::Running;
             if (algo.MaxAlgoWorkspaceSize < curSize) // need to shrink the workspace
                 workspace.Resize((curSize + sizeof(ElemType) - 1) / sizeof(ElemType), 1, 0, false);

@@ -573,6 +599,7 @@ private:
             assert(calgo > 0);
             auto res = algoPerf; // first returned algorithm is the fastest
             algo.RecordAlgoBatchSizeWorkspaceSize(true, (*res).algo, batchSize, (*res).memory);
+            algo.AlgoMathType = (*res).mathType;
             algo.autotuningState = AutotuningState::Running;
         }
         catch (...)

@@ -613,7 +640,7 @@ private:
     {
         typedef T typeT;
         ConvAlgoInfo()
-            : LastBatchAlgoMBSize(0), MaxAlgoMBSize(0), maxMBSizeSeen(0), autotuningState(AutotuningState::Init), MaxAlgoWorkspaceSize(0), LastBatchAlgoWorkspaceSize(0)
+            : LastBatchAlgoMBSize(0), MaxAlgoMBSize(0), maxMBSizeSeen(0), autotuningState(AutotuningState::Init), MaxAlgoWorkspaceSize(0), LastBatchAlgoWorkspaceSize(0), AlgoMathType(CUDNN_TENSOR_OP_MATH)
         {
         }
         // Variables to stores states

@@ -631,6 +658,8 @@ private:
         decltype(T::algo) selectedAlgo; // currently selected algorithm
         decltype(T::algo) maxAlgo;      // algorithm that was selected when the current workspace is allocated

+        cudnnMathType_t AlgoMathType;
+
         bool NeedAutotuning(size_t batchSize, size_t workspaceSize)
         {
             // NVIDIA:

@@ -741,5 +770,6 @@ bool CuDnnConvolutionEngineFactory<ElemType>::IsSupported(DEVICEID_TYPE deviceId

 template class CuDnnConvolutionEngineFactory<float>;
 template class CuDnnConvolutionEngineFactory<double>;
+template class CuDnnConvolutionEngineFactory<half>;

 } } }
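
Note: the new <half> factory and engine instantiations above rely on explicit template instantiation, so the FP16 code is compiled once inside the implementation files and only the listed element types are available. A minimal sketch of that pattern (names are illustrative, not CNTK's):

    #include <cstdio>

    template <typename ElemType>
    struct EngineSketch
    {
        void Run() { std::printf("element size: %zu\n", sizeof(ElemType)); }
    };

    struct HalfTag { unsigned short bits; };   // stands in for CNTK's half type

    // In a real library these lines live in a .cpp/.cu file next to the definitions;
    // adding FP16 support amounts to adding one more instantiation line.
    template struct EngineSketch<float>;
    template struct EngineSketch<double>;
    template struct EngineSketch<HalfTag>;     // the new FP16 instantiation
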
@@ -21,11 +21,11 @@ public:
     static bool IsSupported(DEVICEID_TYPE deviceId, ConvolveGeometryPtr geometry, PoolKind poolKind);
 };

-template <class ElemType>
+template <class InoutType, class StatType>
 class CuDnnBatchNormEngineFactory
 {
 public:
-    static std::unique_ptr<BatchNormEngine<ElemType>> Create(DEVICEID_TYPE deviceId, const TensorShape& inOutT,
+    static std::unique_ptr<BatchNormEngine<InoutType, StatType>> Create(DEVICEID_TYPE deviceId, const TensorShape& inOutT,
                                                              bool spatial, ImageLayoutKind imageLayout);
 };


@@ -12,6 +12,7 @@
 #include <typeindex>
 #include "CuDnnCommon.h"
 #include "CuDnnRNN.h"
+#include "half.hpp"

 namespace Microsoft { namespace MSR { namespace CNTK {


@@ -172,5 +173,6 @@ void CuDnnRNNExecutor<ElemType>::BackwardWeightsCore(const GPUMatrix<ElemType>&

 template class CuDnnRNNExecutor<double>;
 template class CuDnnRNNExecutor<float>;
+template class CuDnnRNNExecutor<half>;

 } } }