This commit is contained in:
Frank Seide 2017-02-20 18:49:20 -08:00
Parent 6042d0699f da7ce0aa8f
Commit 65bf17f4f4
171 changed files with 26582 additions and 2293 deletions

View file

@@ -119,6 +119,12 @@
<LinkIncremental>$(DebugBuild)</LinkIncremental>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<PreprocessorDefinitions>HAS_MPI=1</PreprocessorDefinitions>
</ClCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(ConfigurationType)' == 'StaticLibrary'">
<ClCompile>
<WarningLevel>Level4</WarningLevel>

View file

@@ -1484,7 +1484,7 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "BrainScript", "BrainScript"
Examples\Image\Classification\GoogLeNet\BN-Inception\BrainScript\InceptionLayers.bs = Examples\Image\Classification\GoogLeNet\BN-Inception\BrainScript\InceptionLayers.bs
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTKLibraryCPPEvalExamplesTest", "Tests\EndToEndTests\EvalClientTests\CNTKLibraryCPPEvalExamplesTest\CNTKLibraryCPPEvalExamplesTest.vcxproj.vcxproj", "{D771A06D-CC25-4582-B5CD-D2A4782BB005}"
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTKLibraryCPPEvalExamplesTest", "Tests\EndToEndTests\EvalClientTests\CNTKLibraryCPPEvalExamplesTest\CNTKLibraryCPPEvalExamplesTest.vcxproj", "{D771A06D-CC25-4582-B5CD-D2A4782BB005}"
ProjectSection(ProjectDependencies) = postProject
{91973E60-A7BE-4C86-8FDB-59C88A0B3715} = {91973E60-A7BE-4C86-8FDB-59C88A0B3715}
{E5606ECE-48CA-4464-BB12-09D81D02B9EF} = {E5606ECE-48CA-4464-BB12-09D81D02B9EF}

View file

@@ -0,0 +1,20 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
// CNTKLibraryCPPEvalCPUOnlyExamples.cpp : Sample application shows how to evaluate a model using CNTK V2 API.
//
#include <stdio.h>
void MultiThreadsEvaluation(bool);
int main()
{
fprintf(stderr, "\n##### Run CNTKLibraryCPPEvalCPUOnlyExamples on CPU. #####\n");
MultiThreadsEvaluation(false);
fprintf(stderr, "Evaluation complete.\n");
fflush(stderr);
}

View file

@@ -1,20 +1,27 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<ItemGroup>
<ClCompile Include="CNTKLibraryCPPEvalExamples.cpp" />
<ClCompile Include="CNTKLibraryCPPEvalCPUOnlyExamples.cpp" />
<ClCompile Include="EvalMultithreads.cpp" />
</ItemGroup>
<ItemGroup>
<None Include="packages.config" />
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{D771A06D-CC25-4582-B5CD-D2A4782BB005}</ProjectGuid>
<Keyword>Win32Proj</Keyword>
<RootNamespace>CNTKLibraryCPPEvalExamples</RootNamespace>
<ProjectName>CNTKLibraryCPPEvalExamples</ProjectName>
<RootNamespace>CNTKLibraryCPPEvalCPUOnlyExamples</RootNamespace>
<ProjectName>CNTKLibraryCPPEvalCPUOnlyExamples</ProjectName>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
@@ -24,6 +31,13 @@
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>v140</PlatformToolset>
<WholeProgramOptimization>false</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
</ImportGroup>
@@ -31,12 +45,14 @@
<PropertyGroup>
<OutDir>$(SolutionDir)..\..\$(Platform)\$(ProjectName).$(Configuration)\</OutDir>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LinkIncremental>false</LinkIncremental>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level4</WarningLevel>
<PrecompiledHeader>NotUsing</PrecompiledHeader>
<PreprocessorDefinitions>WIN32;_CONSOLE;UNICODE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(SolutionDir)..\..\Include</AdditionalIncludeDirectories>
<TreatWarningAsError>true</TreatWarningAsError>
<SDLCheck>true</SDLCheck>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
@@ -45,8 +61,6 @@
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalLibraryDirectories>$(SolutionDir)\..\..\cntk</AdditionalLibraryDirectories>
<AdditionalDependencies>CNTKLibrary-2.0.lib;%(AdditionalDependencies)</AdditionalDependencies>
<OutputFile>$(OutDir)$(TargetName)$(TargetExt)</OutputFile>
</Link>
</ItemDefinitionGroup>
@@ -55,7 +69,6 @@
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>NDEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
<AdditionalOptions>/d2Zi+ %(AdditionalOptions)</AdditionalOptions>
<RuntimeLibrary Condition="'$(Configuration)|$(Platform)'=='Release|x64'">MultiThreaded</RuntimeLibrary>
@@ -66,7 +79,32 @@
<OptimizeReferences>true</OptimizeReferences>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<ClCompile>
<Optimization>Disabled</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
<AdditionalOptions>/d2Zi+ %(AdditionalOptions)</AdditionalOptions>
<RuntimeLibrary Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">MultiThreadedDebug</RuntimeLibrary>
<BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
<MinimalRebuild>false</MinimalRebuild>
<DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
</Link>
</ItemDefinitionGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="..\packages\CNTK.CPUOnly.2.0-beta11\build\native\CNTK.CPUOnly.targets" Condition="Exists('..\packages\CNTK.CPUOnly.2.0-beta11\build\native\CNTK.CPUOnly.targets')" />
</ImportGroup>
<Target Name="EnsureNuGetPackageBuildImports" BeforeTargets="PrepareForBuild">
<PropertyGroup>
<ErrorText>This project references NuGet package(s) that are missing on this computer. Use NuGet Package Restore to download them. For more information, see http://go.microsoft.com/fwlink/?LinkID=322105. The missing file is {0}.</ErrorText>
</PropertyGroup>
<Error Condition="!Exists('..\packages\CNTK.CPUOnly.2.0-beta11\build\native\CNTK.CPUOnly.targets')" Text="$([System.String]::Format('$(ErrorText)', '..\packages\CNTK.CPUOnly.2.0-beta11\build\native\CNTK.CPUOnly.targets'))" />
</Target>
</Project>

View file

@@ -15,11 +15,14 @@
</Filter>
</ItemGroup>
<ItemGroup>
<ClCompile Include="CNTKLibraryCPPEvalExamples.cpp">
<ClCompile Include="CNTKLibraryCPPEvalCPUOnlyExamples.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="EvalMultithreads.cpp">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<None Include="packages.config" />
</ItemGroup>
</Project>

View file

@@ -0,0 +1,4 @@
<?xml version="1.0" encoding="utf-8"?>
<packages>
<package id="CNTK.CPUOnly" version="2.0-beta11" targetFramework="native" />
</packages>

View file

@@ -1,30 +0,0 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
// CNTKLibraryCPPevalExamples.cpp : Sample application shows how to evaluate a model using CNTK V2 API.
//
#include <stdio.h>
// define GPU_AVAILABLE, if you want to run evaluation on a GPU device. You also need CNTK GPU binaries.
// undefine GPU_AVAILABLE, if you want to run evaluation on a CPU device.
// #define GPU_AVAILABLE
void MultiThreadsEvaluation(bool);
int main()
{
#ifdef GPU_AVAILABLE
fprintf(stderr, "\n##### Run eval on GPU device. #####\n");
MultiThreadsEvaluation(true);
#else
fprintf(stderr, "\n##### Run eval on CPU device. #####\n");
MultiThreadsEvaluation(false);
#endif
fprintf(stderr, "Evaluation complete.\n");
fflush(stderr);
}

View file

@@ -0,0 +1,20 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
// CNTKLibraryCPPEvalGPUExamples.cpp : Sample application shows how to evaluate a model using CNTK V2 API.
//
#include <stdio.h>
void MultiThreadsEvaluation(bool);
int main()
{
fprintf(stderr, "\n##### Run CNTKLibraryCPPEvalGPUExamples on CPU and GPU. #####\n");
MultiThreadsEvaluation(true);
fprintf(stderr, "Evaluation complete.\n");
fflush(stderr);
}

View file

@@ -0,0 +1,110 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\CNTKLibraryCPPEvalCPUOnlyExamples\EvalMultithreads.cpp" />
<ClCompile Include="CNTKLibraryCPPEvalGPUExamples.cpp" />
</ItemGroup>
<ItemGroup>
<None Include="packages.config" />
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{13489884-3A6A-4023-8CF1-D8C78DDAF952}</ProjectGuid>
<Keyword>Win32Proj</Keyword>
<RootNamespace>CNTKLibraryCPPEvalGPUExamples</RootNamespace>
<ProjectName>CNTKLibraryCPPEvalGPUExamples</ProjectName>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>v140</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>v140</PlatformToolset>
<WholeProgramOptimization>false</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<OutDir>$(SolutionDir)..\..\$(Platform)\$(ProjectName).$(Configuration)\</OutDir>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LinkIncremental>false</LinkIncremental>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level4</WarningLevel>
<PrecompiledHeader>NotUsing</PrecompiledHeader>
<PreprocessorDefinitions>WIN32;_CONSOLE;UNICODE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<TreatWarningAsError>true</TreatWarningAsError>
<SDLCheck>true</SDLCheck>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<FloatingPointModel>Fast</FloatingPointModel>
<OpenMPSupport>true</OpenMPSupport>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<OutputFile>$(OutDir)$(TargetName)$(TargetExt)</OutputFile>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
<AdditionalOptions>/d2Zi+ %(AdditionalOptions)</AdditionalOptions>
<RuntimeLibrary Condition="'$(Configuration)|$(Platform)'=='Release|x64'">MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<ClCompile>
<Optimization>Disabled</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
<AdditionalOptions>/d2Zi+ %(AdditionalOptions)</AdditionalOptions>
<RuntimeLibrary Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">MultiThreadedDebug</RuntimeLibrary>
<BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
<MinimalRebuild>false</MinimalRebuild>
<DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
</Link>
</ItemDefinitionGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="..\packages\CNTK.GPU.2.0-beta11\build\native\CNTK.GPU.targets" Condition="Exists('..\packages\CNTK.GPU.2.0-beta11\build\native\CNTK.GPU.targets')" />
</ImportGroup>
<Target Name="EnsureNuGetPackageBuildImports" BeforeTargets="PrepareForBuild">
<PropertyGroup>
<ErrorText>This project references NuGet package(s) that are missing on this computer. Use NuGet Package Restore to download them. For more information, see http://go.microsoft.com/fwlink/?LinkID=322105. The missing file is {0}.</ErrorText>
</PropertyGroup>
<Error Condition="!Exists('..\packages\CNTK.GPU.2.0-beta11\build\native\CNTK.GPU.targets')" Text="$([System.String]::Format('$(ErrorText)', '..\packages\CNTK.GPU.2.0-beta11\build\native\CNTK.GPU.targets'))" />
</Target>
</Project>

View file

@@ -0,0 +1,28 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<Filter Include="Source Files">
<UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
<Extensions>cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
</Filter>
<Filter Include="Header Files">
<UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
<Extensions>h;hh;hpp;hxx;hm;inl;inc;xsd</Extensions>
</Filter>
<Filter Include="Resource Files">
<UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
<Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
</Filter>
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\CNTKLibraryCPPEvalCPUOnlyExamples\EvalMultithreads.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="CNTKLibraryCPPEvalGPUExamples.cpp">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<None Include="packages.config" />
</ItemGroup>
</Project>

View file

@@ -0,0 +1,4 @@
<?xml version="1.0" encoding="utf-8"?>
<packages>
<package id="CNTK.GPU" version="2.0-beta11" targetFramework="native" />
</packages>

View file

@@ -39,7 +39,7 @@
<CodeAnalysisRuleSet>MinimumRecommendedRules.ruleset</CodeAnalysisRuleSet>
</PropertyGroup>
<ItemGroup>
<Reference Include="CNTKLibraryManaged-2.0, Version=1.0.0.0, Culture=neutral, processorArchitecture=AMD64">
<Reference Include="CNTKLibraryManaged-2.0, Version=1.0.0.0, Culture=neutral, PublicKeyToken=21fff2ec8197defe, processorArchitecture=AMD64">
<HintPath>..\packages\CNTK.CPUOnly.2.0-beta11\lib\net45\x64\CNTKLibraryManaged-2.0.dll</HintPath>
<Private>True</Private>
</Reference>
@@ -72,4 +72,4 @@
<Target Name="AfterBuild">
</Target>
-->
</Project>
</Project>

View file

@@ -1,4 +1,4 @@
<?xml version="1.0" encoding="utf-8"?>
<packages>
<package id="CNTK.CPUOnly" version="2.0-beta11" targetFramework="net45" />
</packages>
</packages>

View file

@@ -39,7 +39,7 @@
<CodeAnalysisRuleSet>MinimumRecommendedRules.ruleset</CodeAnalysisRuleSet>
</PropertyGroup>
<ItemGroup>
<Reference Include="CNTKLibraryManaged-2.0, Version=1.0.0.0, Culture=neutral, processorArchitecture=AMD64">
<Reference Include="CNTKLibraryManaged-2.0, Version=1.0.0.0, Culture=neutral, PublicKeyToken=a82c1f3f67b62253, processorArchitecture=AMD64">
<HintPath>..\packages\CNTK.GPU.2.0-beta11\lib\net45\x64\CNTKLibraryManaged-2.0.dll</HintPath>
<Private>True</Private>
</Reference>
@@ -76,4 +76,4 @@
<Target Name="AfterBuild">
</Target>
-->
</Project>
</Project>

View file

@@ -1,4 +1,4 @@
<?xml version="1.0" encoding="utf-8"?>
<packages>
<package id="CNTK.GPU" version="2.0-beta11" targetFramework="net45" />
</packages>
</packages>

View file

@@ -3,19 +3,22 @@ Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 14
VisualStudioVersion = 14.0.25420.1
MinimumVisualStudioVersion = 10.0.40219.1
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTKLibraryCPPEvalExamples", "CNTKLibraryCPPEvalExamples\CNTKLibraryCPPEvalExamples.vcxproj", "{D771A06D-CC25-4582-B5CD-D2A4782BB005}"
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTKLibraryCPPEvalCPUOnlyExamples", "CNTKLibraryCPPEvalCPUOnlyExamples\CNTKLibraryCPPEvalCPUOnlyExamples.vcxproj", "{D771A06D-CC25-4582-B5CD-D2A4782BB005}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "CNTKLibraryCSEvalCPUOnlyExamples", "CNTKLibraryCSEvalCPUOnlyExamples\CNTKLibraryCSEvalCPUOnlyExamples.csproj", "{8AAD7322-10B1-48C3-9BC7-005A7910C5E6}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "CNTKLibraryCSEvalGPUExamples", "CNTKLibraryCSEvalGPUExamples\CNTKLibraryCSEvalGPUExamples.csproj", "{307E5BAC-DA03-45D2-ADEC-FE6620090109}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTKLibraryCPPEvalGPUExamples", "CNTKLibraryCPPEvalGPUExamples\CNTKLibraryCPPEvalGPUExamples.vcxproj", "{13489884-3A6A-4023-8CF1-D8C78DDAF952}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{D771A06D-CC25-4582-B5CD-D2A4782BB005}.Debug|x64.ActiveCfg = Release|x64
{D771A06D-CC25-4582-B5CD-D2A4782BB005}.Debug|x64.ActiveCfg = Debug|x64
{D771A06D-CC25-4582-B5CD-D2A4782BB005}.Debug|x64.Build.0 = Debug|x64
{D771A06D-CC25-4582-B5CD-D2A4782BB005}.Release|x64.ActiveCfg = Release|x64
{D771A06D-CC25-4582-B5CD-D2A4782BB005}.Release|x64.Build.0 = Release|x64
{8AAD7322-10B1-48C3-9BC7-005A7910C5E6}.Debug|x64.ActiveCfg = Debug|x64
@@ -26,6 +29,10 @@ Global
{307E5BAC-DA03-45D2-ADEC-FE6620090109}.Debug|x64.Build.0 = Debug|x64
{307E5BAC-DA03-45D2-ADEC-FE6620090109}.Release|x64.ActiveCfg = Release|x64
{307E5BAC-DA03-45D2-ADEC-FE6620090109}.Release|x64.Build.0 = Release|x64
{13489884-3A6A-4023-8CF1-D8C78DDAF952}.Debug|x64.ActiveCfg = Debug|x64
{13489884-3A6A-4023-8CF1-D8C78DDAF952}.Debug|x64.Build.0 = Debug|x64
{13489884-3A6A-4023-8CF1-D8C78DDAF952}.Release|x64.ActiveCfg = Release|x64
{13489884-3A6A-4023-8CF1-D8C78DDAF952}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE

View file

@@ -1,6 +1,10 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
@@ -20,17 +24,30 @@
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Label="Configuration" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>v140</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
</ImportGroup>
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<LinkIncremental>false</LinkIncremental>
<OutDir>$(SolutionDir)..\..\$(Platform)\$(ProjectName).$(Configuration)\</OutDir>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<OutDir>$(SolutionDir)..\..\$(Platform)\$(ProjectName).$(Configuration)\</OutDir>
<LinkIncremental>false</LinkIncremental>
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<ClCompile>
<WarningLevel>Level4</WarningLevel>
@@ -38,29 +55,65 @@
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(SolutionDir)..\..\Include</AdditionalIncludeDirectories>
<PreprocessorDefinitions>WIN32;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<TreatWarningAsError>true</TreatWarningAsError>
<SDLCheck>true</SDLCheck>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<FloatingPointModel>Fast</FloatingPointModel>
<OpenMPSupport>true</OpenMPSupport>
<UseDebugLibraries>false</UseDebugLibraries>
<RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<AdditionalLibraryDirectories>$(SolutionDir)\..\..\cntk</AdditionalLibraryDirectories>
<AdditionalDependencies>EvalDll.lib;%(AdditionalDependencies)</AdditionalDependencies>
<OutputFile>$(OutDir)$(TargetName)$(TargetExt)</OutputFile>
<Profile>true</Profile>
<Profile>false</Profile>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<ClCompile>
<WarningLevel>Level4</WarningLevel>
<PrecompiledHeader>NotUsing</PrecompiledHeader>
<Optimization>Disabled</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>WIN32;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<TreatWarningAsError>true</TreatWarningAsError>
<SDLCheck>true</SDLCheck>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<FloatingPointModel>Fast</FloatingPointModel>
<OpenMPSupport>true</OpenMPSupport>
<UseDebugLibraries>true</UseDebugLibraries>
<RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
<BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
<DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<OutputFile>$(OutDir)$(TargetName)$(TargetExt)</OutputFile>
<Profile>false</Profile>
</Link>
</ItemDefinitionGroup>
<ItemGroup>
<ClCompile Include="CPPEvalClient.cpp" />
</ItemGroup>
<ItemGroup>
<None Include="packages.config" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="..\packages\Microsoft.Research.CNTK.CpuEval-mkl.2.0-beta11\build\native\Microsoft.Research.CNTK.CpuEval-mkl.targets" Condition="Exists('..\packages\Microsoft.Research.CNTK.CpuEval-mkl.2.0-beta11\build\native\Microsoft.Research.CNTK.CpuEval-mkl.targets')" />
</ImportGroup>
</Project>
<Target Name="EnsureNuGetPackageBuildImports" BeforeTargets="PrepareForBuild">
<PropertyGroup>
<ErrorText>This project references NuGet package(s) that are missing on this computer. Use NuGet Package Restore to download them. For more information, see http://go.microsoft.com/fwlink/?LinkID=322105. The missing file is {0}.</ErrorText>
</PropertyGroup>
<Error Condition="!Exists('..\packages\Microsoft.Research.CNTK.CpuEval-mkl.2.0-beta11\build\native\Microsoft.Research.CNTK.CpuEval-mkl.targets')" Text="$([System.String]::Format('$(ErrorText)', '..\packages\Microsoft.Research.CNTK.CpuEval-mkl.2.0-beta11\build\native\Microsoft.Research.CNTK.CpuEval-mkl.targets'))" />
</Target>
</Project>

View file

@@ -19,4 +19,7 @@
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
</Project>
<ItemGroup>
<None Include="packages.config" />
</ItemGroup>
</Project>

View file

@@ -0,0 +1,4 @@
<?xml version="1.0" encoding="utf-8"?>
<packages>
<package id="Microsoft.Research.CNTK.CpuEval-mkl" version="2.0-beta11" targetFramework="native" />
</packages>

View file

@@ -1,6 +1,10 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
@@ -20,17 +24,30 @@
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>v140</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
</ImportGroup>
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<LinkIncremental>false</LinkIncremental>
<OutDir>$(SolutionDir)..\..\$(Platform)\$(ProjectName).$(Configuration)\</OutDir>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<OutDir>$(SolutionDir)..\..\$(Platform)\$(ProjectName).$(Configuration)\</OutDir>
<LinkIncremental>false</LinkIncremental>
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<ClCompile>
<WarningLevel>Level4</WarningLevel>
@@ -38,29 +55,63 @@
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(SolutionDir)..\..\Include</AdditionalIncludeDirectories>
<PreprocessorDefinitions>WIN32;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<TreatWarningAsError>true</TreatWarningAsError>
<SDLCheck>true</SDLCheck>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<FloatingPointModel>Fast</FloatingPointModel>
<OpenMPSupport>true</OpenMPSupport>
<RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<AdditionalLibraryDirectories>$(SolutionDir)\..\..\cntk</AdditionalLibraryDirectories>
<AdditionalDependencies>EvalDll.lib;%(AdditionalDependencies)</AdditionalDependencies>
<OutputFile>$(OutDir)$(TargetName)$(TargetExt)</OutputFile>
<Profile>true</Profile>
<Profile>false</Profile>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<ClCompile>
<WarningLevel>Level4</WarningLevel>
<PrecompiledHeader>NotUsing</PrecompiledHeader>
<Optimization>Disabled</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>WIN32;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<TreatWarningAsError>true</TreatWarningAsError>
<SDLCheck>true</SDLCheck>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<FloatingPointModel>Fast</FloatingPointModel>
<OpenMPSupport>true</OpenMPSupport>
<RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
<BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
<DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<OutputFile>$(OutDir)$(TargetName)$(TargetExt)</OutputFile>
<Profile>false</Profile>
</Link>
</ItemDefinitionGroup>
<ItemGroup>
<ClCompile Include="CPPEvalExtendedClient.cpp" />
</ItemGroup>
<ItemGroup>
<None Include="packages.config" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="..\packages\Microsoft.Research.CNTK.CpuEval-mkl.2.0-beta11\build\native\Microsoft.Research.CNTK.CpuEval-mkl.targets" Condition="Exists('..\packages\Microsoft.Research.CNTK.CpuEval-mkl.2.0-beta11\build\native\Microsoft.Research.CNTK.CpuEval-mkl.targets')" />
</ImportGroup>
</Project>
<Target Name="EnsureNuGetPackageBuildImports" BeforeTargets="PrepareForBuild">
<PropertyGroup>
<ErrorText>This project references NuGet package(s) that are missing on this computer. Use NuGet Package Restore to download them. For more information, see http://go.microsoft.com/fwlink/?LinkID=322105. The missing file is {0}.</ErrorText>
</PropertyGroup>
<Error Condition="!Exists('..\packages\Microsoft.Research.CNTK.CpuEval-mkl.2.0-beta11\build\native\Microsoft.Research.CNTK.CpuEval-mkl.targets')" Text="$([System.String]::Format('$(ErrorText)', '..\packages\Microsoft.Research.CNTK.CpuEval-mkl.2.0-beta11\build\native\Microsoft.Research.CNTK.CpuEval-mkl.targets'))" />
</Target>
</Project>

View file

@@ -19,4 +19,7 @@
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<None Include="packages.config" />
</ItemGroup>
</Project>

View file

@@ -0,0 +1,4 @@
<?xml version="1.0" encoding="utf-8"?>
<packages>
<package id="Microsoft.Research.CNTK.CpuEval-mkl" version="2.0-beta11" targetFramework="native" />
</packages>

View file

@@ -95,4 +95,4 @@
</PropertyGroup>
<Error Condition="!Exists('..\packages\Microsoft.Research.CNTK.CpuEval-mkl.2.0-beta11\build\net45\Microsoft.Research.CNTK.CpuEval-mkl.targets')" Text="$([System.String]::Format('$(ErrorText)', '..\packages\Microsoft.Research.CNTK.CpuEval-mkl.2.0-beta11\build\net45\Microsoft.Research.CNTK.CpuEval-mkl.targets'))" />
</Target>
</Project>
</Project>

View file

@@ -1,4 +1,4 @@
<?xml version="1.0" encoding="utf-8"?>
<packages>
<package id="Microsoft.Research.CNTK.CpuEval-mkl" version="2.0-beta11" targetFramework="net45" />
</packages>
</packages>

View file

@@ -15,14 +15,16 @@ Global
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{C81CE839-184C-42C7-BB1C-9D0ABA17078D}.Debug|x64.ActiveCfg = Release|x64
{C81CE839-184C-42C7-BB1C-9D0ABA17078D}.Debug|x64.ActiveCfg = Debug|x64
{C81CE839-184C-42C7-BB1C-9D0ABA17078D}.Debug|x64.Build.0 = Debug|x64
{C81CE839-184C-42C7-BB1C-9D0ABA17078D}.Release|x64.ActiveCfg = Release|x64
{C81CE839-184C-42C7-BB1C-9D0ABA17078D}.Release|x64.Build.0 = Release|x64
{92CCF4B9-BFED-4914-901A-CF1327B1A02D}.Debug|x64.ActiveCfg = Debug|x64
{92CCF4B9-BFED-4914-901A-CF1327B1A02D}.Debug|x64.Build.0 = Debug|x64
{92CCF4B9-BFED-4914-901A-CF1327B1A02D}.Release|x64.ActiveCfg = Release|x64
{92CCF4B9-BFED-4914-901A-CF1327B1A02D}.Release|x64.Build.0 = Release|x64
{93ECB70B-FDDD-44B4-BD6A-D63E094C704B}.Debug|x64.ActiveCfg = Release|x64
{93ECB70B-FDDD-44B4-BD6A-D63E094C704B}.Debug|x64.ActiveCfg = Debug|x64
{93ECB70B-FDDD-44B4-BD6A-D63E094C704B}.Debug|x64.Build.0 = Debug|x64
{93ECB70B-FDDD-44B4-BD6A-D63E094C704B}.Release|x64.ActiveCfg = Release|x64
{93ECB70B-FDDD-44B4-BD6A-D63E094C704B}.Release|x64.Build.0 = Release|x64
EndGlobalSection

View file

@@ -16,6 +16,7 @@ from cntk.utils import *
from cntk.ops import *
from cntk.distributed import data_parallel_distributed_learner, Communicator
from cntk.io import ImageDeserializer, MinibatchSource, StreamDef, StreamDefs, FULL_DATA_SWEEP
import cntk.io.transforms as xforms
from cntk.layers import Placeholder, Block, Convolution2D, Activation, MaxPooling, Dense, Dropout, default_options, Sequential
from cntk.initializer import normal
@@ -41,15 +42,15 @@ def create_image_mb_source(map_file, is_training, total_number_of_samples):
transforms = []
if is_training:
transforms += [
ImageDeserializer.crop(crop_type='randomside', side_ratio=0.88671875, jitter_type='uniratio') # train uses jitter
xforms.crop(crop_type='randomside', side_ratio=0.88671875, jitter_type='uniratio') # train uses jitter
]
else:
else:
transforms += [
ImageDeserializer.crop(crop_type='center', side_ratio=0.88671875) # test has no jitter
xforms.crop(crop_type='center', side_ratio=0.88671875) # test has no jitter
]
transforms += [
ImageDeserializer.scale(width=image_width, height=image_height, channels=num_channels, interpolations='linear'),
xforms.scale(width=image_width, height=image_height, channels=num_channels, interpolations='linear'),
]
# deserializer
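
The recurring change in this file (and in the other Python examples below) replaces the static ImageDeserializer.crop/scale/mean helpers with the standalone cntk.io.transforms module, imported as xforms. A minimal sketch of the new-style call follows; the 224x224x3 input size is an illustrative assumption, since the scripts take these values from their image_width/image_height/num_channels constants:

import cntk.io.transforms as xforms

# Sketch of the new transform API (224x224x3 is an assumed input size).
transforms = [
    xforms.crop(crop_type='randomside', side_ratio=0.88671875, jitter_type='uniratio'),  # random crop with jitter, training only
    xforms.scale(width=224, height=224, channels=3, interpolations='linear'),            # resize to the network input size
]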
@@ -57,27 +58,27 @@ def create_image_mb_source(map_file, is_training, total_number_of_samples):
ImageDeserializer(map_file, StreamDefs(
features = StreamDef(field='image', transforms=transforms), # first column in map file is referred to as 'image'
labels = StreamDef(field='label', shape=num_classes))), # and second as 'label'
randomize = is_training,
randomize = is_training,
epoch_size=total_number_of_samples,
multithreaded_deserializer = True)
# Local Response Normalization layer. See Section 3.3 of the paper:
# https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf
# The mathematical equation is:
# Local Response Normalization layer. See Section 3.3 of the paper:
# https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf
# The mathematical equation is:
# b_{x,y}^i=a_{x,y}^i/(k+\alpha\sum_{j=max(0,i-n)}^{min(N-1, i+n)}(a_{x,y}^j)^2)^\beta
# where a_{x,y}^i is the activity of a neuron comoputed by applying kernel i at position (x,y)
# N is the total number of kernals, n is half normalization width.
def LocalResponseNormalization(k, n, alpha, beta, name=''):
x = cntk.blocks.Placeholder(name='lrn_arg')
x2 = cntk.ops.square(x)
# reshape to insert a fake singleton reduction dimension after the 3th axis (channel axis). Note Python axis order and BrainScript are reversed.
# N is the total number of kernals, n is half normalization width.
def LocalResponseNormalization(k, n, alpha, beta, name=''):
x = cntk.blocks.Placeholder(name='lrn_arg')
x2 = cntk.ops.square(x)
# reshape to insert a fake singleton reduction dimension after the 3th axis (channel axis). Note Python axis order and BrainScript are reversed.
x2s = cntk.ops.reshape(x2, (1, cntk.InferredDimension), 0, 1)
W = cntk.ops.constant(alpha/(2*n+1), (1,2*n+1,1,1), name='W')
# 3D convolution with a filter that has a non 1-size only in the 3rd axis, and does not reduce since the reduction dimension is fake and 1
y = cntk.ops.convolution (W, x2s)
# reshape back to remove the fake singleton reduction dimension
b = cntk.ops.reshape(y, cntk.InferredDimension, 0, 2)
den = cntk.ops.exp(beta * cntk.ops.log(k + b))
den = cntk.ops.exp(beta * cntk.ops.log(k + b))
apply_x = cntk.ops.element_divide(x, den)
return cntk.blocks.Block(apply_x, 'LocalResponseNormalization', name, make_block=True)
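
Since W is filled with alpha/(2*n+1), the block computes the denominator as k plus alpha times the mean of the squared activations over a window of 2n+1 channels. A NumPy reference of the same computation, a sketch that assumes zero padding at the channel borders like the auto-padded convolution above:

import numpy as np

def lrn_reference(a, k, n, alpha, beta):
    # a: activations of shape (C, H, W); mirrors the LRN block above.
    C = a.shape[0]
    sq = a ** 2
    out = np.empty_like(a)
    for i in range(C):
        lo, hi = max(0, i - n), min(C, i + n + 1)  # window of up to 2n+1 channels
        den = (k + alpha / (2 * n + 1) * sq[lo:hi].sum(axis=0)) ** beta
        out[i] = a[i] / den
    return out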
@@ -89,35 +90,35 @@ def create_alexnet():
label_var = input_variable((num_classes))
# apply model to input
# remove mean value
# remove mean value
input = minus(feature_var, constant(114), name='mean_removed_input')
with default_options(activation=None, pad=True, bias=True):
z = Sequential([
# we separate Convolution and ReLU to name the output for feature extraction (usually before ReLU)
Convolution2D((11,11), 96, init=normal(0.01), pad=False, strides=(4,4), name='conv1'),
Activation(activation=relu, name='relu1'),
# we separate Convolution and ReLU to name the output for feature extraction (usually before ReLU)
Convolution2D((11,11), 96, init=normal(0.01), pad=False, strides=(4,4), name='conv1'),
Activation(activation=relu, name='relu1'),
LocalResponseNormalization(1.0, 2, 0.0001, 0.75, name='norm1'),
MaxPooling((3,3), (2,2), name='pool1'),
Convolution2D((5,5), 192, init=normal(0.01), init_bias=0.1, name='conv2'),
Activation(activation=relu, name='relu2'),
Convolution2D((5,5), 192, init=normal(0.01), init_bias=0.1, name='conv2'),
Activation(activation=relu, name='relu2'),
LocalResponseNormalization(1.0, 2, 0.0001, 0.75, name='norm2'),
MaxPooling((3,3), (2,2), name='pool2'),
Convolution2D((3,3), 384, init=normal(0.01), name='conv3'),
Activation(activation=relu, name='relu3'),
Convolution2D((3,3), 384, init=normal(0.01), init_bias=0.1, name='conv4'),
Activation(activation=relu, name='relu4'),
Convolution2D((3,3), 256, init=normal(0.01), init_bias=0.1, name='conv5'),
Activation(activation=relu, name='relu5'),
MaxPooling((3,3), (2,2), name='pool5'),
Dense(4096, init=normal(0.005), init_bias=0.1, name='fc6'),
Activation(activation=relu, name='relu6'),
Dropout(0.5, name='drop6'),
Dense(4096, init=normal(0.005), init_bias=0.1, name='fc7'),
Activation(activation=relu, name='relu7'),
Convolution2D((3,3), 384, init=normal(0.01), name='conv3'),
Activation(activation=relu, name='relu3'),
Convolution2D((3,3), 384, init=normal(0.01), init_bias=0.1, name='conv4'),
Activation(activation=relu, name='relu4'),
Convolution2D((3,3), 256, init=normal(0.01), init_bias=0.1, name='conv5'),
Activation(activation=relu, name='relu5'),
MaxPooling((3,3), (2,2), name='pool5'),
Dense(4096, init=normal(0.005), init_bias=0.1, name='fc6'),
Activation(activation=relu, name='relu6'),
Dropout(0.5, name='drop6'),
Dense(4096, init=normal(0.005), init_bias=0.1, name='fc7'),
Activation(activation=relu, name='relu7'),
Dropout(0.5, name='drop7'),
Dense(num_classes, init=normal(0.01), name='fc8')
])(input)
@@ -134,7 +135,7 @@ def create_alexnet():
'label': label_var,
'ce' : ce,
'pe' : pe,
'pe5': pe5,
'pe5': pe5,
'output': z
}
@@ -145,10 +146,10 @@ def create_trainer(network, epoch_size, num_quantization_bits):
lr_schedule = cntk.learning_rate_schedule(lr_per_mb, unit=cntk.learner.UnitType.minibatch, epoch_size=epoch_size)
mm_schedule = cntk.learner.momentum_schedule(0.9)
l2_reg_weight = 0.0005 # CNTK L2 regularization is per sample, thus same as Caffe
# Create learner
local_learner = cntk.learner.momentum_sgd(network['output'].parameters, lr_schedule, mm_schedule, unit_gain=False, l2_regularization_weight=l2_reg_weight)
# Since we reuse parameter settings (learning rate, momentum) from Caffe, we set unit_gain to False to ensure consistency
# Since we reuse parameter settings (learning rate, momentum) from Caffe, we set unit_gain to False to ensure consistency
parameter_learner = data_parallel_distributed_learner(
local_learner,
num_quantization_bits=num_quantization_bits,
@@ -167,25 +168,25 @@ def train_and_test(network, trainer, train_source, test_source, progress_printer
}
training_session = cntk.training_session(
training_minibatch_source = train_source,
training_minibatch_source = train_source,
trainer = trainer,
model_inputs_to_mb_source_mapping = input_map,
mb_size_schedule = cntk.minibatch_size_schedule(minibatch_size),
progress_printer = progress_printer,
model_inputs_to_mb_source_mapping = input_map,
mb_size_schedule = cntk.minibatch_size_schedule(minibatch_size),
progress_printer = progress_printer,
# checkpoint_frequency = epoch_size,
checkpoint_filename = os.path.join(model_path, model_name),
checkpoint_filename = os.path.join(model_path, model_name),
# save_all_checkpoints = True,
progress_frequency = epoch_size,
cv_source = test_source,
progress_frequency = epoch_size,
cv_source = test_source,
cv_mb_size_schedule = cntk.minibatch_size_schedule(minibatch_size),
# cv_frequency = epoch_size,
restore = restore)
# Train all minibatches
# Train all minibatches
training_session.train()
# Train and evaluate the network.
def alexnet_train_and_eval(train_data, test_data, num_quantization_bits=32, minibatch_size=256, epoch_size = 1281167, max_epochs=112,
def alexnet_train_and_eval(train_data, test_data, num_quantization_bits=32, minibatch_size=256, epoch_size = 1281167, max_epochs=112,
restore=True, log_to_file=None, num_mbs_per_log=None, gen_heartbeat=True):
_cntk_py.set_computation_network_trace_level(0)
@@ -202,10 +203,10 @@ def alexnet_train_and_eval(train_data, test_data, num_quantization_bits=32, mini
train_source = create_image_mb_source(train_data, True, total_number_of_samples=max_epochs * epoch_size)
test_source = create_image_mb_source(test_data, False, total_number_of_samples=FULL_DATA_SWEEP)
train_and_test(network, trainer, train_source, test_source, progress_printer, minibatch_size, epoch_size, restore)
if __name__=='__main__':
parser = argparse.ArgumentParser()
parser.add_argument('-datadir', '--datadir', help='Data directory where the ImageNet dataset is located', required=False, default=data_path)
@@ -233,8 +234,8 @@ if __name__=='__main__':
test_data=os.path.join(data_path, 'val_map.txt')
try:
alexnet_train_and_eval(train_data, test_data,
minibatch_size=args['minibatch_size'],
alexnet_train_and_eval(train_data, test_data,
minibatch_size=args['minibatch_size'],
epoch_size=args['epoch_size'],
num_quantization_bits=args['quantized_bits'],
max_epochs=args['num_epochs'],
@@ -243,4 +244,4 @@ if __name__=='__main__':
num_mbs_per_log=200,
gen_heartbeat=True)
finally:
cntk.distributed.Communicator.finalize()
cntk.distributed.Communicator.finalize()

View file

@@ -32,7 +32,7 @@ TrainConvNet = {
x2s = SplitDimension(x2, 3, 1)
# 3D convolution with a filter that has a non 1-size only in the 3rd axis, and does not reduce since the reduction dimension is fake and 1
W = ParameterTensor{(1:1:2*n+1:1), learningRateMultiplier = 0, initValue = alpha/(2*n+1)}
y = Convolution (W, x2s, (1:1:2*n+1), mapDims = 1, stride = 1, sharing = true, autoPadding = true, lowerPad = 0, upperPad = 0, transpose = false, maxTempMemSizeInSamples = 0)
y = Convolution (W, x2s, (1:1:2*n+1), mapDims = 1, stride = 1, sharing = true, autoPadding = true, lowerPad = 0, upperPad = 0, maxTempMemSizeInSamples = 0)
# reshape back to remove the fake singleton reduction dimension
b = FlattenDimensions(y, 3, 2)
den = Exp (beta .* Log(k + b))

View file

@@ -10,8 +10,9 @@ import math
import numpy as np
import cntk
import _cntk_py
import cntk.io.transforms as xforms
# Paths relative to current python file.
# Paths relative to current python file.
abs_path = os.path.dirname(os.path.abspath(__file__))
data_path = os.path.join(abs_path, "..", "..", "..", "DataSets", "CIFAR-10")
model_path = os.path.join(abs_path, "Models")
@@ -32,11 +33,11 @@ def create_reader(map_file, mean_file, is_training):
transforms = []
if is_training:
transforms += [
cntk.io.ImageDeserializer.crop(crop_type='RandomSide', side_ratio=0.8, jitter_type='uniRatio') # train uses jitter
xforms.crop(crop_type='RandomSide', side_ratio=0.8, jitter_type='uniRatio') # train uses jitter
]
transforms += [
cntk.io.ImageDeserializer.scale(width=image_width, height=image_height, channels=num_channels, interpolations='linear'),
cntk.io.ImageDeserializer.mean(mean_file)
xforms.scale(width=image_width, height=image_height, channels=num_channels, interpolations='linear'),
xforms.mean(mean_file)
]
# deserializer
return cntk.io.MinibatchSource(cntk.io.ImageDeserializer(map_file, cntk.io.StreamDefs(
@@ -44,23 +45,23 @@ def create_reader(map_file, mean_file, is_training):
labels = cntk.io.StreamDef(field='label', shape=num_classes))), # and second as 'label'
randomize=is_training)
# Local Response Normalization layer. See Section 3.3 of the paper:
# https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf
# The mathematical equation is:
# Local Response Normalization layer. See Section 3.3 of the paper:
# https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf
# The mathematical equation is:
# b_{x,y}^i=a_{x,y}^i/(k+\alpha\sum_{j=max(0,i-n)}^{min(N-1, i+n)}(a_{x,y}^j)^2)^\beta
# where a_{x,y}^i is the activity of a neuron comoputed by applying kernel i at position (x,y)
# N is the total number of kernals, n is half normalization width.
def LocalResponseNormalization(k, n, alpha, beta, name=''):
x = cntk.blocks.Placeholder(name='lrn_arg')
x2 = cntk.ops.square(x)
# reshape to insert a fake singleton reduction dimension after the 3th axis (channel axis). Note Python axis order and BrainScript are reversed.
# N is the total number of kernals, n is half normalization width.
def LocalResponseNormalization(k, n, alpha, beta, name=''):
x = cntk.blocks.Placeholder(name='lrn_arg')
x2 = cntk.ops.square(x)
# reshape to insert a fake singleton reduction dimension after the 3th axis (channel axis). Note Python axis order and BrainScript are reversed.
x2s = cntk.ops.reshape(x2, (1, cntk.InferredDimension), 0, 1)
W = cntk.ops.constant(alpha/(2*n+1), (1,2*n+1,1,1), name='W')
# 3D convolution with a filter that has a non 1-size only in the 3rd axis, and does not reduce since the reduction dimension is fake and 1
y = cntk.ops.convolution (W, x2s)
# reshape back to remove the fake singleton reduction dimension
b = cntk.ops.reshape(y, cntk.InferredDimension, 0, 2)
den = cntk.ops.exp(beta * cntk.ops.log(k + b))
den = cntk.ops.exp(beta * cntk.ops.log(k + b))
apply_x = cntk.ops.element_divide(x, den)
return cntk.blocks.Block(apply_x, 'LocalResponseNormalization', name, make_block=True)
@@ -75,18 +76,18 @@ def convnetlrn_cifar10_dataaug(reader_train, reader_test, epoch_size=50000, max_
# apply model to input
scaled_input = cntk.ops.element_times(cntk.ops.constant(0.00390625), input_var)
with cntk.layers.default_options (activation=cntk.ops.relu, pad=True):
with cntk.layers.default_options (activation=cntk.ops.relu, pad=True):
z = cntk.models.Sequential([
cntk.models.For(range(2), lambda : [
cntk.layers.Convolution2D((3,3), 64),
cntk.layers.Convolution2D((3,3), 64),
cntk.layers.Convolution2D((3,3), 64),
cntk.layers.Convolution2D((3,3), 64),
LocalResponseNormalization (1.0, 4, 0.001, 0.75),
cntk.layers.MaxPooling((3,3), (2,2))
]),
]),
cntk.models.For(range(2), lambda i: [
cntk.layers.Dense([256,128][i]),
cntk.layers.Dense([256,128][i]),
cntk.layers.Dropout(0.5)
]),
]),
cntk.layers.Dense(num_classes, activation=None)
])(scaled_input)
@@ -103,7 +104,7 @@ def convnetlrn_cifar10_dataaug(reader_train, reader_test, epoch_size=50000, max_
mm_time_constant = [0]*20 + [600]*20 + [1200]
mm_schedule = cntk.learner.momentum_as_time_constant_schedule(mm_time_constant, epoch_size=epoch_size)
l2_reg_weight = 0.002
# trainer object
learner = cntk.learner.momentum_sgd(z.parameters, lr_schedule, mm_schedule,
unit_gain = True,
@@ -117,7 +118,7 @@ def convnetlrn_cifar10_dataaug(reader_train, reader_test, epoch_size=50000, max_
}
cntk.utils.log_number_of_parameters(z) ; print()
progress_printer = cntk.utils.ProgressPrinter(tag='Training')
progress_printer = cntk.utils.ProgressPrinter(tag='Training', num_epochs=max_epochs)
# perform model training
for epoch in range(max_epochs): # loop over epochs
@@ -130,7 +131,7 @@ def convnetlrn_cifar10_dataaug(reader_train, reader_test, epoch_size=50000, max_
progress_printer.epoch_summary(with_metric=True)
z.save(os.path.join(model_path, "ConvNet_CIFAR10_DataAug_{}.dnn".format(epoch)))
### Evaluation action
epoch_size = 10000
minibatch_size = 16

View file

@@ -84,10 +84,10 @@ def convnet_cifar10(debug_output=False):
}
cntk.utils.log_number_of_parameters(z) ; print()
progress_printer = cntk.utils.ProgressPrinter(tag='Training')
max_epochs = 30
progress_printer = cntk.utils.ProgressPrinter(tag='Training', num_epochs=max_epochs)
# Get minibatches of images to train with and perform model training
max_epochs = 30
for epoch in range(max_epochs): # loop over epochs
sample_count = 0
while sample_count < epoch_size: # loop over minibatches in the epoch

View file

@@ -8,6 +8,9 @@ from __future__ import print_function
import os
import math
import numpy as np
import cntk
import _cntk_py
import cntk.io.transforms as xforms
from cntk.layers import Convolution2D, MaxPooling, AveragePooling, Dropout, BatchNormalization, Dense, default_options, Placeholder, identity, Sequential, For
from cntk.layers.typing import *
@@ -47,11 +50,11 @@ def create_reader(map_file, mean_file, is_training):
transforms = []
if is_training:
transforms += [
ImageDeserializer.crop(crop_type='randomside', side_ratio=0.8, jitter_type='uniratio') # train uses jitter
xforms.crop(crop_type='randomside', side_ratio=0.8, jitter_type='uniratio') # train uses jitter
]
transforms += [
ImageDeserializer.scale(width=image_width, height=image_height, channels=num_channels, interpolations='linear'),
ImageDeserializer.mean(mean_file)
xforms.scale(width=image_width, height=image_height, channels=num_channels, interpolations='linear'),
xforms.mean(mean_file)
]
# deserializer
return MinibatchSource(ImageDeserializer(map_file, StreamDefs(
@@ -142,6 +145,10 @@ def train_and_evaluate(reader, reader_test, model, epoch_size=50000, max_epochs=
# TODO: we should be done here
#return metric_numer/metric_denom
progress_printer.epoch_summary(with_metric=True)
z.save(os.path.join(model_path, "ConvNet_CIFAR10_DataAug_{}.dnn".format(epoch)))
### Evaluation action
# evaluate with current Trainer instance; just to make sure we save and load the model correctly and BN works now --TODO: delete once confirmed
epoch_size = 10000

View file

@@ -11,6 +11,7 @@ import argparse
import numpy as np
import cntk
import _cntk_py
import cntk.io.transforms as xforms
# default Paths relative to current python file.
abs_path = os.path.dirname(os.path.abspath(__file__))
@@ -32,12 +33,12 @@ def create_image_mb_source(map_file, mean_file, train, total_number_of_samples):
transforms = []
if train:
transforms += [
cntk.io.ImageDeserializer.crop(crop_type='randomside', side_ratio=0.8, jitter_type='uniratio') # train uses jitter
xforms.crop(crop_type='randomside', side_ratio=0.8, jitter_type='uniratio') # train uses jitter
]
transforms += [
cntk.io.ImageDeserializer.scale(width=image_width, height=image_height, channels=num_channels, interpolations='linear'),
cntk.io.ImageDeserializer.mean(mean_file)
xforms.scale(width=image_width, height=image_height, channels=num_channels, interpolations='linear'),
xforms.mean(mean_file)
]
# deserializer
@@ -45,7 +46,7 @@ def create_image_mb_source(map_file, mean_file, train, total_number_of_samples):
cntk.io.ImageDeserializer(map_file, cntk.io.StreamDefs(
features = cntk.io.StreamDef(field='image', transforms=transforms), # first column in map file is referred to as 'image'
labels = cntk.io.StreamDef(field='label', shape=num_classes))), # and second as 'label'
randomize=train,
randomize=train,
epoch_size=total_number_of_samples,
multithreaded_deserializer = True)
@@ -58,18 +59,18 @@ def create_conv_network():
# apply model to input
scaled_input = cntk.ops.element_times(cntk.ops.constant(0.00390625), feature_var)
with cntk.layers.default_options(activation=cntk.ops.relu, pad=True):
z = cntk.models.Sequential([
cntk.models.For(range(2), lambda : [
cntk.layers.Convolution2D((3,3), 64),
cntk.layers.Convolution2D((3,3), 64),
cntk.layers.MaxPooling((3,3), (2,2))
]),
]),
cntk.models.For(range(2), lambda i: [
cntk.layers.Dense([256,128][i]),
cntk.layers.Dense([256,128][i]),
cntk.layers.Dropout(0.5)
]),
]),
cntk.layers.Dense(num_classes, activation=None)
])(scaled_input)
@@ -96,13 +97,13 @@ def create_trainer(network, epoch_size, num_quantization_bits, block_size, warm_
mm_time_constant = [0]*20 + [600]*20 + [1200]
mm_schedule = cntk.learner.momentum_as_time_constant_schedule(mm_time_constant, epoch_size=epoch_size)
l2_reg_weight = 0.002
# Create learner
if block_size != None and num_quantization_bits != 32:
raise RuntimeError("Block momentum cannot be used with quantization, please remove quantized_bits option.")
local_learner = cntk.learner.momentum_sgd(network['output'].parameters,
lr_schedule, mm_schedule,
local_learner = cntk.learner.momentum_sgd(network['output'].parameters,
lr_schedule, mm_schedule,
l2_regularization_weight=l2_reg_weight)
if block_size != None:
@@ -125,12 +126,12 @@ def train_and_test(network, trainer, train_source, test_source, progress_printer
training_session = cntk.training_session(
training_minibatch_source = train_source,
trainer = trainer,
model_inputs_to_mb_source_mapping = input_map,
model_inputs_to_mb_source_mapping = input_map,
mb_size_schedule = cntk.minibatch_size_schedule(minibatch_size),
progress_printer = progress_printer,
checkpoint_frequency = epoch_size,
progress_printer = progress_printer,
checkpoint_frequency = epoch_size,
checkpoint_filename = os.path.join(model_path, "ConvNet_CIFAR10_DataAug"),
# save_all_checkpoints = False,
# save_all_checkpoints = False,
progress_frequency=epoch_size,
cv_source = test_source,
cv_mb_size_schedule=cntk.minibatch_size_schedule(minibatch_size),
@@ -147,8 +148,8 @@ def train_and_test(network, trainer, train_source, test_source, progress_printer
cntk.stop_profiler()
# Train and evaluate the network.
def convnet_cifar10_dataaug(train_data, test_data, mean_data, minibatch_size=64, epoch_size=50000, num_quantization_bits=32,
block_size=3200, warm_up=0, max_epochs=2, restore=False, log_to_file=None,
def convnet_cifar10_dataaug(train_data, test_data, mean_data, minibatch_size=64, epoch_size=50000, num_quantization_bits=32,
block_size=3200, warm_up=0, max_epochs=2, restore=False, log_to_file=None,
num_mbs_per_log=None, gen_heartbeat=False, profiling=False):
_cntk_py.set_computation_network_trace_level(0)
@ -165,10 +166,10 @@ def convnet_cifar10_dataaug(train_data, test_data, mean_data, minibatch_size=64,
train_source = create_image_mb_source(train_data, mean_data, train=True, total_number_of_samples=max_epochs * epoch_size)
test_source = create_image_mb_source(test_data, mean_data, train=False, total_number_of_samples=cntk.io.FULL_DATA_SWEEP)
train_and_test(network, trainer, train_source, test_source, progress_printer, minibatch_size, epoch_size, restore, profiling)
if __name__=='__main__':
parser = argparse.ArgumentParser()
data_path = os.path.join(abs_path, "..", "..", "..", "DataSets", "CIFAR-10")
@@ -201,8 +202,8 @@ if __name__=='__main__':
test_data=os.path.join(data_path, 'test_map.txt')
try:
convnet_cifar10_dataaug(train_data, test_data, mean_data,
minibatch_size=args['minibatch_size'],
convnet_cifar10_dataaug(train_data, test_data, mean_data,
minibatch_size=args['minibatch_size'],
epoch_size=args['epoch_size'],
num_quantization_bits=args['quantized_bits'],
block_size=args['block_samples'],

View file

@@ -74,10 +74,10 @@ def convnet_mnist(debug_output=False):
}
cntk.utils.log_number_of_parameters(z) ; print()
progress_printer = cntk.utils.ProgressPrinter(tag='Training')
max_epochs = 40
progress_printer = cntk.utils.ProgressPrinter(tag='Training', num_epochs=max_epochs)
# Get minibatches of images to train with and perform model training
max_epochs = 40
for epoch in range(max_epochs): # loop over epochs
sample_count = 0
while sample_count < epoch_size: # loop over minibatches in the epoch


@ -13,7 +13,6 @@ modelPath = "$outputDir$/Models/ResNet_101"
stderr = "$outputDir$/ResNet_101_BS_out"
parallelTrain = true
hyperCompressMemory = true
TrainNetwork = {
action = "train"


@ -13,7 +13,6 @@ modelPath = "$outputDir$/Models/ResNet_152"
stderr = "$outputDir$/ResNet_152_BS_out"
parallelTrain = true
hyperCompressMemory = true
TrainNetwork = {
action = "train"


@ -13,6 +13,7 @@ import numpy as np
from cntk.utils import *
from cntk.ops import input_variable, cross_entropy_with_softmax, classification_error
from cntk.io import MinibatchSource, ImageDeserializer, StreamDef, StreamDefs
import cntk.io.transforms as xforms
from cntk import Trainer, cntk_py
from cntk.learner import momentum_sgd, learning_rate_schedule, momentum_as_time_constant_schedule, UnitType
from _cntk_py import set_computation_network_trace_level
@ -40,11 +41,11 @@ def create_reader(map_file, mean_file, train):
transforms = []
if train:
transforms += [
ImageDeserializer.crop(crop_type='randomside', side_ratio=0.8, jitter_type='uniratio') # train uses jitter
xforms.crop(crop_type='randomside', side_ratio=0.8, jitter_type='uniratio') # train uses jitter
]
transforms += [
ImageDeserializer.scale(width=image_width, height=image_height, channels=num_channels, interpolations='linear'),
ImageDeserializer.mean(mean_file)
xforms.scale(width=image_width, height=image_height, channels=num_channels, interpolations='linear'),
xforms.mean(mean_file)
]
# deserializer
return MinibatchSource(ImageDeserializer(map_file, StreamDefs(
@ -61,21 +62,21 @@ def train_and_evaluate(reader_train, reader_test, network_name, epoch_size, max_
input_var = input_variable((num_channels, image_height, image_width))
label_var = input_variable((num_classes))
# create model, and configure learning parameters
if network_name == 'resnet20':
z = create_cifar10_model(input_var, 3, num_classes)
lr_per_mb = [1.0]*80+[0.1]*40+[0.01]
elif network_name == 'resnet110':
z = create_cifar10_model(input_var, 18, num_classes)
lr_per_mb = [0.1]*1+[1.0]*80+[0.1]*40+[0.01]
else:
raise RuntimeError("Unknown model name!")
# loss and metric
ce = cross_entropy_with_softmax(z, label_var)
pe = classification_error(z, label_var)
# shared training parameters
minibatch_size = 128
momentum_time_constant = -minibatch_size/np.log(0.9)
l2_reg_weight = 0.0001
@ -84,7 +85,7 @@ def train_and_evaluate(reader_train, reader_test, network_name, epoch_size, max_
lr_per_sample = [lr/minibatch_size for lr in lr_per_mb]
lr_schedule = learning_rate_schedule(lr_per_sample, epoch_size=epoch_size, unit=UnitType.sample)
mm_schedule = momentum_as_time_constant_schedule(momentum_time_constant)
# trainer object
learner = momentum_sgd(z.parameters, lr_schedule, mm_schedule,
l2_regularization_weight = l2_reg_weight)
@ -97,13 +98,13 @@ def train_and_evaluate(reader_train, reader_test, network_name, epoch_size, max_
}
log_number_of_parameters(z) ; print()
progress_printer = ProgressPrinter(tag='Training')
progress_printer = ProgressPrinter(tag='Training', num_epochs=max_epochs)
# perform model training
if profiler_dir:
start_profiler(profiler_dir, True)
for epoch in range(max_epochs): # loop over epochs
sample_count = 0
while sample_count < epoch_size: # loop over minibatches in the epoch
@ -114,10 +115,10 @@ def train_and_evaluate(reader_train, reader_test, network_name, epoch_size, max_
progress_printer.epoch_summary(with_metric=True)
z.save(os.path.join(model_path, network_name + "_{}.dnn".format(epoch)))
enable_profiler() # begin to collect profiler data after first epoch
if profiler_dir:
stop_profiler()
# Evaluation parameters
test_epoch_size = 10000
minibatch_size = 16
@ -154,7 +155,7 @@ if __name__=='__main__':
args = vars(parser.parse_args())
epochs = int(args['epochs'])
network_name = args['network']
reader_train = create_reader(os.path.join(data_path, 'train_map.txt'), os.path.join(data_path, 'CIFAR-10_mean.xml'), True)
reader_test = create_reader(os.path.join(data_path, 'test_map.txt'), os.path.join(data_path, 'CIFAR-10_mean.xml'), False)


@ -26,7 +26,6 @@ ImageC = 3
NumLabels = 1000
parallelTrain = true
hyperCompressMemory = true
################################
Train = {


@ -26,7 +26,6 @@ ImageC = 3
NumLabels = 1000
parallelTrain = true
hyperCompressMemory = true
################################
Train = {


@ -32,8 +32,6 @@ num_channels = 3 # RGB
num_classes = 1000
model_name = "VGG16.model"
cntk.cntk_py.enable_hyper_memory_compress()
# Create a minibatch source.
def create_image_mb_source(map_file, is_training, total_number_of_samples):
if not os.path.exists(map_file):


@ -32,8 +32,6 @@ num_channels = 3 # RGB
num_classes = 1000
model_name = "VGG19.model"
cntk.cntk_py.enable_hyper_memory_compress()
# Create a minibatch source.
def create_image_mb_source(map_file, is_training, total_number_of_samples):
if not os.path.exists(map_file):


@ -7,6 +7,9 @@
from __future__ import print_function
import zipfile
import os
from sys import platform
import shutil
try:
from urllib.request import urlretrieve
except ImportError:
@ -26,6 +29,15 @@ def download_grocery_data():
print('Extracting ' + filename + '...')
with zipfile.ZipFile(filename) as myzip:
myzip.extractall(dataset_folder)
if platform != "win32":
testfile = os.path.join(dataset_folder, "grocery", "test.txt")
unixfile = os.path.join(dataset_folder, "grocery", "test_unix.txt")
out = open(unixfile, 'w')
with open(testfile) as f:
for line in f:
out.write(line.replace('\\', '/'))
out.close()
shutil.move(unixfile, testfile)
finally:
os.remove(filename)
print('Done.')
@ -34,4 +46,4 @@ def download_grocery_data():
if __name__ == "__main__":
download_grocery_data()


@ -9,18 +9,15 @@ import os
import numpy as np
from cntk import load_model, graph
from cntk.ops import combine
from cntk.io import MinibatchSource, ImageDeserializer, StreamDef, StreamDefs
from cntk import graph
from cntk.graph import get_node_outputs
import cntk.io.transforms as xforms
def create_mb_source(image_height, image_width, num_channels, map_file):
transforms = [ImageDeserializer.scale(width=image_width, height=image_height, channels=num_channels, interpolations='linear')]
image_source = ImageDeserializer(map_file)
image_source.ignore_labels()
image_source.map_features('features', transforms)
return MinibatchSource(image_source, randomize=False)
transforms = [xforms.scale(width=image_width, height=image_height, channels=num_channels, interpolations='linear')]
return MinibatchSource(ImageDeserializer(map_file,
StreamDefs(features=StreamDef(field='image', transforms=transforms))), # first column in map file is referred to as 'image'
randomize=False) # second column is labels and is ignored
def eval_and_write(model_file, node_name, output_file, minibatch_source, num_objects):


@ -12,7 +12,8 @@ from cntk.device import set_default_device, gpu
from cntk import load_model, Trainer, UnitType
from cntk.layers import Placeholder, Constant
from cntk.graph import find_by_name, get_node_outputs
from cntk.io import MinibatchSource, ImageDeserializer
from cntk.io import MinibatchSource, ImageDeserializer, StreamDefs, StreamDef
import cntk.io.transforms as xforms
from cntk.layers import Dense
from cntk.learner import momentum_sgd, learning_rate_schedule, momentum_schedule
from cntk.ops import input_variable, cross_entropy_with_softmax, classification_error, combine, softmax
@ -58,11 +59,11 @@ _num_classes = 102
# Creates a minibatch source for training or testing
def create_mb_source(map_file, image_width, image_height, num_channels, num_classes, randomize=True):
transforms = [ImageDeserializer.scale(width=image_width, height=image_height, channels=num_channels, interpolations='linear')]
image_source = ImageDeserializer(map_file)
image_source.map_features(features_stream_name, transforms)
image_source.map_labels(label_stream_name, num_classes)
return MinibatchSource(image_source, randomize=randomize)
transforms = [xforms.scale(width=image_width, height=image_height, channels=num_channels, interpolations='linear')]
return MinibatchSource(ImageDeserializer(map_file, StreamDefs(
features =StreamDef(field='image', transforms=transforms),
labels =StreamDef(field='label', shape=num_classes))),
randomize=randomize)
# Creates the network model for transfer learning


@ -18,6 +18,10 @@ sys.path.append(os.path.join(base_folder, "..", "DataSets", "Animals"))
from install_animals import download_animals_data
download_animals_data()
sys.path.append(os.path.join(base_folder, "..", "DataSets", "Grocery"))
from install_grocery import download_grocery_data
download_grocery_data()
sys.path.append(os.path.join(base_folder, "..", "PretrainedModels"))
from models_util import download_model_by_name
download_model_by_name("ResNet_18")


@ -19,8 +19,7 @@ from cntk.ops import cross_entropy_with_softmax, classification_error, splice, r
# variables and stuff #
########################
cntk_dir = os.path.dirname(os.path.abspath(__file__)) + "/../../../.." # data resides in the CNTK folder
data_dir = cntk_dir + "/Examples/LanguageUnderstanding/ATIS/Data" # under Examples/LanguageUnderstanding/ATIS
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "Data")
vocab_size = 943 ; num_labels = 129 ; num_intents = 26 # number of words in vocab, slot labels, and intent labels
model_dir = "./Models"


@ -139,7 +139,7 @@ def create_inputs(vocab_dim):
return input_sequence, label_sequence
# Creates and trains a character-level language model
def train_lm(training_file, max_num_minibatches):
def train_lm(training_file, epochs, max_num_minibatches):
# load the data and vocab
data, char_to_ix, ix_to_char, data_size, vocab_dim = load_data_and_vocab(training_file)
@ -168,46 +168,34 @@ def train_lm(training_file, max_num_minibatches):
trainer = Trainer(z, (ce, errs), learner)
sample_freq = 1000
epochs = 50
minibatches_per_epoch = int((data_size / minibatch_size))
minibatches = min(epochs * minibatches_per_epoch, max_num_minibatches)
minibatches_per_epoch = min(data_size // minibatch_size, max_num_minibatches // epochs)
# print out some useful training information
log_number_of_parameters(z) ; print()
progress_printer = ProgressPrinter(freq=100, tag='Training')
log_number_of_parameters(z)
print ("Running %d epochs with %d minibatches per epoch" % (epochs, minibatches_per_epoch))
print()
e = 0
p = 0
for i in range(0, minibatches):
if p + minibatch_size+1 >= data_size:
p = 0
e += 1
model_filename = "models/shakespeare_epoch%d.dnn" % e
z.save(model_filename)
print("Saved model to '%s'" % model_filename)
# get the data
features, labels = get_data(p, minibatch_size, data, char_to_ix, vocab_dim)
progress_printer = ProgressPrinter(freq=100, tag='Training')
for e in range(0, epochs):
# Specify the mapping of input variables in the model to actual minibatch data to be trained with
# If it's the start of the data, we specify that we are looking at a new sequence (True)
mask = [False]
if p == 0:
mask = [True]
arguments = ({input_sequence : features, label_sequence : labels}, mask)
trainer.train_minibatch(arguments)
mask = [True]
for b in range(0, minibatches_per_epoch):
# get the data
features, labels = get_data(b, minibatch_size, data, char_to_ix, vocab_dim)
arguments = ({input_sequence : features, label_sequence : labels}, mask)
mask = [False]
trainer.train_minibatch(arguments)
progress_printer.update_with_trainer(trainer, with_metric=True) # log progress
if i % sample_freq == 0:
print(sample(z, ix_to_char, vocab_dim, char_to_ix))
progress_printer.update_with_trainer(trainer, with_metric=True) # log progress
global_minibatch = e*minibatches_per_epoch + b
if global_minibatch % sample_freq == 0:
print(sample(z, ix_to_char, vocab_dim, char_to_ix))
p += minibatch_size
# Do a final save of the model
model_filename = "models/shakespeare_epoch%d.dnn" % e
z.save(model_filename)
model_filename = "models/shakespeare_epoch%d.dnn" % (e+1)
z.save_model(model_filename)
print("Saved model to '%s'" % model_filename)
def load_and_sample(model_filename, vocab_filename, prime_text='', use_hardmax=False, length=1000, temperature=1.0):
@ -223,13 +211,13 @@ def load_and_sample(model_filename, vocab_filename, prime_text='', use_hardmax=F
return sample(model, ix_to_char, len(chars), char_to_ix, prime_text=prime_text, use_hardmax=use_hardmax, length=length, temperature=temperature)
def train_and_eval_char_rnn(max_num_minibatches=sys.maxsize):
# train the LM
train_lm("data/tinyshakespeare.txt", max_num_minibatches)
def train_and_eval_char_rnn(epochs=50, max_num_minibatches=sys.maxsize):
# train the LM
train_lm("data/tinyshakespeare.txt", epochs, max_num_minibatches)
# load and sample
text = "T"
return load_and_sample("models/shakespeare_epoch0.dnn", "data/tinyshakespeare.txt.vocab", prime_text=text, use_hardmax=False, length=100, temperature=0.95)
return load_and_sample("models/shakespeare_epoch%d.dnn" % (epochs), "data/tinyshakespeare.txt.vocab", prime_text=text, use_hardmax=False, length=100, temperature=0.95)
if __name__=='__main__':
# Specify the target device to be used for computing, if you do not want to


@ -23,7 +23,7 @@ from _cntk_py import set_computation_network_trace_level
# Paths relative to current python file.
abs_path = os.path.dirname(os.path.abspath(__file__))
data_path = os.path.join(abs_path, "..", "..", "Datasets", "UCF11")
data_path = os.path.join(abs_path, "..", "..", "DataSets", "UCF11")
model_path = os.path.join(abs_path, "Models")
# Define the reader for both training and evaluation action.
@ -194,14 +194,14 @@ def conv3d_ucf11(train_reader, test_reader, max_epochs=30):
lr_per_sample = [0.01]*10+[0.001]*10+[0.0001]
lr_schedule = learning_rate_schedule(lr_per_sample, epoch_size=epoch_size, unit=UnitType.sample)
momentum_time_constant = 4096
mm_schedule = momentum_as_time_constant_schedule(momentum_time_constant, epoch_size=epoch_size)
mm_schedule = momentum_as_time_constant_schedule([momentum_time_constant], epoch_size=epoch_size)
# Instantiate the trainer object to drive the model training
learner = momentum_sgd(z.parameters, lr_schedule, mm_schedule, True)
trainer = Trainer(z, (ce, pe), learner)
log_number_of_parameters(z) ; print()
progress_printer = ProgressPrinter(tag='Training')
progress_printer = ProgressPrinter(tag='Training', num_epochs=max_epochs)
# Get minibatches of images to train with and perform model training
for epoch in range(max_epochs): # loop over epochs


@ -77,7 +77,10 @@ endif
# The mpic++ wrapper only adds MPI specific flags to the g++ command line.
# The actual compiler/linker flags added can be viewed by running 'mpic++ --showme:compile' and 'mpic++ --showme:link'
ifneq ($(HAS_MPI),0)
CXX = $(MPI_PATH)/bin/mpic++
endif
SSE_FLAGS = -msse4.1 -mssse3
PROTOC = $(PROTOBUF_PATH)/bin/protoc
@ -90,8 +93,8 @@ SOURCEDIR:= Source
INCLUDEPATH:= $(addprefix $(SOURCEDIR)/, Common/Include CNTKv2LibraryDll CNTKv2LibraryDll/API CNTKv2LibraryDll/proto Math CNTK ActionsLib ComputationNetworkLib SGDLib SequenceTrainingLib CNTK/BrainScript Readers/ReaderLib PerformanceProfilerDll)
INCLUDEPATH+=$(PROTOBUF_PATH)/include
# COMMON_FLAGS include settings that are passed both to NVCC and C++ compilers.
COMMON_FLAGS:= -D_POSIX_SOURCE -D_XOPEN_SOURCE=600 -D__USE_XOPEN2K -std=c++11
CPPFLAGS:=
COMMON_FLAGS:= -DHAS_MPI=$(HAS_MPI) -D_POSIX_SOURCE -D_XOPEN_SOURCE=600 -D__USE_XOPEN2K -std=c++11
CPPFLAGS:=
CXXFLAGS:= $(SSE_FLAGS) -std=c++0x -fopenmp -fpermissive -fPIC -Werror -fcheck-new
LIBPATH:=
LIBS_LIST:=
@ -270,7 +273,7 @@ RPATH=-Wl,-rpath,
# Build info
########################################
BUILDINFO:= $(SOURCEDIR)/CNTK/buildinfo.h
BUILDINFO:= $(SOURCEDIR)/CNTKv2LibraryDll/buildinfo.h
GENBUILD:=Tools/generate_build_info
BUILDINFO_OUTPUT := $(shell $(GENBUILD) $(BUILD_TOP)/Config.make && echo Success)
@ -579,9 +582,16 @@ $(EVAL_EXTENDED_CLIENT): $(EVAL_EXTENDED_CLIENT_OBJ) | $(EVAL_LIB) $(READER_LIBS
########################################
CNTKLIBRARY_CPP_EVAL_EXAMPLES:=$(BINDIR)/CNTKLibraryCPPEvalExamples
#ifdef CUDA_PATH
CNTKLIBRARY_CPP_EVAL_EXAMPLES_SRC=\
$(SOURCEDIR)/../Examples/Evaluation/CNTKLibraryCPPEvalExamples/CNTKLibraryCPPEvalExamples.cpp \
$(SOURCEDIR)/../Examples/Evaluation/CNTKLibraryCPPEvalExamples/EvalMultithreads.cpp
$(SOURCEDIR)/../Examples/Evaluation/CNTKLibraryCPPEvalGPUExamples/CNTKLibraryCPPEvalGPUExamples.cpp\
$(SOURCEDIR)/../Examples/Evaluation/CNTKLibraryCPPEvalCPUOnlyExamples/EvalMultithreads.cpp
#else
CNTKLIBRARY_CPP_EVAL_EXAMPLES_SRC=\
$(SOURCEDIR)/../Examples/Evaluation/CNTKLibraryCPPEvalCPUOnlyExamples/CNTKLibraryCPPEvalCPUOnlyExamples.cpp\
$(SOURCEDIR)/../Examples/Evaluation/CNTKLibraryCPPEvalCPUOnlyExamples/EvalMultithreads.cpp
#endif
CNTKLIBRARY_CPP_EVAL_EXAMPLES_OBJ:=$(patsubst %.cpp, $(OBJDIR)/%.o, $(CNTKLIBRARY_CPP_EVAL_EXAMPLES_SRC))
@ -594,6 +604,26 @@ $(CNTKLIBRARY_CPP_EVAL_EXAMPLES): $(CNTKLIBRARY_CPP_EVAL_EXAMPLES_OBJ) | $(CNTKL
@echo building $(CNTKLIBRARY_CPP_EVAL_EXAMPLES) for $(ARCH) with build type $(BUILDTYPE)
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKLIBRARY) $(L_READER_LIBS)
########################################
# Eval V2 Sample test
########################################
CNTKLIBRARY_CPP_EVAL_TEST:=$(BINDIR)/CNTKLibraryCPPEvalExamplesTest
CNTKLIBRARY_CPP_EVAL_TEST_SRC=\
$(SOURCEDIR)/../Tests/EndToEndTests/EvalClientTests/CNTKLibraryCPPEvalExamplesTest/CNTKLibraryCPPEvalExamplesTest.cpp\
$(SOURCEDIR)/../Examples/Evaluation/CNTKLibraryCPPEvalCPUOnlyExamples/EvalMultithreads.cpp\
$(SOURCEDIR)/../Tests/EndToEndTests/CNTKv2Library/Common/Common.cpp
CNTKLIBRARY_CPP_EVAL_TEST_OBJ:=$(patsubst %.cpp, $(OBJDIR)/%.o, $(CNTKLIBRARY_CPP_EVAL_TEST_SRC))
ALL+=$(CNTKLIBRARY_CPP_EVAL_TEST)
SRC+=$(CNTKLIBRARY_CPP_EVAL_TEST_SRC)
$(CNTKLIBRARY_CPP_EVAL_TEST): $(CNTKLIBRARY_CPP_EVAL_TEST_OBJ) | $(CNTKLIBRARY_LIB) $(READER_LIBS)
@mkdir -p $(dir $@)
@echo building $(CNTKLIBRARY_CPP_EVAL_TEST) for $(ARCH) with build type $(BUILDTYPE)
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKLIBRARY) $(L_READER_LIBS)
########################################
# HTKMLFReader plugin
########################################


@ -256,9 +256,10 @@ void DoWriteOutput(const ConfigParameters& config)
else if (config.Exists("outputPath"))
{
wstring outputPath = config(L"outputPath");
bool writeSequenceKey = config(L"writeSequenceKey", false);
WriteFormattingOptions formattingOptions(config);
bool nodeUnitTest = config(L"nodeUnitTest", "false");
writer.WriteOutput(testDataReader, mbSize[0], outputPath, outputNodeNamesVector, formattingOptions, epochSize, nodeUnitTest);
writer.WriteOutput(testDataReader, mbSize[0], outputPath, outputNodeNamesVector, formattingOptions, epochSize, nodeUnitTest, writeSequenceKey);
}
else
InvalidArgument("write command: You must specify either 'writer' or 'outputPath'");


@ -164,12 +164,15 @@ bool CheckFunction(std::string& p_nodeType, bool* allowUndeterminedVariable)
else if (EqualInsensitive(nodeType, OperationNameOf(EqualNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(GreaterEqualNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(GreaterNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(ForwardBackwardNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(LabelsToGraphNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(LessEqualNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(LessNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(NotEqualNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(ClipNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(ConvolutionNode), L"Convolve")) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(CropNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(PassNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(PoolingNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(CosDistanceNode), L"CosDist")) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(CosDistanceWithNegativeSamplesNode), L"CosWithNegSamples")) ret = true;


@ -59,12 +59,19 @@ shared_ptr<C> CreateObject(const ScriptableObjects::IConfigRecord& config, const
template <class C>
shared_ptr<C> CreateObject(const ConfigParameters& config, const wchar_t* id)
{
ConfigParameters readerConfig(config(id));
if (!readerConfig.ExistsCurrent("traceLevel")) // do not overwrite "traceLevel" if it's already present
ConfigParameters objConfig(config(id));
const auto& readerType = string(objConfig("readerType", ""));
if (objConfig.ExistsCurrent("traceLevel") || // do not overwrite "traceLevel" if it's already present
AreEqualIgnoreCase(readerType, "CNTKTextFormatReader") || // do not overwrite "traceLevel" when creating a CTF reader
AreEqualIgnoreCase(readerType, "CNTKBinaryReader")) // do not overwrite "traceLevel" when creating a binary reader
{
readerConfig.Insert("traceLevel", config(L"traceLevel", "0")); // TODO: fix this by adding it to all config blocks. Easy to fix in BS as 'config with [ traceLevel = 0 ]'.
return make_shared<C>(objConfig);
}
return make_shared<C>(readerConfig); // old CNTK config specifies a dictionary which then must be explicitly instantiated
// If the config does not specify a 'traceLevel', the following line
// will insert it with the value of 0.
objConfig.Insert("traceLevel", config(L"traceLevel", "0")); // TODO: fix this by adding it to all config blocks. Easy to fix in BS as 'config with [ traceLevel = 0 ]'.
return make_shared<C>(objConfig); // old CNTK config specifies a dictionary which then must be explicitly instantiated
}
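
The defaulting logic above can be read as "insert 'traceLevel' only if the reader will not handle it itself". A generic sketch of the insert-if-absent part over a plain map (std::map stands in for the CNTK-internal ConfigParameters, which is not shown here):

    #include <map>
    #include <string>

    // Mirrors objConfig.Insert(...) above: add a default only when the key is absent.
    void DefaultTraceLevel(std::map<std::wstring, std::wstring>& config)
    {
        if (config.find(L"traceLevel") == config.end())
            config[L"traceLevel"] = L"0";
    }
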
template <class ConfigRecordType, typename ElemType>


@ -577,6 +577,9 @@ Shift(input, fromOffset, boundaryValue, boundaryMode=-1/*context*/, dim=-1, tag=
RowSlice(beginIndex, numRows, input, tag='') = Slice(beginIndex, beginIndex + numRows, input, axis = 1)
RowRepeat(input, numRepeats, tag='') = new ComputationNode [ operation = 'RowRepeat' ; inputs = _AsNodes (input) /*plus the function args*/ ]
RowStack(inputs, axis=1, tag='') = new ComputationNode [ operation = 'RowStack' /*plus the function args*/ ]
EditDistanceError(leftInput, rightInput, subPen=0.0, delPen=0.0, insPen=0.0, squashInputs=false, tokensToIgnore=[||], tag='') = new ComputationNode [ operation = 'EditDistanceError' ; inputs = _AsNodes (leftInput : rightInput) /*plus the function args*/ ]
ForwardBackward(graph, features, blankTokenId, delayConstraint=-1, tag='') = new ComputationNode [ operation = 'ForwardBackward' ; inputs = _AsNodes (graph : features) /*plus the function args*/ ]
LabelsToGraph(labels, tag='') = new ComputationNode [ operation = 'LabelsToGraph' ; inputs = _AsNodes (labels) /*plus the function args*/ ]
Slice(beginIndex, endIndex, input, axis=1, tag='') =
if axis < 0 then [ # time axis: specify -1
beginFlags = if beginIndex > 0 then BS.Boolean.Not (BS.Loop.IsFirstN (beginIndex, input)) else BS.Loop.IsLastN (-beginIndex, input)


@ -36,6 +36,7 @@
#include "BrainScriptEvaluator.h"
#include "BrainScriptParser.h"
#include "PerformanceProfiler.h"
#include "CNTKLibrary.h"
#include <string>
#include <chrono>
@ -252,9 +253,6 @@ void DoCommands(const ConfigParameters& config, const shared_ptr<MPIWrapper>& mp
ProgressTracing::SetStepOffset(fullEpochsOffset); // this is the epoch number that SGD will log relative to
}
if (Globals::ShouldEnableHyperCompressMemory())
Matrix<ElemType>::UseCachedResizeOrNot(true);
// determine the action to perform, and do it
for (int j = 0; j < action.size(); j++)
{
@ -372,55 +370,6 @@ std::string TimeDateStamp()
return buf;
}
void PrintBuiltInfo()
{
LOGPRINTF(stderr, "-------------------------------------------------------------------\n");
LOGPRINTF(stderr, "Build info: \n\n");
LOGPRINTF(stderr, "\t\tBuilt time: %s %s\n", __DATE__, __TIME__);
LOGPRINTF(stderr, "\t\tLast modified date: %s\n", __TIMESTAMP__);
#ifdef _BUILDTYPE_
LOGPRINTF(stderr, "\t\tBuild type: %s\n", _BUILDTYPE_);
#endif
#ifdef _BUILDTARGET_
LOGPRINTF(stderr, "\t\tBuild target: %s\n", _BUILDTARGET_);
#endif
#ifdef _WITH_1BITSGD_
LOGPRINTF(stderr, "\t\tWith 1bit-SGD: %s\n", _WITH_1BITSGD_);
#endif
#ifdef _WITH_ASGD_
LOGPRINTF(stderr, "\t\tWith ASGD: %s\n", _WITH_ASGD_);
#endif
#ifdef _MATHLIB_
LOGPRINTF(stderr, "\t\tMath lib: %s\n", _MATHLIB_);
#endif
#ifdef _CUDA_PATH_
LOGPRINTF(stderr, "\t\tCUDA_PATH: %s\n", _CUDA_PATH_);
#endif
#ifdef _CUB_PATH_
LOGPRINTF(stderr, "\t\tCUB_PATH: %s\n", _CUB_PATH_);
#endif
#ifdef _CUDNN_PATH_
LOGPRINTF(stderr, "\t\tCUDNN_PATH: %s\n", _CUDNN_PATH_);
#endif
#ifdef _GIT_EXIST
LOGPRINTF(stderr, "\t\tBuild Branch: %s\n", _BUILDBRANCH_);
LOGPRINTF(stderr, "\t\tBuild SHA1: %s\n", _BUILDSHA1_);
#endif
#ifdef _BUILDER_
LOGPRINTF(stderr, "\t\tBuilt by %s on %s\n", _BUILDER_, _BUILDMACHINE_);
#endif
#ifdef _BUILDPATH_
LOGPRINTF(stderr, "\t\tBuild Path: %s\n", _BUILDPATH_);
#endif
#ifdef _MPI_NAME_
LOGPRINTF(stderr, "\t\tMPI distribution: %s\n", _MPI_NAME_);
#endif
#ifdef _MPI_VERSION_
LOGPRINTF(stderr, "\t\tMPI version: %s\n", _MPI_VERSION_);
#endif
LOGPRINTF(stderr, "-------------------------------------------------------------------\n");
}
void PrintUsageInfo()
{
LOGPRINTF(stderr, "-------------------------------------------------------------------\n");
@ -585,7 +534,6 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
Globals::SetShareNodeValueMatrices(config(L"shareNodeValueMatrices", true));
Globals::SetGradientAccumulationOptimization(config(L"optimizeGradientAccumulation", true));
Globals::SetHyperCompressMemory(config(L"hyperCompressMemory", false));
TracingGPUMemoryAllocator::SetTraceLevel(config(L"traceGPUMemoryAllocations", 0));
@ -598,7 +546,7 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
RedirectStdErr(logpath);
LOGPRINTF(stderr, "%ls\n", startupMessage.c_str());
PrintBuiltInfo();
::CNTK::PrintBuiltInfo();
}
// echo gpu info to log
@ -666,7 +614,7 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
static void PrintBanner(int argc, wchar_t* argv[], const string& timestamp)
{
fprintf(stderr, "CNTK 2.0.beta11.0+ (");
fprintf(stderr, "CNTK 2.0.beta11.0 (");
#ifdef _GIT_EXIST
fprintf(stderr, "%s %.6s, ", _BUILDBRANCH_, _BUILDSHA1_);
#endif
@ -729,7 +677,6 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[])
Globals::SetShareNodeValueMatrices(config(L"shareNodeValueMatrices", true));
Globals::SetGradientAccumulationOptimization(config(L"optimizeGradientAccumulation", true));
Globals::SetHyperCompressMemory(config(L"hyperCompressMemory", false));
TracingGPUMemoryAllocator::SetTraceLevel(config(L"traceGPUMemoryAllocations", 0));
@ -764,7 +711,7 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[])
}
// full config info
PrintBuiltInfo();
::CNTK::PrintBuiltInfo();
PrintGpuInfo();
#ifdef _DEBUG
@ -857,7 +804,7 @@ int wmain1(int argc, wchar_t* argv[]) // called from wmain which is a wrapper th
{
if (argc <= 1)
{
PrintBuiltInfo(); // print build info directly in case that user provides zero argument (convenient for checking build type)
::CNTK::PrintBuiltInfo(); // print build info directly in case that user provides zero argument (convenient for checking build type)
LOGPRINTF(stderr, "No command-line argument given.\n");
PrintUsageInfo();
fflush(stderr);


@ -85,7 +85,8 @@
<StackReserveSize>100000000</StackReserveSize>
</Link>
<PreBuildEvent>
<Command>prebuild.bat "$(Configuration)" "$(CNTK_MKL_SEQUENTIAL)" "$(CNTK_ENABLE_1BitSGD)" "$(CudaPath)" "$(CUDNN_PATH)" "$(CUB_PATH)" "$(CNTK_ENABLE_ASGD)"</Command>
<Command>
</Command>
</PreBuildEvent>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(ReleaseBuild)">
@ -113,7 +114,8 @@
<StackReserveSize>100000000</StackReserveSize>
</Link>
<PreBuildEvent>
<Command>prebuild.bat "$(Configuration)" "$(CNTK_MKL_SEQUENTIAL)" "$(CNTK_ENABLE_1BitSGD)" "$(CudaPath)" "$(CUDNN_PATH)" "$(CUB_PATH)" "$(CNTK_ENABLE_ASGD)"</Command>
<Command>
</Command>
</PreBuildEvent>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(CpuOnlyBuild)">


@ -396,8 +396,16 @@ void MELScript<ElemType>::CallFunction(const std::string& p_name, const ConfigPa
MELProperty prop = melPropNull;
#if 1 // legacy
// legacy names for some properties
if (EqualInsensitive(propName, "finalCriterion", "Criteria")) propName = "criterion";
else if (EqualInsensitive(propName, "eval")) propName = "evaluation";
if (EqualInsensitive(propName, "finalCriterion", "Criteria"))
{
propName = "criterion";
prop = melPropFinalCriterion;
}
else if (EqualInsensitive(propName, "eval"))
{
propName = "evaluation";
prop = melPropEvaluation;
}
// legacy property that now works differently
else if (EqualInsensitive(propName, "needGradient", "needsGradient") || EqualInsensitive(propName, "computeGradient"))
prop = melPropParameterUpdateRequired; // for backward compatibility


@ -1395,6 +1395,18 @@ namespace CNTK
CNTK_API void Add(const Dictionary& other);
void Add(const std::wstring& key, const DictionaryValue& value)
{
operator[](key.c_str()) = value;
}
template<typename... Args>
void Add(const std::wstring& key, const DictionaryValue& value, Args... args)
{
Add(key, value); //insert one
Add(args...); //recurse
}
CNTK_API bool operator==(const Dictionary& other) const;
CNTK_API bool operator!=(const Dictionary& other) const;
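
The variadic Add above peels off one key/value pair per call and recurses on the remainder, so it accepts any alternating key/value argument list of even length. A minimal usage sketch (the keys shown are illustrative, not part of the API):

    CNTK::Dictionary config;
    // Equivalent to three separate Add(key, value) calls; the recursion
    // stops once a single key/value pair is left.
    config.Add(L"type", L"Crop",
               L"cropType", L"center",
               L"cropSize", 224);
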
@ -1634,6 +1646,8 @@ private:
Variable CompositePreservingCopy(const std::shared_ptr<const Function>& composite) const;
Variable NonCompositePreservingCopy() const;
private:
#ifdef SWIGCSHARP
public:
@ -2735,7 +2749,7 @@ namespace CNTK
///
/// Returns the root of the Function graph underlying this block Function.
/// Throws an exception ff this is not a block Function
/// Throws an exception if this is not a block Function
///
CNTK_API FunctionPtr BlockRoot() const;
@ -4430,6 +4444,20 @@ namespace CNTK
std::wstring m_streamAlias;
};
struct HTKFeatureConfiguration
{
HTKFeatureConfiguration(const std::wstring& streamName, const std::wstring& scp, size_t dim, size_t left, size_t right, bool broadcast)
: m_streamName(streamName), m_dim(dim), m_scp(scp), m_left(left), m_right(right), m_broadcast(broadcast)
{}
std::wstring m_streamName;
std::wstring m_scp;
size_t m_dim;
size_t m_left;
size_t m_right;
bool m_broadcast;
};
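
Each field of this struct maps onto one entry of the per-stream dictionary that the HTK feature deserializer consumes (see the HTKFeatureDeserializer implementation further down, where m_left/m_right become the context window and m_broadcast becomes 'expandToUtterance'). A construction sketch with placeholder file names:

    // 39-dimensional features read from an SCP list, +/-5 frames of context.
    CNTK::HTKFeatureConfiguration features(L"features", L"glob_0000.scp",
                                           /*dim*/ 39, /*left*/ 5, /*right*/ 5,
                                           /*broadcast*/ false);
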
///
/// Instantiate the CNTK built-in text format minibatch source
///
@ -4475,6 +4503,56 @@ namespace CNTK
return CreateCompositeMinibatchSource(minibatchSourceConfiguration);
}
typedef Dictionary ImageTransform;
///
/// Create a crop transform with the specified options to be used with a reader
///
CNTK_API ImageTransform ReaderCrop(const wchar_t* cropType = L"center",
int cropSize = 0, float sideRatio = 0.0f, float areaRatio = 0.0f,
float aspectRatio = 1.0f, const wchar_t* jitterType = L"none");
///
/// Create a scale transform with the specified options to be used with a reader
///
CNTK_API ImageTransform ReaderScale(int width,
int height, int channels, const wchar_t* interpolations = L"linear",
const wchar_t* scaleMode = L"fill", int padValue = -1);
///
/// Create a mean subtraction transform with the specified options to be used with a reader
///
CNTK_API ImageTransform ReaderMean(const wchar_t* meanFile);
///
/// Create a color transform with the specified options to be used with a reader
///
CNTK_API ImageTransform ReaderColor(float brightnessRadius = 0.0f,
float contrastRadius = 0.0f, float saturationRadius = 0.0f);
typedef Dictionary Deserializer;
///
/// Create an ImageDeserializer with the specified options
///
CNTK_API Deserializer ImageDeserializer(const std::wstring& fileName, const std::wstring& labelStreamName, size_t numLabels, const std::wstring& imageStreamName, const std::vector<ImageTransform>& transforms = {});
///
/// Create an CTFDeserializer with the specified options
///
CNTK_API Deserializer CTFDeserializer(const std::wstring& fileName, const std::vector<StreamConfiguration>& streams);
///
/// Create an HTKFeatureDeserializer with the specified options
///
CNTK_API Deserializer HTKFeatureDeserializer(const std::vector<HTKFeatureConfiguration>& streams);
///
/// Create an HTKMLFDeserializer with the specified options
///
CNTK_API Deserializer HTKMLFDeserializer(const std::wstring& streamName, const std::wstring& labelMappingFile, size_t dimension, const std::vector<std::wstring>& mlfFiles);
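
Together these factories reproduce, on the C++ side, the reader configurations the Python examples above build with cntk.io.transforms. A sketch that mirrors the CIFAR-10 setup (file names and stream names are placeholders; wiring the deserializer into a minibatch source configuration dictionary is assumed to follow the composite-source convention shown earlier):

    std::vector<CNTK::ImageTransform> transforms{
        CNTK::ReaderCrop(L"randomside", 0, 0.8f, 0.0f, 1.0f, L"uniratio"), // random-side crop with jitter
        CNTK::ReaderScale(32, 32, 3),                                      // 32x32 RGB, linear interpolation
        CNTK::ReaderMean(L"CIFAR-10_mean.xml")                             // per-pixel mean subtraction
    };
    CNTK::Deserializer imageSource =
        CNTK::ImageDeserializer(L"train_map.txt", L"labels", 10, L"features", transforms);
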
///
/// Compute the per dimension means and variances for each of the specified streams using data from the specified minibatchSource.
///
@ -4769,6 +4847,9 @@ namespace CNTK
bool keepExistingCheckpoints = false,
size_t maxNumberOfTrainingSamples = std::numeric_limits<size_t>::max(),
size_t progressFrequency = std::numeric_limits<size_t>::max());
CNTK_API void PrintBuiltInfo();
}


@ -250,9 +250,6 @@ namespace CNTK
CNTK_API void EnableForwardValuesSharing();
CNTK_API void DisableForwardValuesSharing();
CNTK_API void EnableHyperMemoryCompress();
CNTK_API void DisableHyperMemoryCompress();
CNTK_API void EnableGradientAccumulationOptimization();
CNTK_API void DisableGradientAccumulationOptimization();


@ -144,6 +144,8 @@ namespace CNTK
opType = PrimitiveOpType::Sin;
else if (node->OperationName() == OperationNameOf(PassNode))
opType = PrimitiveOpType::Pass;
else if (node->OperationName() == OperationNameOf(LabelsToGraphNode))
opType = PrimitiveOpType::LabelsToGraph;
else if (node->OperationName() == OperationNameOf(RectifiedLinearNode))
opType = PrimitiveOpType::ReLU;
else if (node->OperationName() == OperationNameOf(ExpNode))
@ -450,7 +452,7 @@ namespace CNTK
primitiveFunctionConfigParameters[PrimitiveFunction::AttributeNameDeletionPenalty] = edNode->DeletionPenalty();
primitiveFunctionConfigParameters[PrimitiveFunction::AttributeNameSubstitutionPenalty] = edNode->SubstitutionPenalty();
primitiveFunctionConfigParameters[PrimitiveFunction::AttributeNameSquashInputs] = edNode->SquashInputs();
primitiveFunctionConfigParameters[PrimitiveFunction::AttributeNameSamplesToIgnore] = AsDictionaryValueVector(edNode->SamplesToIgnore());
primitiveFunctionConfigParameters[PrimitiveFunction::AttributeNameTokensToIgnore] = AsDictionaryValueVector(edNode->TokensToIgnore());
opType = PrimitiveOpType::EditDistanceError;
}


@ -106,6 +106,12 @@
<DelayLoadDLLs>Math.dll; msmpi.dll; PerformanceProfilerDll.dll </DelayLoadDLLs>
<OptimizeReferences Condition="'$(Configuration)|$(Platform)'=='Debug_CpuOnly|x64'">false</OptimizeReferences>
</Link>
<PreBuildEvent>
<Command Condition="'$(Configuration)|$(Platform)'=='Release_CpuOnly|x64'">prebuild.bat "$(Configuration)" "$(CNTK_MKL_SEQUENTIAL)" "$(CNTK_ENABLE_1BitSGD)" "$(CudaPath)" "$(CUDNN_PATH)" "$(CUB_PATH)" "$(CNTK_ENABLE_ASGD)"</Command>
</PreBuildEvent>
<PreBuildEvent>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug_CpuOnly|x64'">prebuild.bat "$(Configuration)" "$(CNTK_MKL_SEQUENTIAL)" "$(CNTK_ENABLE_1BitSGD)" "$(CudaPath)" "$(CUDNN_PATH)" "$(CUB_PATH)" "$(CNTK_ENABLE_ASGD)"</Command>
</PreBuildEvent>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(GpuBuild)">
<ClCompile>
@ -118,6 +124,15 @@
<Command>if exist "%ProgramW6432%\NVIDIA Corporation\NVSMI" xcopy /I /D /Y "%ProgramW6432%\NVIDIA Corporation\NVSMI\nvml*.dll" "$(TargetDir)"</Command>
<Message>Copying NVidia GDK extension DLL to target folder</Message>
</PostBuildEvent>
<PreBuildEvent>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">prebuild.bat "$(Configuration)" "$(CNTK_MKL_SEQUENTIAL)" "$(CNTK_ENABLE_1BitSGD)" "$(CudaPath)" "$(CUDNN_PATH)" "$(CUB_PATH)" "$(CNTK_ENABLE_ASGD)"</Command>
</PreBuildEvent>
<PreBuildEvent>
<Command Condition="'$(Configuration)|$(Platform)'=='Release_NoOpt|x64'">prebuild.bat "$(Configuration)" "$(CNTK_MKL_SEQUENTIAL)" "$(CNTK_ENABLE_1BitSGD)" "$(CudaPath)" "$(CUDNN_PATH)" "$(CUB_PATH)" "$(CNTK_ENABLE_ASGD)"</Command>
</PreBuildEvent>
<PreBuildEvent>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">prebuild.bat "$(Configuration)" "$(CNTK_MKL_SEQUENTIAL)" "$(CNTK_ENABLE_1BitSGD)" "$(CudaPath)" "$(CUDNN_PATH)" "$(CUB_PATH)" "$(CNTK_ENABLE_ASGD)"</Command>
</PreBuildEvent>
</ItemDefinitionGroup>
<ItemGroup>
<ClInclude Include="API\CNTKLibrary.h" />


@ -17,8 +17,11 @@
#include "PerformanceProfiler.h"
#include "MPIWrapper.h"
#include "Basics.h"
#include "ProgressTracing.h"
#include "buildinfo.h"
extern bool g_shareNodeValueMatrices;
using namespace Microsoft::MSR::CNTK;
namespace CNTK
{
@ -84,16 +87,6 @@ namespace CNTK
Microsoft::MSR::CNTK::Globals::SetShareNodeValueMatrices(/* enable = */ false);
}
void EnableHyperMemoryCompress()
{
Microsoft::MSR::CNTK::Globals::SetHyperCompressMemory(/* enable = */ true);
}
void DisableHyperMemoryCompress()
{
Microsoft::MSR::CNTK::Globals::SetHyperCompressMemory(/* enable = */ false);
}
void EnableGradientAccumulationOptimization()
{
Microsoft::MSR::CNTK::Globals::SetGradientAccumulationOptimization(/* enable = */ true);
@ -617,6 +610,56 @@ namespace CNTK
va_end(args);
}
void PrintBuiltInfo()
{
LOGPRINTF(stderr, "-------------------------------------------------------------------\n");
LOGPRINTF(stderr, "Build info: \n\n");
LOGPRINTF(stderr, "\t\tBuilt time: %s %s\n", __DATE__, __TIME__);
LOGPRINTF(stderr, "\t\tLast modified date: %s\n", __TIMESTAMP__);
#ifdef _BUILDTYPE_
LOGPRINTF(stderr, "\t\tBuild type: %s\n", _BUILDTYPE_);
#endif
#ifdef _BUILDTARGET_
LOGPRINTF(stderr, "\t\tBuild target: %s\n", _BUILDTARGET_);
#endif
#ifdef _WITH_1BITSGD_
LOGPRINTF(stderr, "\t\tWith 1bit-SGD: %s\n", _WITH_1BITSGD_);
#endif
#ifdef _WITH_ASGD_
LOGPRINTF(stderr, "\t\tWith ASGD: %s\n", _WITH_ASGD_);
#endif
#ifdef _MATHLIB_
LOGPRINTF(stderr, "\t\tMath lib: %s\n", _MATHLIB_);
#endif
#ifdef _CUDA_PATH_
LOGPRINTF(stderr, "\t\tCUDA_PATH: %s\n", _CUDA_PATH_);
#endif
#ifdef _CUB_PATH_
LOGPRINTF(stderr, "\t\tCUB_PATH: %s\n", _CUB_PATH_);
#endif
#ifdef _CUDNN_PATH_
LOGPRINTF(stderr, "\t\tCUDNN_PATH: %s\n", _CUDNN_PATH_);
#endif
#ifdef _GIT_EXIST
LOGPRINTF(stderr, "\t\tBuild Branch: %s\n", _BUILDBRANCH_);
LOGPRINTF(stderr, "\t\tBuild SHA1: %s\n", _BUILDSHA1_);
#endif
#ifdef _BUILDER_
LOGPRINTF(stderr, "\t\tBuilt by %s on %s\n", _BUILDER_, _BUILDMACHINE_);
#endif
#ifdef _BUILDPATH_
LOGPRINTF(stderr, "\t\tBuild Path: %s\n", _BUILDPATH_);
#endif
#ifdef _MPI_NAME_
LOGPRINTF(stderr, "\t\tMPI distribution: %s\n", _MPI_NAME_);
#endif
#ifdef _MPI_VERSION_
LOGPRINTF(stderr, "\t\tMPI version: %s\n", _MPI_VERSION_);
#endif
LOGPRINTF(stderr, "-------------------------------------------------------------------\n");
}
template CNTK_API __declspec_noreturn void ThrowFormatted<std::runtime_error>(const char* format, ...);
template CNTK_API __declspec_noreturn void ThrowFormatted<std::logic_error>(const char* format, ...);
template CNTK_API __declspec_noreturn void ThrowFormatted<std::invalid_argument>(const char* format, ...);


@ -721,8 +721,8 @@ namespace CNTK
auto delPen = functionConfig[PrimitiveFunction::AttributeNameDeletionPenalty].Value<float>();
auto insPen = functionConfig[PrimitiveFunction::AttributeNameInsertionPenalty].Value<float>();
auto squashInputs = functionConfig[PrimitiveFunction::AttributeNameSquashInputs].Value<bool>();
auto samplesToIgnore = AsVector<size_t>(functionConfig[PrimitiveFunction::AttributeNameSamplesToIgnore].Value<std::vector<DictionaryValue>>());
computationNodePtr = New<EditDistanceErrorNode<ElementType>>(network->GetDeviceId(), subPen, delPen, insPen, squashInputs, samplesToIgnore, internalNodeName);
auto tokensToIgnore = AsVector<size_t>(functionConfig[PrimitiveFunction::AttributeNameTokensToIgnore].Value<std::vector<DictionaryValue>>());
computationNodePtr = New<EditDistanceErrorNode<ElementType>>(network->GetDeviceId(), internalNodeName, subPen, delPen, insPen, squashInputs, tokensToIgnore);
break;
}
case PrimitiveOpType::LambdaRank:
@ -813,6 +813,9 @@ namespace CNTK
case PrimitiveOpType::Pass:
computationNodePtr = New<PassNode<ElementType>>(network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::LabelsToGraph:
computationNodePtr = New<LabelsToGraphNode<ElementType>>(network->GetDeviceId(), internalNodeName);
break;
default:
CNTK::LogicError("Specified op %S not yet supported", PrimitiveOpTypeName(op).c_str());
break;
@ -932,6 +935,18 @@ namespace CNTK
return computationNodePtr;
}
std::unordered_set<Variable> CompositeFunction::NonOwnerPreservingCopy(const std::unordered_set<Variable>& outputs)
{
std::unordered_set<Variable> result;
for (auto& o : outputs)
{
Variable sanitized = o.NonCompositePreservingCopy();
result.insert(sanitized);
}
return result;
}
template <typename ElementType>
ComputationNetworkPtr CompositeFunction::GetComputationNetwork(const DeviceDescriptor& device,
const std::unordered_set<Variable>& backpropRoots,
@ -941,7 +956,7 @@ namespace CNTK
{
if (m_computationNetwork != nullptr)
{
// TODO: We should either invalidate and readapt the network if he backpropRoots change compared to what was specified when the network
// TODO: We should either invalidate and readapt the network if the backpropRoots change compared to what was specified when the network
// was last constructed, or just recreate a new network.
// For now just disallow changing the backpropRoots after the network is created
if (!backpropRoots.empty() && (m_currentBackpropRoots != backpropRoots))
@ -966,7 +981,7 @@ namespace CNTK
InvalidArgument("Function::Forward: Only inputs of a Function can be excluded from gradient computation");
}
m_inputsExcludedFromGradientComputation = inputsToExcludeGradientsFor;
m_inputsExcludedFromGradientComputation = NonOwnerPreservingCopy(inputsToExcludeGradientsFor);
ComputationNetworkBuilder<ElementType> builder(*m_computationNetwork);
@ -1023,7 +1038,7 @@ namespace CNTK
}
}
m_currentBackpropRoots = backpropRoots;
m_currentBackpropRoots = NonOwnerPreservingCopy(backpropRoots);
// In case of recurrence, the inputs of some of the ComputationNodes are not attached due to cycles.
// Now attach those after we have created all ComputationNodes in the network
@ -1317,10 +1332,12 @@ namespace CNTK
{
if (m_perOutputVarArgumentDependencies.find(output) == m_perOutputVarArgumentDependencies.end())
{
if (output.IsOutput())
m_perOutputVarArgumentDependencies[output] = AsComposite(output.Owner())->Arguments();
auto sanitizedOutput = output.NonCompositePreservingCopy();
if (sanitizedOutput.IsOutput())
m_perOutputVarArgumentDependencies[sanitizedOutput] = AsComposite(sanitizedOutput.Owner())->Arguments();
else
m_perOutputVarArgumentDependencies[output] = { output };
m_perOutputVarArgumentDependencies[sanitizedOutput] = { sanitizedOutput };
}
return m_perOutputVarArgumentDependencies[output];
@ -1381,12 +1398,13 @@ namespace CNTK
std::unordered_set<Variable> functionOutputs(m_outputs.begin(), m_outputs.end());
std::vector<ComputationNodeBasePtr> outputsToEvaluate;
std::unordered_set<Variable> requiredArguments;
for (auto outputVarValuePair : outputs)
for (auto outputVariable : requestedOutputVariables)
{
auto& requiredArgumentsForCurrentOutput = GetArgumentDependencies(outputVarValuePair.first);
auto& requiredArgumentsForCurrentOutput = GetArgumentDependencies(outputVariable);
requiredArguments.insert(requiredArgumentsForCurrentOutput.begin(), requiredArgumentsForCurrentOutput.end());
auto outputComputationNode = m_variableToNodeMap.at(outputVarValuePair.first);
auto outputComputationNode = m_variableToNodeMap.at(outputVariable);
outputsToEvaluate.push_back(outputComputationNode);
}


@ -33,6 +33,13 @@ namespace CNTK
class CompositeFunction;
typedef std::shared_ptr<CompositeFunction> CompositeFunctionPtr;
///
/// Represents a symbolic computation with zero or more input arguments and one or more outputs.
/// As opposed to primitive functions, a composite function is composed of other Function instances whose inputs and outputs are wired together.
/// CompositeFunction is also responsible for breaking cycles in cyclic graphs: it stores the pointers to the child primitive
/// functions and controls their lifetime.
/// The CompositeFunction class thus inherits from Function.
///
class CompositeFunction final : public Function
{
friend class Function;
@ -258,6 +265,9 @@ namespace CNTK
void GetNetworkOutputs(std::unordered_map<Variable, ValuePtr>& outputs);
void GetNetworkGradients(std::unordered_map<Variable, ValuePtr>& gradients);
// Remove cyclic references for composite nodes
static std::unordered_set<Variable> NonOwnerPreservingCopy(const std::unordered_set<Variable>& outputs);
const std::vector<Variable>& GetArgumentDependencies(const Variable& output);
std::unordered_map<Variable, uint64_t> GetCurrentBackpropRootsTimeStamps() const;


@ -340,16 +340,16 @@ namespace CNTK
if (dataType == DataType::Float)
{
if (inputData == outputData)
m_mpi->AllReduceAsync<float>(static_cast<float*>(outputData), numElements, &allReduceRequests[i]);
m_mpi->AllReduceAsync(static_cast<float*>(outputData), numElements, &allReduceRequests[i]);
else
m_mpi->AllReduceAsync<float>(static_cast<float*>(inputData), static_cast<float*>(outputData), numElements, &allReduceRequests[i]);
m_mpi->AllReduceAsync(static_cast<float*>(inputData), static_cast<float*>(outputData), numElements, &allReduceRequests[i]);
}
else if (dataType == DataType::Double)
{
if (inputData == outputData)
m_mpi->AllReduceAsync<double>(static_cast<double*>(outputData), numElements, &allReduceRequests[i]);
m_mpi->AllReduceAsync(static_cast<double*>(outputData), numElements, &allReduceRequests[i]);
else
m_mpi->AllReduceAsync<double>(static_cast<double*>(inputData), static_cast<double*>(outputData), numElements, &allReduceRequests[i]);
m_mpi->AllReduceAsync(static_cast<double*>(inputData), static_cast<double*>(outputData), numElements, &allReduceRequests[i]);
}
else
LogicError("Unknown DataType");
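
The explicit <float>/<double> template arguments removed here were redundant: the compiler deduces the element type from the pointer arguments. A self-contained sketch of the same deduction (the wrapper below is illustrative, not the MPIWrapper API):

    #include <cstddef>

    template <typename T>
    void AllReduceAsyncSketch(T* buffer, std::size_t count) { /* ... */ }

    void Example(float* f, double* d, std::size_t n)
    {
        AllReduceAsyncSketch(f, n); // T deduced as float
        AllReduceAsyncSketch(d, n); // T deduced as double
    }
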


@ -1078,14 +1078,14 @@ namespace CNTK
}
}
FunctionPtr EditDistanceError(const Variable& prediction, const Variable& labels, float subPen, float delPen, float insPen, bool squashInputs, const vector<size_t>& samplesToIgnore, const std::wstring& name)
FunctionPtr EditDistanceError(const Variable& prediction, const Variable& labels, float subPen, float delPen, float insPen, bool squashInputs, const vector<size_t>& tokensToIgnore, const std::wstring& name)
{
auto additionalProperties = Dictionary();
additionalProperties[PrimitiveFunction::AttributeNameSubstitutionPenalty] = subPen;
additionalProperties[PrimitiveFunction::AttributeNameDeletionPenalty] = delPen;
additionalProperties[PrimitiveFunction::AttributeNameInsertionPenalty] = insPen;
additionalProperties[PrimitiveFunction::AttributeNameSquashInputs] = squashInputs;
additionalProperties[PrimitiveFunction::AttributeNameSamplesToIgnore] = AsDictionaryValueVector(samplesToIgnore);
additionalProperties[PrimitiveFunction::AttributeNameTokensToIgnore] = AsDictionaryValueVector(tokensToIgnore);
return BinaryOp(PrimitiveOpType::EditDistanceError, prediction, labels, std::move(additionalProperties), name);
}
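
For reference, a call through the renamed parameter might look like the following sketch ('prediction' and 'labels' stand for existing Variables; ignoring token id 0 is an arbitrary example):

    std::vector<size_t> tokensToIgnore{ 0 }; // e.g., a blank/padding token
    CNTK::FunctionPtr ed = CNTK::EditDistanceError(
        prediction, labels,
        /*subPen*/ 1.0f, /*delPen*/ 1.0f, /*insPen*/ 1.0f,
        /*squashInputs*/ false, tokensToIgnore, L"editDistance");
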


@ -349,4 +349,117 @@ namespace CNTK
m_epochEndReached = false;
m_prevMinibatchSize = 0;
}
/* static */ ImageTransform ReaderCrop(const wchar_t* cropType,
int cropSize, float sideRatio, float areaRatio,
float aspectRatio, const wchar_t* jitterType)
{
ImageTransform crop;
crop.Add(L"type", L"Crop",
L"cropType", cropType,
L"cropSize", cropSize,
L"sideRatio", sideRatio,
L"areaRatio", areaRatio,
L"aspectRatio", aspectRatio,
L"jitterType", jitterType);
return crop;
}
/* static */ ImageTransform ReaderScale(int width,
int height, int channels, const wchar_t* interpolations,
const wchar_t* scaleMode, int padValue)
{
ImageTransform scale;
scale.Add(L"type", L"Scale",
L"width", width,
L"height", height,
L"channels", channels,
L"interpolations", interpolations,
L"scaleMode", scaleMode,
L"padValue", padValue);
return scale;
}
/* static */ ImageTransform ReaderMean(const wchar_t* meanFile)
{
ImageTransform mean;
mean.Add(L"type", L"Mean", L"meanFile", meanFile);
return mean;
}
/* static */ ImageTransform ReaderColor(float brightnessRadius,
float contrastRadius, float saturationRadius)
{
ImageTransform color;
color.Add(L"type", L"Color",
L"brightnessRadius", brightnessRadius,
L"contrastRadius", contrastRadius,
L"saturationRadius", saturationRadius);
return color;
}
Deserializer ImageDeserializer(const std::wstring& fileName, const std::wstring& labelStreamName, size_t numLabels, const std::wstring& imageStreamName, const std::vector<ImageTransform>& transforms)
{
Deserializer img;
std::vector<DictionaryValue> actualTransforms;
std::transform(transforms.begin(), transforms.end(), std::back_inserter(actualTransforms), [](ImageTransform t) { return static_cast<DictionaryValue>(t); });
Dictionary labeldim;
labeldim[L"labelDim"] = numLabels;
Dictionary xforms;
xforms[L"transforms"] = actualTransforms;
Dictionary input;
input.Add(imageStreamName.c_str(), xforms, labelStreamName.c_str(), labeldim);
img.Add(L"type", L"ImageDeserializer", L"file", fileName, L"input", input);
return img;
}
Deserializer CTFDeserializer(const std::wstring& fileName, const std::vector<StreamConfiguration>& streams)
{
Deserializer ctf;
Dictionary input;
for (const auto& s : streams)
{
const auto& key = s.m_streamName;
Dictionary stream;
stream.Add(L"alias", s.m_streamAlias, L"dim", s.m_dim, L"format", s.m_isSparse ? L"sparse" : L"dense");
input[key] = stream;
}
ctf.Add(L"type", L"CNTKTextFormatDeserializer", L"file", fileName, L"input", input);
return ctf;
}
Deserializer HTKFeatureDeserializer(const std::vector<HTKFeatureConfiguration>& streams)
{
Deserializer htk;
Dictionary input;
for (const auto& s : streams)
{
const auto& key = s.m_streamName;
Dictionary stream;
std::vector<DictionaryValue> ctxWindow = { DictionaryValue(s.m_left), DictionaryValue(s.m_right) };
stream.Add(L"scpFile", s.m_scp, L"dim", s.m_dim, L"contextWindow", ctxWindow, L"expandToUtterance", s.m_broadcast);
input[key] = stream;
}
htk.Add(L"type", L"HTKFeatureDeserializer", L"input", input);
return htk;
}
Deserializer HTKMLFDeserializer(const std::wstring& streamName, const std::wstring& labelMappingFile, size_t dimension, const std::vector<std::wstring>& mlfFiles)
{
Deserializer htk;
Dictionary stream;
Dictionary labels;
labels.Add(L"labelMappingFile", labelMappingFile, L"dim", dimension);
std::vector<DictionaryValue> actualFiles;
std::transform(mlfFiles.begin(), mlfFiles.end(), std::back_inserter(actualFiles), [](const std::wstring& s) {return static_cast<DictionaryValue>(s); });
if (actualFiles.size() > 1)
labels[L"mlfFileList"] = actualFiles;
else if (actualFiles.size() == 1)
labels[L"mlfFile"] = actualFiles[0];
else
LogicError("HTKMLFDeserializer: No mlf files were specified");
stream[streamName] = labels;
htk.Add(L"type", L"HTKMLFDeserializer", L"input", stream);
return htk;
}
}
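
A sketch combining the two HTK factories for an acoustic-model reader (all file names and dimensions are placeholders): note how a single MLF file lands under 'mlfFile' while several would land under 'mlfFileList', matching the branching above.

    std::vector<CNTK::HTKFeatureConfiguration> featureStreams{
        CNTK::HTKFeatureConfiguration(L"features", L"glob_0000.scp", 39, 5, 5, false)
    };
    CNTK::Deserializer features = CNTK::HTKFeatureDeserializer(featureStreams);
    CNTK::Deserializer labels   = CNTK::HTKMLFDeserializer(
        L"labels", L"state.list", /*dimension*/ 132, { L"glob_0000.mlf" });
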


@ -79,7 +79,7 @@ namespace CNTK
/*static*/ const std::wstring PrimitiveFunction::AttributeNameDeletionPenalty = L"DeletionPenalty";
/*static*/ const std::wstring PrimitiveFunction::AttributeNameInsertionPenalty = L"InsertionPenalty";
/*static*/ const std::wstring PrimitiveFunction::AttributeNameSquashInputs = L"SquashInputs";
/*static*/ const std::wstring PrimitiveFunction::AttributeNameSamplesToIgnore = L"SamplesToIgnore";
/*static*/ const std::wstring PrimitiveFunction::AttributeNameTokensToIgnore = L"TokensToIgnore";
/*static*/ DataType PrimitiveFunction::GetOutputDataType(PrimitiveOpType op, std::vector<Variable>& inputs, bool inferDimensions)
{


@ -235,7 +235,7 @@ namespace CNTK
static const std::wstring AttributeNameDeletionPenalty;
static const std::wstring AttributeNameInsertionPenalty;
static const std::wstring AttributeNameSquashInputs;
static const std::wstring AttributeNameSamplesToIgnore;
static const std::wstring AttributeNameTokensToIgnore;
protected:
PrimitiveFunction(PrimitiveOpType op, const std::vector<Variable>& inputs, Dictionary&& functionConfig, const std::wstring& functionName, const std::wstring& uid)


@ -72,6 +72,7 @@ namespace CNTK
NDCG = 60,
EditDistanceError = 61,
NoOp = 62,
LabelsToGraph = 63
// New op types should only be appended to the end of this list.
// If you append here, also add checks in SerializationTests (CheckEnumValuesNotModified)
// and bump up PrimitiveFunction::s_serializationVersion and update PrimitiveFunction::Deserialize


@ -87,6 +87,13 @@ namespace CNTK
return result;
}
Variable Variable::NonCompositePreservingCopy() const
{
Variable copy = *this;
copy.m_outputComposite = nullptr;
return copy;
}
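
The copy here is shallow except for one member: clearing m_outputComposite drops the back-reference that would otherwise keep the owning composite alive and create a shared_ptr cycle. A stripped-down sketch of the pattern (names are illustrative):

    #include <memory>

    struct CompositeRef; // stands in for the owning composite Function

    struct VariableSketch
    {
        std::shared_ptr<CompositeRef> m_outputComposite; // owning back-reference

        VariableSketch NonCompositePreservingCopySketch() const
        {
            VariableSketch copy = *this;      // shallow copy of all members
            copy.m_outputComposite = nullptr; // sever the cycle-inducing reference
            return copy;
        }
    };
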
void Variable::SetOwner(Function* ownerFunction)
{
if (Kind() != VariableKind::Output)


@ -14,7 +14,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
std::atomic<bool> Globals::m_forceConstantRandomSeed(false);
std::atomic<bool> Globals::m_enableShareNodeValueMatrices(true);
std::atomic<bool> Globals::m_enableHyperCompressMemory(false);
std::atomic<bool> Globals::m_optimizeGradientAccumulation(true);
}}}


@ -151,6 +151,8 @@ public:
}
};
std::function<std::string(size_t)> m_getKeyById;
private:
typedef map<std::wstring, Input> MapType;
MapType inputs;


@ -28,15 +28,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
static void SetShareNodeValueMatrices(bool enable) { m_enableShareNodeValueMatrices = enable; }
static bool ShouldEnableShareNodeValueMatrices() { return m_enableShareNodeValueMatrices; }
static void SetHyperCompressMemory(bool enable) { m_enableHyperCompressMemory = enable; }
static bool ShouldEnableHyperCompressMemory() { return m_enableHyperCompressMemory; }
private:
static std::atomic<bool> m_forceDeterministicAlgorithms;
// The global flag to enable matrices values in forward and backward prop
static std::atomic<bool> m_enableShareNodeValueMatrices;
// The global flag to enable hyper memory compression
static std::atomic<bool> m_enableHyperCompressMemory;
static std::atomic<bool> m_forceConstantRandomSeed;
static std::atomic<bool> m_optimizeGradientAccumulation;
};


@ -1,14 +1,14 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#if HAS_MPI
// Please see https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Windows#ms-mpi or
// https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Linux#open-mpi for setup instructions
// of an MPI implementation on your platform.
#ifdef _MSC_VER
// Suppress warning for non-ASCII characters in MS-MPI headers
#pragma warning(push)
@ -18,7 +18,25 @@
#else
#include "mpi.h"
#endif
#pragma comment(lib, "msmpi.lib")
#else
// Note: the following macros/typedefs define some of the MPI-related functions and constants such that code
// using this functionality compiles cleanly, but does not actually perform any MPI operation.
// The cleaner way would be to move all MPI-related code into the MPIWrapper class implementation and decide
// there whether to use mpi.h or not.
typedef void *MPI_Comm;
typedef enum _MPI_Datatype { MPI_CHAR, MPI_INT, MPI_FLOAT, MPI_DOUBLE, MPI_UNSIGNED, MPI_LONG_LONG_INT } MPI_Datatype;
#define MPI_IN_PLACE ((void*)(int)-1)
#define MPI_SUM ((MPI_Op)0x58000003)
#define MPI_STATUSES_IGNORE (MPI_Status*)1
#define MPI_STATUS_IGNORE (MPI_Status*)1
#define MPI_UNDEFINED (-32766)
typedef int MPI_Op;
typedef int MPI_Request;
typedef void *MPI_Status;
#endif
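
With these stubs, code compiled with HAS_MPI=0 still type-checks, and only the wrapper implementation needs to branch on the flag. A minimal sketch of the pattern (the function is illustrative, not part of the wrapper):

    // Reports whether this binary was built against a real MPI implementation.
    inline bool HasMpiSupport()
    {
    #if HAS_MPI
        return true;   // mpi.h / MS-MPI headers were available at build time
    #else
        return false;  // the stub typedefs above are in effect
    #endif
    }
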
#include <errno.h>
#include <string>
@ -28,8 +46,6 @@
#include "CommonMatrix.h"
#define FFLUSH_SUCCESS 0
namespace Microsoft { namespace MSR { namespace CNTK {
struct MpiFail : public std::string
@ -40,481 +56,128 @@ struct MpiFail : public std::string
}
};
static int operator||(int rc, const MpiFail &what)
{
if (rc == MPI_SUCCESS)
{
return rc;
}
fprintf(stderr, "%s, MPI error %d\n", what.c_str(), rc);
fflush(stderr);
// (special case: we use that code to indicate a missing msmpi.dll...)
if (rc != MPI_ERR_INTERN)
{
char errbuf[MPI_MAX_ERROR_STRING + 1] = {0};
int len;
MPI_Error_string(rc, &errbuf[0], &len);
fprintf(stderr, "%s, MPI error %d: %s\n", what.c_str(), rc, errbuf);
fflush(stderr);
// we abort through this, so that the MPI system gets the memo
MPI_Abort(MPI_COMM_WORLD, rc);
// TODO: or does that only signal an issue, and we should still terminate ourselves?
// BUGBUG: We'd also need to Abort through the other sub-set communicator
}
RuntimeError("%s", what.c_str());
}
extern int operator||(int rc, const MpiFail &what);
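// A hedged usage sketch of the operator|| idiom declared above: an MPI return
// code is "piped" into MpiFail, which prints and aborts unless the call
// returned MPI_SUCCESS. For example:
//
//   int rank;
//   MPI_Comm_rank(MPI_COMM_WORLD, &rank) || MpiFail("example: MPI_Comm_rank");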
class MPIWrapper;
typedef std::shared_ptr<MPIWrapper> MPIWrapperPtr;
extern "C" void GetMpiWrapper(MPIWrapper **mpi);
// Note: This is now a pure interface, so please don't add
// any functionality to this class.
// Instead, make your own implementation class, add/change
// functions there as needed and use a private interface to
// these functions.
// In case you need to add functions that affect all
// implementations, add a pure virtual function here and
// update any affected implementation.
class MPIWrapper : public std::enable_shared_from_this<MPIWrapper>
{
int m_myRank;
std::wstring m_myName;
int m_numMPINodes;
size_t m_numNodesInUse;
bool m_multiHost;
// MPI communicator that reflects the current subset selection
MPI_Comm m_currentComm;
static MPIWrapperPtr s_mpi;
// MPI_Init() with delay-loading the msmpi.dll (possibly causing a failure if missing; we want to catch that)
int MPI_Init_DL()
{
#ifdef WIN32
__try
#endif
{
// don't initialize if that has been done already
int flag = 0;
MPI_Initialized(&flag);
if (flag)
return MPI_SUCCESS;
int argc = 0;
char **argv = NULL;
// TODO(qiwye): Multiverso (parameter server) will benefit from MPI_THREAD_MULTIPLE.
int requiredThreadLevelSupport = MPI_THREAD_SERIALIZED;
int provided;
int ret = MPI_Init_thread(&argc, &argv, requiredThreadLevelSupport, &provided);
if (provided != requiredThreadLevelSupport)
LogicError("Failed to initialize MPI with the desired level of thread support");
return ret;
}
#ifdef WIN32
__except (EXCEPTION_EXECUTE_HANDLER)
{
fprintf(stderr, "mpihelper: msmpi.dll missing\n");
return MPI_ERR_INTERN;
}
#endif
}
// Workaround for the issue of MPI hanging when we have non-0 exit codes from CNTK processes.
// OpenMPI has a confirmed race condition between killing child processes and handling their non-zero exit statuses, resulting
// in a deadlock where all processes are killed but MPI is still waiting.
// This happens when several perfectly synchronized processes (for example, on an MPI barrier)
// simultaneously exit with a non-0 exit code.
// As a workaround, we simply sleep 50*rank milliseconds, effectively "de-synchronizing" the processes at exit,
// allowing MPI to handle the terminations sequentially.
static int s_myRank;
static void MPIWorkaroundAtExit()
{
Sleep(s_myRank * 50);
}
public:
MPIWrapper()
: m_currentComm(MPI_COMM_WORLD)
{
static bool initialized = false;
if (initialized)
{
LogicError("MPIWrapper: this is a singleton class that can only be instantiated once per process");
}
MPIWrapper() {}
virtual ~MPIWrapper() {}
initialized = true;
if (GetMathLibTraceLevel() > 0)
{
fprintf(stderr, "MPIWrapper: initializing MPI\n");
fflush(stderr);
}
MPI_Init_DL() || MpiFail("mpiaggregator: MPI_Init");
MPI_Comm_rank(MPI_COMM_WORLD, &m_myRank);
MPI_Comm_size(MPI_COMM_WORLD, &m_numMPINodes);
m_numNodesInUse = m_numMPINodes;
m_multiHost = true;
// Verify that the environment variable used by GetTotalNumberOfMPINodes()
// matches what the MPI API says. There are actually two possible cases:
// 1) when we're running with mpiexec, both values have to match;
// 2) when we're running without mpiexec, the former will return 0, and
// the latter will be set to 1.
assert((GetTotalNumberOfMPINodes() == 0 && m_numNodesInUse == 1) ||
(GetTotalNumberOfMPINodes() == m_numNodesInUse));
char name[BUFSIZ];
int length;
MPI_Get_processor_name(name, &length);
m_myName = std::wstring(name, name+length);
// Applying MPI workaround
s_myRank = m_myRank;
atexit(&MPIWrapper::MPIWorkaroundAtExit);
// by default we use all of them
RequestNodes("MPIWrapper");
if (GetMathLibTraceLevel() > 0)
{
if (m_numMPINodes > 1)
fprintf(stderr, "mpihelper: we are cog %d in a gearbox of %d\n", (int) m_myRank, (int) m_numMPINodes);
else
fprintf(stderr, "mpihelper: only one MPI process: MPI operation will be boring\n");
fflush(stderr);
}
// do an initial handshake
Ping("mpihelper");
// stagger the jobs just a little to get a sort-of deterministic order, e.g. in GPU allocation when running on one machine;
// continue 0.5 seconds apart
::Sleep((DWORD)(500 * CurrentNodeRank()));
}
static MPIWrapperPtr GetInstance(bool create = false);
static void DeleteInstance();
static MPIWrapperPtr s_mpi;
// Note that, specifically, this function does not require
// MPI initialization. Moreover, it can be used without actually loading any
// MPI libs.
// TODO: Once we move to dynamic loading for MPI libs on Linux, move it to utilities.
static int GetTotalNumberOfMPINodes()
{
#ifdef WIN32
const char* p = std::getenv("PMI_SIZE");
#else
const char* p = std::getenv("OMPI_COMM_WORLD_SIZE");
#endif
if (!p)
{
return 0;
}
else
{
return std::stoi(string(p));
}
}
static int GetTotalNumberOfMPINodes();
// Note: we don't free the sub-communicator here although we should, because in case of a crash this prevents the EXE from terminating.
// It's OK since this class is a singleton anyway that gets instantiated exactly once at program startup.
~MPIWrapper()
{
if (GetMathLibTraceLevel() > 0)
{
fprintf(stderr, "~MPIWrapper\n");
}
// Do not finalize in event of an exception since calling MPI_Finalize without
// all pending communications being finished results in a hang
int rc = fflush(stderr);
if (!std::uncaught_exception())
{
if (rc != FFLUSH_SUCCESS)
{
#ifdef _WIN32
RuntimeError("MPIWrapper: Failed to flush stderr, %d", ::GetLastError());
#else
RuntimeError("MPIWrapper: Failed to flush stderr, %d", errno);
#endif
}
MPI_Finalize();
}
}
private:
void Ping(const char *msg) const
{
#undef USE2NDCOMM
#ifndef USE2NDCOMM
if (NumNodesInUse() != m_numMPINodes)
{
fprintf(stderr, "ping [%s]: cannot be applied to subset (%d) of nodes, skipping\n", msg, (int) NumNodesInUse());
fflush(stderr);
return;
}
#endif
std::array<int, 1> handshake;
handshake[0] = 1;
if (GetMathLibTraceLevel() > 0)
{
fprintf(stderr, "ping [%s]: %d nodes pinging each other\n", msg, (int) NumNodesInUse());
fflush(stderr);
}
AllReduce(handshake);
if (GetMathLibTraceLevel() > 0)
{
fprintf(stderr, "ping [%s]: all %d nodes responded\n", msg, handshake[0]);
fflush(stderr);
}
}
void RequestNodes(const char *msg, size_t requestednodes = SIZE_MAX /*default: all*/)
{
Ping("requestnodes (before change)");
// undo current split
#ifdef USE2NDCOMM
if (m_currentComm != MPI_COMM_WORLD /*no subset*/ && m_currentComm != MPI_COMM_NULL /*idle nodes*/)
{
fprintf(stderr, "requestnodes: MPI_Comm_free %x\n", (int) m_currentComm);
fflush(stderr);
MPI_Comm_free(&m_currentComm) || MpiFail("requestnodes: MPI_Comm_free"); // will leave MPI_COMM_NULL here
}
#endif
// reset to MPI_COMM_WORLD
m_currentComm = MPI_COMM_WORLD;
// create a new split (unless all nodes were requested)
if (requestednodes < (size_t) m_numMPINodes)
{
#ifdef USE2NDCOMM
fprintf(stderr, "requestnodes: MPI_Comm_split %d\n", (node() < requestednodes) ? 1 : MPI_UNDEFINED);
fflush(stderr);
MPI_Comm_split(communicator(), (node() < requestednodes) ? 1 : MPI_UNDEFINED, 0, &m_currentComm) || MpiFail("requestnodes: MPI_Comm_split");
fprintf(stderr, "requestnodes: MPI_Comm_split -> %x\n", (int) m_currentComm);
fflush(stderr);
#endif
}
else
{
// leave m_currentComm as MPI_COMM_WORLD
// and clip to #nodes
requestednodes = m_numMPINodes;
}
m_numNodesInUse = requestednodes;
if (GetMathLibTraceLevel() > 0)
{
fprintf(stderr, "requestnodes [%s]: using %d out of %d MPI nodes (%d requested); we (%d) are %s\n",
msg, (int) m_numNodesInUse, (int) m_numMPINodes, (int) requestednodes,
(int) CurrentNodeRank(), IsIdle() ? "out (idle)" : "in (participating)");
fflush(stderr);
}
Ping("requestnodes (after change)");
// If all ranks run on a single host, we can enable optimized communication
// paths (e.g. NCCL). To determine if a single machine is being used, we
// check that MPI_Get_processor_name matches for all ranks.
const int nameMax = MPI_MAX_PROCESSOR_NAME + 1;
char myName[nameMax] = {0};
int myNameLen = 0;
MPI_Get_processor_name(myName, &myNameLen) || MpiFail("requestnodes: MPI_Get_processor_name");
myName[myNameLen] = '\0';
std::vector<char> nameBuffer(m_numNodesInUse * nameMax);
char* allNames = nameBuffer.data();
MPI_Allgather(myName, nameMax, MPI_CHAR, allNames, nameMax, MPI_CHAR, m_currentComm)
|| MpiFail("requestnodes: MPI_Allgather");
m_multiHost = false;
for(size_t i=1; i<m_numNodesInUse; i++)
{
if (strcmp(allNames, allNames+i*nameMax) != 0)
{
m_multiHost = true;
break;
}
}
fprintf(stderr, "requestnodes [%s]: using %d out of %d MPI nodes on %s (%d requested); we (%d) are %s\n",
msg, (int) m_numNodesInUse, (int) m_numMPINodes, m_multiHost ? "multiple hosts" : "a single host",
(int) requestednodes, (int) CurrentNodeRank(), IsIdle() ? "out (idle)" : "in (participating)");
fflush(stderr);
}
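// Worked example of the host check above (illustrative): with three ranks
// whose MPI_Get_processor_name results are {"hostA", "hostA", "hostB"}, the
// strcmp against allNames + 2*nameMax differs, so m_multiHost becomes true
// and single-host-only paths such as NCCL stay disabled.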
public:
static MPIWrapperPtr GetInstance(bool create = false)
{
if (create)
{
if (s_mpi != nullptr)
LogicError("Creating MPIWrapper instance after a GetInstance call has been already made!");
else
s_mpi = std::make_shared<MPIWrapper>();
}
return s_mpi;
}
static void DeleteInstance()
{
s_mpi = nullptr;
}
MPI_Comm Communicator() const
{
return m_currentComm;
}
size_t NumNodesInUse() const
{
return m_numNodesInUse;
}
size_t CurrentNodeRank() const
{
return m_myRank;
}
std::wstring CurrentNodeName() const
{
return m_myName;
}
bool IsMainNode() const
{
return m_myRank == 0;
} // we are the chosen one--do extra stuff like saving the model to disk
bool IsIdle() const
{
return CurrentNodeRank() >= NumNodesInUse();
} // user had requested to not use this many nodes
bool UsingAllNodes() const
{
return NumNodesInUse() == m_numMPINodes;
} // all nodes participate (used to check whether we can use MPI_Allreduce directly)
size_t MainNodeRank() const
{
return 0;
}
bool IsMultiHost()
{
return m_multiHost;
}
virtual size_t NumNodesInUse() const = 0;
virtual size_t CurrentNodeRank() const = 0;
virtual bool IsMainNode() const = 0;
virtual std::wstring CurrentNodeName() const = 0;
virtual bool IsIdle() const = 0;
virtual bool UsingAllNodes() const = 0;
virtual size_t MainNodeRank() const = 0;
virtual bool IsMultiHost() const = 0;
// -----------------------------------------------------------------------
// data-exchange functions (wrappers around MPI functions)
// -----------------------------------------------------------------------
virtual int Finalize(void) = 0;
virtual int Wait(MPI_Request* request, MPI_Status* status) = 0;
virtual int Waitany(int count, MPI_Request array_of_requests[], int* index, MPI_Status* status) = 0;
virtual int Waitall(int count, MPI_Request array_of_requests[], MPI_Status array_of_statuses[]) = 0;
virtual int Isend(const void* buf, int count, MPI_Datatype datatype, int dest, int tag, /*MPI_Comm comm,*/ MPI_Request* request) = 0;
virtual int Recv(void* buf, int count, MPI_Datatype datatype, int source, int tag, /*MPI_Comm comm,*/ MPI_Status* status) = 0;
virtual int Irecv(void* buf, int count, MPI_Datatype datatype, int source, int tag, /*MPI_Comm comm,*/ MPI_Request* request) = 0;
virtual int Iallreduce(const void* sendbuf, void* recvbuf, int count, MPI_Datatype datatype, MPI_Op op, /*MPI_Comm comm,*/ MPI_Request* request) = 0;
virtual int Abort(int errorcode) = 0;
virtual int Error_string(int errorcode, char* string, int* resultlen) = 0;
// helpers to determine the MPI_Datatype of a pointer
static MPI_Datatype GetDataType(char *)
{
return MPI_CHAR;
}
static MPI_Datatype GetDataType(int *)
{
return MPI_INT;
}
static MPI_Datatype GetDataType(float *)
{
return MPI_FLOAT;
}
static MPI_Datatype GetDataType(double *)
{
return MPI_DOUBLE;
}
static MPI_Datatype GetDataType(size_t *)
{
return sizeof(size_t) == 4 ? MPI_UNSIGNED : MPI_LONG_LONG_INT;
}
static MPI_Datatype GetDataType(char *);
static MPI_Datatype GetDataType(int *);
static MPI_Datatype GetDataType(float *);
static MPI_Datatype GetDataType(double *);
static MPI_Datatype GetDataType(size_t *);
// allreduce of a vector
template <typename VECTORLIKEOBJECT>
void AllReduce(VECTORLIKEOBJECT &accumulator) const
{
auto *dataptr = accumulator.data();
size_t totalnumelements = accumulator.size();
// use MPI to compute the sum over all elements in (dataptr, totalnumelements) and redistribute to all nodes
AllReduce<typename VECTORLIKEOBJECT::value_type>(dataptr, totalnumelements);
}
virtual void AllReduce(std::vector<size_t>& accumulator) const = 0;
virtual void AllReduce(std::vector<int>& accumulator) const = 0;
virtual void AllReduce(std::vector<double>& accumulator) const = 0;
virtual void AllReduce(std::vector<float>& accumulator) const = 0;
// for raw pointer
template <class ElemType>
void AllReduce(ElemType* sendData, size_t numElements, MPI_Op op = MPI_SUM) const
{
AllReduce<ElemType>(static_cast<ElemType*>(MPI_IN_PLACE), sendData, numElements, op);
}
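// A usage sketch for the reduction helpers above, assuming an initialized
// wrapper instance 'mpi' (an MPIWrapperPtr); GetDataType maps the element type
// to the matching MPI_Datatype at compile time:
//
//   std::vector<double> partialSums(16, 1.0);
//   mpi->AllReduce(partialSums);            // vector overload, in place
//   float buf[8] = {0};
//   mpi->AllReduce(buf, 8);                 // raw-pointer overload, MPI_SUM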
virtual void AllReduce(size_t* sendData, size_t numElements, MPI_Op op = MPI_SUM) const = 0;
virtual void AllReduce(int* sendData, size_t numElements, MPI_Op op = MPI_SUM) const = 0;
virtual void AllReduce(double* sendData, size_t numElements, MPI_Op op = MPI_SUM) const = 0;
virtual void AllReduce(float* sendData, size_t numElements, MPI_Op op = MPI_SUM) const = 0;
template <class ElemType>
void AllReduceAsync(ElemType* sendData, size_t numElements, MPI_Request* request, MPI_Op op = MPI_SUM) const
{
AllReduceAsync<ElemType>(static_cast<ElemType*>(MPI_IN_PLACE), sendData, numElements, request, op);
}
virtual void AllReduce(size_t* sendData, size_t* receiveData, size_t numElements, MPI_Op op = MPI_SUM) const = 0;
virtual void AllReduce(int* sendData, int* receiveData, size_t numElements, MPI_Op op = MPI_SUM) const = 0;
virtual void AllReduce(double* sendData, double* receiveData, size_t numElements, MPI_Op op = MPI_SUM) const = 0;
virtual void AllReduce(float* sendData, float* receiveData, size_t numElements, MPI_Op op = MPI_SUM) const = 0;
template <class ElemType>
void AllGatherAsync(const ElemType *sendData, size_t numSendElements, ElemType *receiveData, size_t numRecvElements, MPI_Request* request) const
{
MPI_Iallgather(sendData, (int)numSendElements, GetDataType(receiveData), receiveData, (int)numRecvElements, GetDataType(receiveData), Communicator(), request) || MpiFail("AllGatherAsync: MPI_Iallgather");
}
virtual void AllReduceAsync(size_t* sendData, size_t numElements, MPI_Request* request, MPI_Op op = MPI_SUM) const = 0;
virtual void AllReduceAsync(int* sendData, size_t numElements, MPI_Request* request, MPI_Op op = MPI_SUM) const = 0;
virtual void AllReduceAsync(double* sendData, size_t numElements, MPI_Request* request, MPI_Op op = MPI_SUM) const = 0;
virtual void AllReduceAsync(float* sendData, size_t numElements, MPI_Request* request, MPI_Op op = MPI_SUM) const = 0;
template <class ElemType>
void AllGather(const ElemType *sendData, size_t numSendElements, ElemType *receiveData, size_t numRecvElements) const
{
MPI_Allgather(sendData, (int)numSendElements, GetDataType(receiveData), receiveData, (int)numRecvElements, GetDataType(receiveData), Communicator()) || MpiFail("AllGather: MPI_Allgather");
}
virtual void AllReduceAsync(size_t* sendData, size_t* receiveData, size_t numElements, MPI_Request* request, MPI_Op op = MPI_SUM) const = 0;
virtual void AllReduceAsync(int* sendData, int* receiveData, size_t numElements, MPI_Request* request, MPI_Op op = MPI_SUM) const = 0;
virtual void AllReduceAsync(double* sendData, double* receiveData, size_t numElements, MPI_Request* request, MPI_Op op = MPI_SUM) const = 0;
virtual void AllReduceAsync(float* sendData, float* receiveData, size_t numElements, MPI_Request* request, MPI_Op op = MPI_SUM) const = 0;
template <class ElemType>
void AllReduceAsync(ElemType *sendData, ElemType *receiveData, size_t numElements, MPI_Request* request, MPI_Op op = MPI_SUM) const
{
MPI_Iallreduce(sendData, receiveData, (int)numElements, GetDataType(sendData), op, Communicator(), request) || MpiFail("AllReduceAsync: MPI_Iallreduce");
}
virtual void Bcast(size_t* sendData, size_t numElements, size_t srcRank) = 0;
virtual void Bcast(double* sendData, size_t numElements, size_t srcRank) = 0;
virtual void Bcast(float* sendData, size_t numElements, size_t srcRank) = 0;
virtual void Bcast(void* buffer, int count, MPI_Datatype datatype, int root) = 0;
template <class ElemType>
void AllReduce(ElemType *sendData, ElemType *receiveData, size_t numElements, MPI_Op op = MPI_SUM) const
{
MPI_Allreduce(sendData, receiveData, (int)numElements, GetDataType(sendData), op, Communicator()) || MpiFail("AllReduce: MPI_Allreduce");
}
virtual void AllGatherAsync(const size_t *sendData, size_t numSendElements, size_t *receiveData, size_t numRecvElements, MPI_Request* request) const = 0;
virtual void AllGatherAsync(const int *sendData, size_t numSendElements, int *receiveData, size_t numRecvElements, MPI_Request* request) const = 0;
virtual void AllGatherAsync(const float *sendData, size_t numSendElements, float *receiveData, size_t numRecvElements, MPI_Request* request) const = 0;
virtual void AllGatherAsync(const double *sendData, size_t numSendElements, double *receiveData, size_t numRecvElements, MPI_Request* request) const = 0;
template <class ElemType>
void Gather(const ElemType *sendData, size_t numSendElements, ElemType *receiveData, size_t numRecvElements, size_t rootRank) const
{
MPI_Gather(sendData, (int)numSendElements, GetDataType(receiveData), receiveData, (int)numRecvElements, GetDataType(receiveData), (int)rootRank, Communicator()) || MpiFail("Gather: MPI_Gather");
}
virtual void AllGather(const size_t *sendData, size_t numSendElements, size_t *receiveData, size_t numRecvElements) const = 0;
virtual void AllGather(const int *sendData, size_t numSendElements, int *receiveData, size_t numRecvElements) const = 0;
virtual void AllGather(const float *sendData, size_t numSendElements, float *receiveData, size_t numRecvElements) const = 0;
virtual void AllGather(const double *sendData, size_t numSendElements, double *receiveData, size_t numRecvElements) const = 0;
virtual void Allgather(const void* sendbuf, int sendcount, MPI_Datatype sendtype, void* recvbuf, int recvcount, MPI_Datatype recvtype) const = 0;
template <class ElemType>
void Gatherv(const ElemType *sendData, size_t numSendElements, ElemType *receiveData, int recvCounts[], int offsets[], size_t rootRank) const
{
MPI_Gatherv(sendData, (int)numSendElements, GetDataType(receiveData), receiveData, recvCounts, offsets, GetDataType(receiveData), (int)rootRank, Communicator()) || MpiFail("Gatherv: MPI_Gatherv");
}
virtual void Gather(const size_t *sendData, size_t numSendElements, size_t *receiveData, size_t numRecvElements, size_t rootRank) const = 0;
virtual void Gather(const int *sendData, size_t numSendElements, int *receiveData, size_t numRecvElements, size_t rootRank) const = 0;
virtual void Gather(const float *sendData, size_t numSendElements, float *receiveData, size_t numRecvElements, size_t rootRank) const = 0;
virtual void Gather(const double *sendData, size_t numSendElements, double *receiveData, size_t numRecvElements, size_t rootRank) const = 0;
template <class ElemType>
void Bcast(ElemType *pData, size_t nData, size_t srcRank)
{
MPI_Bcast(pData, (int) nData, GetDataType(pData), (int) srcRank, Communicator()) || MpiFail("Bcast: MPI_Bcast");
}
// wait for an async request to finish
void Wait(MPI_Request* request)
{
MPI_Wait(request, MPI_STATUSES_IGNORE) || MpiFail("Wait: MPI_Wait");
}
void WaitAny(MPI_Request* requests, int numRequests, int* index)
{
MPI_Waitany(numRequests, requests, index, MPI_STATUSES_IGNORE) || MpiFail("WaitAny: MPI_Waitany");
}
virtual void Gatherv(const size_t *sendData, size_t numSendElements, size_t *receiveData, int recvCounts[], int offsets[], size_t rootRank) const = 0;
virtual void Gatherv(const char *sendData, size_t numSendElements, char *receiveData, int recvCounts[], int offsets[], size_t rootRank) const = 0;
virtual void Gatherv(const int *sendData, size_t numSendElements, int *receiveData, int recvCounts[], int offsets[], size_t rootRank) const = 0;
virtual void Gatherv(const float *sendData, size_t numSendElements, float *receiveData, int recvCounts[], int offsets[], size_t rootRank) const = 0;
virtual void Gatherv(const double *sendData, size_t numSendElements, double *receiveData, int recvCounts[], int offsets[], size_t rootRank) const = 0;
// wait for all ranks to reach here
void WaitAll()
{
MPI_Barrier(m_currentComm) || MpiFail("waitall: MPI_Barrier");
}
void WaitAll(std::vector<MPI_Request>& requests)
{
MPI_Waitall((int)requests.size(), &requests[0], MPI_STATUSES_IGNORE) || MpiFail("waitall: MPI_Waitall");
}
virtual int WaitAll() = 0;
virtual void WaitAny(MPI_Request* requests, int numRequests, int* index) = 0;
virtual void Wait(MPI_Request* request) = 0;
virtual int WaitAll(std::vector<MPI_Request>& requests) = 0;
};
}}}
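// A minimal end-to-end sketch of the wrapper lifecycle (hedged; the real setup
// code lives elsewhere in CNTK):
//
//   auto mpi = Microsoft::MSR::CNTK::MPIWrapper::GetInstance(/*create=*/true);
//   if (!mpi->IsIdle())
//   {
//       std::vector<float> grad(1024, 0.0f);
//       mpi->AllReduce(grad); // sum gradients across ranks
//       mpi->WaitAll();       // barrier on the active communicator
//   }
//   Microsoft::MSR::CNTK::MPIWrapper::DeleteInstance();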

File diff not shown because of its large size.

View File

@ -36,15 +36,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// -----------------------------------------------------------------------
template <>
vector<shared_ptr<Matrix<float>>>& MatrixPool::GetReleasedMatrices<float>()
vector<MemRequestInfo<float>>& MatrixPool::GetMemRequestInfoVec<float>()
{
return m_releasedFloatMatrices;
return m_memRequestInfoFloatVec;
}
template <>
vector<shared_ptr<Matrix<double>>>& MatrixPool::GetReleasedMatrices<double>()
vector<MemRequestInfo<double>>& MatrixPool::GetMemRequestInfoVec<double>()
{
return m_releasedDoubleMatrices;
return m_memRequestInfoDoubleVec;
}
// -----------------------------------------------------------------------
@ -463,7 +463,7 @@ bool ComputationNetwork::IsTypicalCriterionNode(ComputationNodeBasePtr nodePtr)
nodePtr->OperationName() == OperationNameOf(CrossEntropyNode) ||
nodePtr->OperationName() == OperationNameOf(ClassBasedCrossEntropyWithSoftmaxNode) ||
nodePtr->OperationName() == OperationNameOf(ClassificationErrorNode) ||
nodePtr->OperationName() == OperationNameOf(EditDistanceErrorNode) ||
nodePtr->OperationName() == OperationNameOf(ForwardBackwardNode) ||
#ifdef COMING_SOON
nodePtr->OperationName() == OperationNameOf(CRFNode) ||
#endif

View File

@ -49,6 +49,7 @@ static shared_ptr<ComputationNode<ElemType>> CreateStandardNode(const std::wstri
else if (nodeType == OperationNameOf(CropNode)) return New<CropNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(CrossEntropyNode)) return New<CrossEntropyNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(CrossEntropyWithSoftmaxNode)) return New<CrossEntropyWithSoftmaxNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(ForwardBackwardNode)) return New<ForwardBackwardNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(DiagonalNode)) return New<DiagonalNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(DiagTimesNode)) return New<DiagTimesNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(DropoutNode)) return New<DropoutNode<ElemType>>(forward<_Types>(_Args)...);
@ -93,6 +94,7 @@ static shared_ptr<ComputationNode<ElemType>> CreateStandardNode(const std::wstri
else if (nodeType == OperationNameOf(PerDimMeanVarNormalizationNode)) return New<PerDimMeanVarNormalizationNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(PerDimMeanVarDeNormalizationNode)) return New<PerDimMeanVarDeNormalizationNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(PassNode)) return New<PassNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(LabelsToGraphNode)) return New<LabelsToGraphNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(PlusNode)) return New<PlusNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(RandomSampleNode)) return New<RandomSampleNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(RandomSampleInclusionFrequencyNode)) return New<RandomSampleInclusionFrequencyNode<ElemType>>(forward<_Types>(_Args)...);
@ -430,9 +432,9 @@ shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Class
}
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::EditDistanceError(const ComputationNodePtr a, const ComputationNodePtr b, float subPen, float delPen, float insPen, bool squashInputs, vector<size_t> samplesToIgnore, const std::wstring nodeName)
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::EditDistanceError(const ComputationNodePtr a, const ComputationNodePtr b, float subPen, float delPen, float insPen, bool squashInputs, vector<size_t> tokensToIgnore, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<EditDistanceErrorNode<ElemType>>(net.GetDeviceId(), subPen, delPen, insPen, squashInputs, samplesToIgnore, nodeName), { a, b });
return net.AddNodeToNetAndAttachInputs(New<EditDistanceErrorNode<ElemType>>(net.GetDeviceId(), nodeName, subPen, delPen, insPen, squashInputs, tokensToIgnore), { a, b });
}
template <class ElemType>
@ -499,6 +501,12 @@ shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Seque
return net.AddNodeToNetAndAttachInputs(New<SequenceWithSoftmaxNode<ElemType>>(net.GetDeviceId(), nodeName), { label, prediction, loglikelihood });
}
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::ForwardBackward(const ComputationNodePtr label, const ComputationNodePtr prediction, int blankTokenId, int delayConstraint, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<ForwardBackwardNode<ElemType>>(net.GetDeviceId(), nodeName, blankTokenId, delayConstraint), { label, prediction });
}
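// A hedged usage sketch of the new criterion (the surrounding variables are
// illustrative): the label input is first converted into a graph, then the
// forward-backward (CTC-style) criterion is attached to it and the predictions.
//
//   auto graph = builder.LabelsToGraph(labels);
//   auto ctc = builder.ForwardBackward(graph, prediction,
//                                      /*blankTokenId=*/132,
//                                      /*delayConstraint=*/-1);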
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::NoiseContrastiveEstimation(const ComputationNodePtr label, const ComputationNodePtr prediction,
const ComputationNodePtr input_weight,
@ -570,6 +578,12 @@ shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Pass(
return net.AddNodeToNetAndAttachInputs(New<PassNode<ElemType>>(net.GetDeviceId(), nodeName), { a });
}
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::LabelsToGraph(const ComputationNodePtr a, const std::wstring& nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<LabelsToGraphNode<ElemType>>(net.GetDeviceId(), nodeName), { a });
}
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::DynamicAxis(const ComputationNodePtr a, const std::wstring& nodeName)
{

View File

@ -126,11 +126,12 @@ public:
ComputationNodePtr CosDistance(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");
ComputationNodePtr CrossEntropy(const ComputationNodePtr label, const ComputationNodePtr prediction, const std::wstring nodeName = L"");
ComputationNodePtr CrossEntropyWithSoftmax(const ComputationNodePtr label, const ComputationNodePtr prediction, const std::wstring nodeName = L"");
ComputationNodePtr ForwardBackward(const ComputationNodePtr label, const ComputationNodePtr prediction, int blankTokenId, int delayConstraint, const std::wstring nodeName = L"");
ComputationNodePtr DiagTimes(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");
ComputationNodePtr Diagonal(const ComputationNodePtr a, const std::wstring nodeName = L"");
ComputationNodePtr Dropout(const ComputationNodePtr a, const std::wstring nodeName = L"");
ComputationNodePtr DummyCriterion(const ComputationNodePtr objectives, const ComputationNodePtr derivatives, const ComputationNodePtr prediction, const std::wstring nodeName = L"");
ComputationNodePtr EditDistanceError(const ComputationNodePtr a, const ComputationNodePtr b, float subPen, float delPen, float insPen, bool squashInputs, vector<size_t> samplesToIgnore, const std::wstring nodeName = L"");
ComputationNodePtr EditDistanceError(const ComputationNodePtr a, const ComputationNodePtr b, float subPen, float delPen, float insPen, bool squashInputs, vector<size_t> tokensToIgnore, const std::wstring nodeName = L"");
ComputationNodePtr ElementTimes(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");
ComputationNodePtr DynamicAxis(const ComputationNodePtr a, const std::wstring& nodeName = L"");
ComputationNodePtr ClassificationError(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");
@ -159,6 +160,7 @@ public:
ComputationNodePtr Negate(const ComputationNodePtr a, const std::wstring nodeName = L"");
ComputationNodePtr NoiseContrastiveEstimation(const ComputationNodePtr label, const ComputationNodePtr prediction, const ComputationNodePtr input_weight, const ComputationNodePtr input_bias, const std::wstring nodeName = L"", NCEEvalMode mode = NCEEvalMode::None);
ComputationNodePtr Pass(const ComputationNodePtr a, const std::wstring& nodeName = L"");
ComputationNodePtr LabelsToGraph(const ComputationNodePtr a, const std::wstring& nodeName = L"");
ComputationNodePtr PastValue(const ComputationNodePtr a, const float initHiddenActivity, const size_t row_size, size_t timeStep, const std::wstring nodeName = L"");
ComputationNodePtr PerDimMeanVarDeNormalization(const ComputationNodePtr feature, const ComputationNodePtr mean, const ComputationNodePtr InvStdDev, const std::wstring nodeName = L"");
ComputationNodePtr PerDimMeanVarNormalization(const ComputationNodePtr feature, const ComputationNodePtr mean, const ComputationNodePtr InvStdDev, const std::wstring nodeName = L"");

View File

@ -943,31 +943,41 @@ void ComputationNetwork::PrintMemorySharingStructure(const vector<ComputationNod
size_t numUnshared = 0;
for (const auto& item : memSharingStructure)
{
if (item.second.size() < 2) // only print actually shared matrices
if (item.second.size() < 2) // unshared matrices
numUnshared++;
else
else // shared matrices
numShared++;
}
fprintf(stderr, "\nMemory Sharing: Out of %d matrices, %d are shared as %d, and %d are not shared.\n\n", (int)numMatrices, (int)(numMatrices - numUnshared), (int)numShared, (int)numUnshared);
fprintf(stderr, "\nMemory Sharing: Out of %d matrices, %d are shared as %d, and %d are not shared.\n", (int)numMatrices, (int)(numMatrices - numUnshared), (int)numShared, (int)numUnshared);
fprintf(stderr, "\nHere are the ones that share memory:\n");
for (const auto& item : memSharingStructure)
{
if (item.second.size() < 2) // only print actually shared matrices
continue;
// Format:
// { node1
// node2 }
// { node3
// node4
// node5 }
// where unshared nodes are not printed.
const char* delim = "\t{ ";
for (const auto& memShareInfo : item.second)
if (item.second.size() >= 2)
{
fprintf(stderr, "%s%ls", delim, memShareInfo.c_str());
delim = "\n\t ";
// Format:
// { node1
// node2 }
// { node3
// node4
// node5 }
const char* delim = "\t{ ";
for (const auto& memShareInfo : item.second)
{
fprintf(stderr, "%s%ls", delim, memShareInfo.c_str());
delim = "\n\t ";
}
fprintf(stderr, " }\n");
}
}
fprintf(stderr, "\nHere are the ones that don't share memory:\n");
for (const auto& item : memSharingStructure)
{
if (item.second.size() < 2)
{
fprintf(stderr, "\t{%ls}\n", item.second.begin()->c_str());
}
fprintf(stderr, " }\n");
}
fprintf(stderr, "\n");
}
@ -1003,7 +1013,7 @@ void ComputationNetwork::AllocateAllMatrices(const std::vector<ComputationNodeBa
// Due to special topology, if a node is solely induced by parameters, its function value should not be shared
MarkValueNonSharableNodes();
bool performingBackPropagation = (trainRootNode != nullptr) || (Globals::ShouldEnableHyperCompressMemory());
bool performingBackPropagation = (trainRootNode != nullptr);
// Construct the composite forward prop eval order by enumerating the
// nodes corresponding to each of our roots in global eval oder
@ -1062,6 +1072,7 @@ void ComputationNetwork::AllocateAllMatrices(const std::vector<ComputationNodeBa
}
}
m_matrixPool.ResetStepCounter();
set<ComputationNodeBasePtr> completedEvaluate;
for (auto& nodeIter : compositeForwardPropEvalOrder)
{
@ -1127,8 +1138,16 @@ void ComputationNetwork::AllocateAllMatrices(const std::vector<ComputationNodeBa
}
}
m_matrixPool.OptimizedMemoryAllocation();
m_areMatricesAllocated = true;
// TODO: At the time of AllocateAllMatrices we don't know the minibatch size. In theory one may allocate memory again once we start to receive
// data from the reader (and the minibatch size is known). For some problems, the minibatch size can change constantly, and there needs to be a
// tradeoff in deciding how frequently to run the optimized memory allocation. For now, we do it only once at the very beginning, for speed.
// TODO: when some matrices are sparse, the memory size request may be wrong. One may need to call OptimizedMemoryAllocation again later,
// once the sparse allocation and release requests are re-processed correctly. Future work.
// print the memory sharing structure
if (TraceLevel() > 0)
PrintMemorySharingStructure(GetAllNodes());
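// Sketch of the new two-phase allocation protocol implied above (flow only,
// not literal code): during the composite forward/backward walk each node
// records its requests; real buffers are assigned only afterwards.
//
//   m_matrixPool.ResetStepCounter();
//   // per node, in execution order:
//   //   RequestMatricesBeforeForwardProp() -> matrixPool.RequestAllocate(...)
//   //   ReleaseMatricesAfterForwardProp()  -> matrixPool.RequestRelease(...)
//   m_matrixPool.OptimizedMemoryAllocation(); // solve, then patch the pointers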

View File

@ -626,14 +626,16 @@ template <class ElemType>
// 'transpose' means print one row per sample (non-transposed is one column per sample).
// 'isSparse' will print all non-zero values as one row (non-transposed, which makes sense for one-hot) or column (transposed).
template <class ElemType>
void ComputationNode<ElemType>::WriteMinibatchWithFormatting(FILE* f, const FrameRange& fr,
void ComputationNode<ElemType>::WriteMinibatchWithFormatting(FILE* f,
const FrameRange& fr,
size_t onlyUpToRow, size_t onlyUpToT, bool transpose, bool isCategoryLabel, bool isSparse,
const vector<string>& labelMapping, const string& sequenceSeparator,
const string& sequencePrologue, const string& sequenceEpilogue,
const string& elementSeparator, const string& sampleSeparator,
string valueFormatString,
bool outputGradient,
bool onlyShowAbsSumForDense) const
bool onlyShowAbsSumForDense,
std::function<std::string(size_t)> getKeyById) const
{
// get minibatch matrix -> matData, matRows, matStride
const Matrix<ElemType>& outputValues = outputGradient ? Gradient() : Value();
@ -716,6 +718,8 @@ void ComputationNode<ElemType>::WriteMinibatchWithFormatting(FILE* f, const Fram
if (s > 0)
fprintfOrDie(f, "%s", sequenceSeparator.c_str());
if (getKeyById)
fprintfOrDie(f, "%s ", getKeyById(seqInfo.seqId).c_str());
fprintfOrDie(f, "%s", seqProl.c_str());
// output it according to our format specification
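// A hedged example of the new getKeyById hook (the lambda below is
// illustrative): the writer prefixes every sequence with a reader-side key.
//
//   node->WriteMinibatchWithFormatting(stderr, FrameRange(), SIZE_MAX, SIZE_MAX,
//       false /*transpose*/, false /*isCategoryLabel*/, false /*isSparse*/,
//       {}, "\n", "", "", " ", "\n", "%.8f",
//       false /*outputGradient*/, false /*onlyShowAbsSumForDense*/,
//       [](size_t seqId) { return "seq-" + std::to_string(seqId); });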

View File

@ -791,8 +791,7 @@ public:
void SetOutputNeededDuringBackprop(bool f) { m_outputNeededDuringBackprop = f; }
bool IsOutputNeededDuringBackprop() const
{
return (!Globals::ShouldEnableShareNodeValueMatrices() && !Globals::ShouldEnableHyperCompressMemory())
|| m_outputNeededDuringBackprop;
return !Globals::ShouldEnableShareNodeValueMatrices() || m_outputNeededDuringBackprop;
}
// -----------------------------------------------------------------------
@ -1680,20 +1679,6 @@ public:
#endif
// tracing
Trace();
// Any memory not needed can be resized to zero immediately when HyperCompressMemory is active. Since the memory won't really be released,
// all these memory blocks are gathered into a memory pool. When the next request comes, the best-fitting block will be chosen.
if (Globals::ShouldEnableHyperCompressMemory())
{
for (auto& input : GetInputs())
{
if (!input->IsOutputNeededDuringBackprop() && input->IsValueSharable())
{
auto inputNodePtr = DownCast(input);
inputNodePtr->Value().Resize(0, 0);
}
}
}
}
virtual void /*IComputationNode::*/BeginBackprop() override
@ -1728,9 +1713,9 @@ public:
}
}
#ifdef _DEBUG
virtual void /*IComputationNode::*/ EndBackprop() override
{
#ifdef _DEBUG
Base::EndBackprop();
#ifdef TRACK_GAP_NANS
for (size_t i = 0; i < m_inputs.size(); i++)
@ -1744,18 +1729,8 @@ public:
}
}
#endif
#endif
// We could release the gradients of value-sharable nodes and all no-longer-used memory generated in the forward pass.
if (IsValueSharable() && Globals::ShouldEnableHyperCompressMemory())
{
if (GradientPtr())
Gradient().Resize(0, 0);
// canceling the graph dependency
if (IsOutputNeededDuringBackprop())
Value().Resize(0, 0);
}
}
#endif
// this is the entry point from Network; while it will call virtual BackpropTo() into the actual node implementation
// TODO: move to -Base (or -Network?)
@ -1816,10 +1791,12 @@ public:
}
// request matrices needed to do node function value evaluation
// for memory pool utilization optimization, the requested pointer is not immediately usable until the entire network has gone through all requests
virtual void RequestMatricesBeforeForwardProp(MatrixPool& matrixPool) override
{
size_t matrixSize = m_sampleLayout.GetNumElements();
if (IsValueSharable())
RequestMatrixFromPool(m_value, matrixPool);
RequestMatrixFromPool(m_value, matrixPool, matrixSize, HasMBLayout());
else
CreateMatrixIfNull(m_value);
}
@ -1844,7 +1821,8 @@ public:
// request matrices that are needed for gradient computation
virtual void RequestMatricesBeforeBackprop(MatrixPool& matrixPool) override
{
RequestMatrixFromPool(m_gradient, matrixPool);
size_t matrixSize = m_sampleLayout.GetNumElements();
RequestMatrixFromPool(m_gradient, matrixPool, matrixSize, HasMBLayout());
}
// release gradient and temp matrices that no longer needed after all the children's gradients are computed.
@ -1889,18 +1867,20 @@ protected:
matrixPtr = make_shared<Matrix<ElemType>>(m_deviceId);
}
void RequestMatrixFromPool(shared_ptr<Matrix<ElemType>>& matrixPtr, MatrixPool& matrixPool)
// matrixSize is the per-sample size; if unknown or hard to estimate, set matrixSize = 0
// if the matrix's size will scale with minibatch size, set mbScale = true
void RequestMatrixFromPool(shared_ptr<Matrix<ElemType>>& matrixPtr, MatrixPool& matrixPool, size_t matrixSize=0, bool mbScale=false)
{
if (matrixPtr == nullptr)
{
matrixPtr = matrixPool.Request<ElemType>(m_deviceId);
matrixPool.RequestAllocate<ElemType>(m_deviceId, &matrixPtr, matrixSize, mbScale);
}
}
void ReleaseMatrixToPool(shared_ptr<Matrix<ElemType>>& matrixPtr, MatrixPool& matrixPool)
{
assert(matrixPtr != nullptr);
matrixPool.Release<ElemType>(matrixPtr);
matrixPool.RequestRelease<ElemType>(&matrixPtr);
}
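// Note the inversion relative to the old pool API (a sketch, assuming the new
// MatrixPool request/release protocol): the pointer handed to RequestAllocate
// is only a placeholder, and becomes the real (possibly shared) buffer once
// MatrixPool::OptimizedMemoryAllocation() has run.
//
//   shared_ptr<Matrix<ElemType>> m_temp;
//   RequestMatrixFromPool(m_temp, matrixPool, /*matrixSize=*/0, /*mbScale=*/false);
//   // ... further requests/releases are recorded ...
//   // after OptimizedMemoryAllocation(), m_temp aliases its assigned buffer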
public:
@ -1915,7 +1895,8 @@ public:
const std::vector<std::string>& labelMapping, const std::string& sequenceSeparator,
const std::string& sequencePrologue, const std::string& sequenceEpilogue, const std::string& elementSeparator,
const std::string& sampleSeparator, std::string valueFormatString,
bool outputGradient = false, bool onlyShowAbsSumForDense = false) const;
bool outputGradient = false, bool onlyShowAbsSumForDense = false,
std::function<std::string(size_t)> getKeyById = std::function<std::string(size_t)>()) const;
// simple helper to log the content of a minibatch
void DebugLogMinibatch(bool outputGradient = false) const

View File

@ -220,7 +220,8 @@ protected:
ImageLayoutKind m_imageLayout;
size_t m_maxTempMemSizeInSamples;
shared_ptr<Matrix<ElemType>> m_tempMatrix;
shared_ptr<Matrix<ElemType>> m_tempMatrixForward;
shared_ptr<Matrix<ElemType>> m_tempMatrixBackward;
std::unique_ptr<ConvolutionEngine<ElemType>> m_convEng;
};
@ -239,7 +240,8 @@ protected: \
using Base::m_transpose; \
using Base::m_imageLayout; \
using Base::m_maxTempMemSizeInSamples; \
using Base::m_tempMatrix; \
using Base::m_tempMatrixForward; \
using Base::m_tempMatrixBackward; \
using Base::m_convEng; \
using Base::InferReductionDims; \
public:
@ -351,13 +353,13 @@ public:
const Matrix<ElemType>& input0 = InputRef(0).ValueAsMatrix();
Matrix<ElemType> sliceInput1Value = InputRef(1).ValueFor(fr);
if (!m_transpose)
m_convEng->Forward(sliceInput1Value, input0, sliceOutputValue, *m_tempMatrix);
m_convEng->Forward(sliceInput1Value, input0, sliceOutputValue, *m_tempMatrixForward);
else
{
// BackwardData adds results to the output so need to zero them out first.
// REVIEW alexeyk: should be rolled into BackwardData itself.
sliceOutputValue.SetValue(0);
m_convEng->BackwardData(sliceInput1Value, input0, sliceOutputValue, /*accumulateGradient =*/ true, *m_tempMatrix);
m_convEng->BackwardData(sliceInput1Value, input0, sliceOutputValue, /*accumulateGradient =*/ true, *m_tempMatrixForward);
}
}
@ -369,20 +371,20 @@ public:
auto& grad = InputRef(0).GradientAsMatrix();
auto sliceInput1Value = InputRef(1).ValueFor(fr);
if (!m_transpose)
m_convEng->BackwardKernel(sliceOutputGrad, sliceInput1Value, grad, !Input(inputIndex)->ParentOverwritesGradient(), fr.IsAllFrames(), *m_tempMatrix);
m_convEng->BackwardKernel(sliceOutputGrad, sliceInput1Value, grad, !Input(inputIndex)->ParentOverwritesGradient(), fr.IsAllFrames(), *m_tempMatrixBackward);
else
m_convEng->BackwardKernel(sliceInput1Value, sliceOutputGrad, grad, !Input(inputIndex)->ParentOverwritesGradient(), fr.IsAllFrames(), *m_tempMatrix);
m_convEng->BackwardKernel(sliceInput1Value, sliceOutputGrad, grad, !Input(inputIndex)->ParentOverwritesGradient(), fr.IsAllFrames(), *m_tempMatrixBackward);
}
else if (inputIndex == 1) // derivative with respect to the input feature
{
auto& input0 = InputRef(0).ValueAsMatrix();
auto sliceInput1Grad = InputRef(1).GradientFor(fr);
if (!m_transpose)
m_convEng->BackwardData(sliceOutputGrad, input0, sliceInput1Grad, !Input(inputIndex)->ParentOverwritesGradient(), *m_tempMatrix);
m_convEng->BackwardData(sliceOutputGrad, input0, sliceInput1Grad, !Input(inputIndex)->ParentOverwritesGradient(), *m_tempMatrixBackward);
else
{
// REVIEW alexeyk: Forward overwrites values in sliceInput1Grad. Should handle correctly instead.
m_convEng->Forward(sliceOutputGrad, input0, sliceInput1Grad, *m_tempMatrix);
m_convEng->Forward(sliceOutputGrad, input0, sliceInput1Grad, *m_tempMatrixBackward);
}
}
}
@ -500,25 +502,26 @@ public:
void RequestMatricesBeforeForwardProp(MatrixPool& matrixPool) override
{
Base::RequestMatricesBeforeForwardProp(matrixPool);
RequestMatrixFromPool(m_tempMatrix, matrixPool);
RequestMatrixFromPool(m_tempMatrixForward, matrixPool);
}
//void ReleaseMatricesAfterForwardProp(MatrixPool& matrixPool) override
//{
// Base::ReleaseMatricesAfterForwardProp(matrixPool);
// ReleaseMatrixToPool(m_tempMatrix, matrixPool);
//}
// m_tempMatrixForward is only used as a workspace for convolution; we can release it immediately afterwards
void ReleaseMatricesAfterForwardProp(MatrixPool& matrixPool) override
{
Base::ReleaseMatricesAfterForwardProp(matrixPool);
ReleaseMatrixToPool(m_tempMatrixForward, matrixPool);
}
//void RequestMatricesBeforeBackprop(MatrixPool& matrixPool) override
//{
// Base::RequestMatricesBeforeBackprop(matrixPool);
// RequestMatrixFromPool(m_tempMatrix, matrixPool);
//}
void RequestMatricesBeforeBackprop(MatrixPool& matrixPool) override
{
Base::RequestMatricesBeforeBackprop(matrixPool);
RequestMatrixFromPool(m_tempMatrixBackward, matrixPool);
}
void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool) override
{
Base::ReleaseMatricesAfterBackprop(matrixPool);
ReleaseMatrixToPool(m_tempMatrix, matrixPool);
ReleaseMatrixToPool(m_tempMatrixBackward, matrixPool);
}
void SetmMaxTempMemSizeInSamples(const size_t maxTempMemSizeInSamples)
@ -530,6 +533,8 @@ public:
bool IsConvolution2D() const { return m_convolution2D; }
bool OutputUsedInComputingInputNodesGradients() const override { return false; }
private:
using TransformerNode::m_transforms;
using ConvolutionNodeBase<ElemType>::ComputeFilterTransform;
@ -600,9 +605,12 @@ public:
void RequestMatricesBeforeForwardProp(MatrixPool& matrixPool) override
{
Base::RequestMatricesBeforeForwardProp(matrixPool);
RequestMatrixFromPool(m_tempMatrix, matrixPool);
size_t matrixSize = m_sampleLayout.GetNumElements();
RequestMatrixFromPool(m_tempMatrix, matrixPool, matrixSize, true);
}
// m_tempMatrix cannot be released after Forward Prop because its content (argmax) is used for back prop.
void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool) override
{
Base::ReleaseMatricesAfterBackprop(matrixPool);

View File

@ -461,7 +461,7 @@ template class NDCG1EvalNode<double>;
// Edit distance error evaluation node with the option of specifying penalty of substitution, deletion and insertion, as well as squashing the input sequences and ignoring certain samples.
// Using the classic DP algorithm as described in https://en.wikipedia.org/wiki/Edit_distance, adjusted to take into account the penalties.
//
// The node allows to squash sequences of repeating labels and ignore certain labels. For example, if squashInputs is true and samplesToIgnore contains label '-' then
// The node allows squashing sequences of repeating labels and ignoring certain labels. For example, if squashInputs is true and tokensToIgnore contains the label '-', then
// given the first input sequence s1="a-ab-" and the second s2="-aa--abb", the edit distance will be computed against s1' = "aab" and s2' = "aab".
//
// The returned error is computed as: EditDistance(s1,s2) * length(s1') / length(s1)
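// A self-contained sketch (not the node's actual member) of the weighted edit
// distance DP referenced above; the penalties correspond to subPen/delPen/insPen
// (requires <vector> and <algorithm>):
//
//   static float WeightedEditDistance(const std::vector<int>& s1,
//                                     const std::vector<int>& s2,
//                                     float subPen, float delPen, float insPen)
//   {
//       std::vector<std::vector<float>> d(s1.size() + 1,
//                                         std::vector<float>(s2.size() + 1, 0.0f));
//       for (size_t i = 1; i <= s1.size(); i++) d[i][0] = d[i - 1][0] + delPen;
//       for (size_t j = 1; j <= s2.size(); j++) d[0][j] = d[0][j - 1] + insPen;
//       for (size_t i = 1; i <= s1.size(); i++)
//           for (size_t j = 1; j <= s2.size(); j++)
//               d[i][j] = std::min({ d[i - 1][j - 1] + (s1[i - 1] == s2[j - 1] ? 0.0f : subPen),
//                                    d[i - 1][j] + delPen,     // delete from s1
//                                    d[i][j - 1] + insPen });  // insert into s1
//       return d[s1.size()][s2.size()];
//   }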
@ -480,21 +480,17 @@ public:
// delPen - deletion penalty
// insPen - insertion penalty
// squashInputs - whether to merge sequences of identical samples.
// samplesToIgnore - list of samples to ignore during edit distance evaluation
EditDistanceErrorNode(DEVICEID_TYPE deviceId, float subPen, float delPen, float insPen, bool squashInputs, std::vector<size_t> samplesToIgnore, const wstring & name)
: Base(deviceId, name), m_subPen(subPen), m_delPen(delPen), m_insPen(insPen), m_squashInputs(squashInputs), m_SamplesToIgnore(samplesToIgnore)
// tokensToIgnore - list of tokens to ignore during edit distance evaluation
EditDistanceErrorNode(DEVICEID_TYPE deviceId, const wstring & name, float subPen = 0.0f, float delPen = 0.0f, float insPen = 0.0f, bool squashInputs = false, vector<size_t> tokensToIgnore = {})
: Base(deviceId, name), m_SubPen(subPen), m_DelPen(delPen), m_InsPen(insPen), m_SquashInputs(squashInputs), m_tokensToIgnore(tokensToIgnore)
{
}
EditDistanceErrorNode(const ScriptableObjects::IConfigRecordPtr configp)
: EditDistanceErrorNode(configp->Get(L"deviceId"), configp->Get(L"subPen"), configp->Get(L"delPen"), configp->Get(L"insPen"), configp->Get(L"squashInputs"), configp->Get(L"samplesToIgnore"), L"<placeholder>")
: EditDistanceErrorNode(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"subPen"), configp->Get(L"delPen"), configp->Get(L"insPen"), configp->Get(L"squashInputs"), {})
{
AttachInputsFromConfig(configp, this->GetExpectedNumInputs());
}
EditDistanceErrorNode(DEVICEID_TYPE deviceId, const wstring& name)
: Base(deviceId, name)
{
m_tokensToIgnore = ScriptableObjects::ConfigArray::FlattenedVectorFrom<size_t>(configp->Get(L"tokensToIgnore"));
}
virtual void BackpropToNonLooping(size_t /*inputIndex*/) override
@ -515,7 +511,7 @@ public:
MaskMissingColumnsToZero(*m_maxIndexes0, Input(0)->GetMBLayout(), frameRange);
MaskMissingColumnsToZero(*m_maxIndexes1, Input(1)->GetMBLayout(), frameRange);
Value()(0, 0) = ComputeEditDistanceError(*m_maxIndexes0, *m_maxIndexes1, Input(0)->GetMBLayout(), m_subPen, m_delPen, m_insPen, m_squashInputs, m_SamplesToIgnore);
Value()(0, 0) = ComputeEditDistanceError(*m_maxIndexes0, *m_maxIndexes1, Input(0)->GetMBLayout(), m_SubPen, m_DelPen, m_InsPen, m_SquashInputs, m_tokensToIgnore);
}
virtual void Validate(bool isFinalValidationPass) override
@ -544,11 +540,11 @@ public:
node->m_maxIndexes0 = m_maxIndexes0;
node->m_maxIndexes1 = m_maxIndexes1;
node->m_maxValues = m_maxValues;
node->m_squashInputs = m_squashInputs;
node->m_subPen = m_subPen;
node->m_delPen = m_delPen;
node->m_insPen = m_insPen;
node->m_SamplesToIgnore = m_SamplesToIgnore;
node->m_SquashInputs = m_SquashInputs;
node->m_SubPen = m_SubPen;
node->m_DelPen = m_DelPen;
node->m_InsPen = m_InsPen;
node->m_tokensToIgnore = m_tokensToIgnore;
}
}
@ -578,9 +574,9 @@ public:
// delPen - deletion penalty
// insPen - insertion penalty
// squashInputs - whether to merge sequences of identical samples.
// samplesToIgnore - list of samples to ignore during edit distance evaluation
// tokensToIgnore - list of tokens to ignore during edit distance evaluation
static ElemType ComputeEditDistanceError(Matrix<ElemType>& firstSeq, const Matrix<ElemType> & secondSeq, MBLayoutPtr pMBLayout,
float subPen, float delPen, float insPen, bool squashInputs, const vector<size_t>& samplesToIgnore)
float subPen, float delPen, float insPen, bool squashInputs, const vector<size_t>& tokensToIgnore)
{
std::vector<int> firstSeqVec, secondSeqVec;
@ -614,8 +610,8 @@ public:
auto columnIndices = pMBLayout->GetColumnIndices(sequence);
ExtractSampleSequence(firstSeq, columnIndices, squashInputs, samplesToIgnore, firstSeqVec);
ExtractSampleSequence(secondSeq, columnIndices, squashInputs, samplesToIgnore, secondSeqVec);
ExtractSampleSequence(firstSeq, columnIndices, squashInputs, tokensToIgnore, firstSeqVec);
ExtractSampleSequence(secondSeq, columnIndices, squashInputs, tokensToIgnore, secondSeqVec);
//calculate edit distance
size_t firstSize = firstSeqVec.size();
@ -690,29 +686,29 @@ public:
return (ElemType)(wrongSampleNum * totalframeNum / totalSampleNum);
}
float SubstitutionPenalty() const { return m_subPen; }
float DeletionPenalty() const { return m_delPen; }
float InsertionPenalty() const { return m_insPen; }
bool SquashInputs() const { return m_squashInputs; }
std::vector<size_t> SamplesToIgnore() const { return m_SamplesToIgnore; }
float SubstitutionPenalty() const { return m_SubPen; }
float DeletionPenalty() const { return m_DelPen; }
float InsertionPenalty() const { return m_InsPen; }
bool SquashInputs() const { return m_SquashInputs; }
std::vector<size_t> TokensToIgnore() const { return m_tokensToIgnore; }
private:
shared_ptr<Matrix<ElemType>> m_maxIndexes0, m_maxIndexes1;
shared_ptr<Matrix<ElemType>> m_maxValues;
bool m_squashInputs;
float m_subPen;
float m_delPen;
float m_insPen;
std::vector<size_t> m_SamplesToIgnore;
bool m_SquashInputs;
float m_SubPen;
float m_DelPen;
float m_InsPen;
std::vector<size_t> m_tokensToIgnore;
// Clear out_SampleSeqVec and extract a vector of samples from the matrix into out_SampleSeqVec.
static void ExtractSampleSequence(const Matrix<ElemType>& firstSeq, vector<size_t>& columnIndices, bool squashInputs, const vector<size_t>& samplesToIgnore, std::vector<int>& out_SampleSeqVec)
static void ExtractSampleSequence(const Matrix<ElemType>& firstSeq, vector<size_t>& columnIndices, bool squashInputs, const vector<size_t>& tokensToIgnore, std::vector<int>& out_SampleSeqVec)
{
out_SampleSeqVec.clear();
// Get the first element in the sequence
size_t lastId = (int)firstSeq(0, columnIndices[0]);
if (std::find(samplesToIgnore.begin(), samplesToIgnore.end(), lastId) == samplesToIgnore.end())
if (std::find(tokensToIgnore.begin(), tokensToIgnore.end(), lastId) == tokensToIgnore.end())
out_SampleSeqVec.push_back(lastId);
// Remaining elements
@ -725,7 +721,7 @@ private:
if (lastId != refId)
{
lastId = refId;
if (std::find(samplesToIgnore.begin(), samplesToIgnore.end(), refId) == samplesToIgnore.end())
if (std::find(tokensToIgnore.begin(), tokensToIgnore.end(), refId) == tokensToIgnore.end())
out_SampleSeqVec.push_back(refId);
}
}
@ -735,7 +731,7 @@ private:
for (size_t i = 1; i < columnIndices.size(); i++)
{
auto refId = (int)firstSeq(0, columnIndices[i]);
if (std::find(samplesToIgnore.begin(), samplesToIgnore.end(), refId) == samplesToIgnore.end())
if (std::find(tokensToIgnore.begin(), tokensToIgnore.end(), refId) == tokensToIgnore.end())
out_SampleSeqVec.push_back(refId);
}
}

View File

@ -8,6 +8,8 @@
#include <string>
#include <stdexcept>
#include <vector>
#include <set>
#include <utility>
#include <algorithm>
#include <stdlib.h>
@ -17,59 +19,238 @@
namespace Microsoft { namespace MSR { namespace CNTK {
template <class ElemType>
struct MemRequestInfo
{
DEVICEID_TYPE deviceId; // which device to allocate data
shared_ptr<Matrix<ElemType>>*pMatrixPtr; // memory pointer
size_t matrixSize; // memory size
bool mbScale; // whether the memory shall be scaled by minibatch size
int allocStep; // at what step counter memory allocation is requested
int releaseStep; // at what step counter memory release is requested
int memoryId; // integer indexing the memory buffer ID
MemRequestInfo(DEVICEID_TYPE deviceId, shared_ptr<Matrix<ElemType>>*pMatrixPtr, size_t matrixSize, bool mbScale, int allocStep)
:deviceId(deviceId), pMatrixPtr(pMatrixPtr), matrixSize(matrixSize), mbScale(mbScale), allocStep(allocStep), releaseStep(INT_MAX), memoryId(-1)
{
}
void SetReleaseStep(int step) { releaseStep = step; }
void SetMemoryId(int id) { memoryId = id; }
};
template <class ElemType>
struct greater_than_mem_req_size
{
inline bool operator() (const MemRequestInfo<ElemType>& info1, const MemRequestInfo<ElemType>& info2)
{
return (info1.matrixSize > info2.matrixSize);
}
};
struct MemAllocInfo
{
int memoryId;
size_t memorySize;
vector<pair<int, int>> occupancy;
MemAllocInfo(int memoryId, size_t memorySize, vector<pair<int, int>> occ)
:memoryId(memoryId), memorySize(memorySize), occupancy(occ)
{
}
};
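// Worked example of the bookkeeping above (illustrative): a request allocated
// at step 3 and released at step 7 occupies the closed interval [3, 7]. Two
// requests may share one buffer (the same memoryId) only if their occupancy
// intervals do not overlap: [3, 7] and [8, 12] can share, while [3, 7] and
// [5, 9] cannot.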
// MatrixPool -- class to support memory sharing
// Despite the rather general name of this class, it is specifically designed to support the memory sharing of ComputationNodes.
// Note: see #define SUPRESS_MEMSHARING below for how to temporarily disable memory sharing altogether, for debugging
class MatrixPool
{
vector<shared_ptr<Matrix<float>>> m_releasedFloatMatrices;
vector<shared_ptr<Matrix<double>>> m_releasedDoubleMatrices;
vector<MemRequestInfo<float>> m_memRequestInfoFloatVec;
vector<MemRequestInfo<double>> m_memRequestInfoDoubleVec;
set<DEVICEID_TYPE> m_deviceIDSet;
int m_stepCounter;
template <class ElemType>
vector<shared_ptr<Matrix<ElemType>>>& GetReleasedMatrices();
vector<MemRequestInfo<ElemType>>& GetMemRequestInfoVec();
public:
// release here means the matrix can be put back and shared by others
template <class ElemType>
void Release(shared_ptr<Matrix<ElemType>> freeMatrix)
{
if (freeMatrix == nullptr || freeMatrix->GetMatrixType() == SPARSE)
LogicError("MatrixPool::Release: freeMatrix should not be null or sparse.");
//#define SUPRESS_MEMSHARING // #define this to disable memory sharing through this structure
// TODO: Make this a runtime option.
#ifndef SUPRESS_MEMSHARING
vector<shared_ptr<Matrix<ElemType>>>& releasedMatrices = GetReleasedMatrices<ElemType>();
#ifdef _DEBUG
for (int i = 0; i < releasedMatrices.size(); i++)
{
if (releasedMatrices[i] == freeMatrix)
RuntimeError("MatrixPool::Release: freeMatrix is already in the released pool.");
}
void ResetStepCounter() { m_stepCounter = 0; };
#endif
releasedMatrices.push_back(freeMatrix);
#endif
template <class ElemType>
void RequestRelease(shared_ptr<Matrix<ElemType>> *pMatrixPtr)
{
vector<MemRequestInfo<ElemType>>& memInfoVec = GetMemRequestInfoVec<ElemType>();
// iterate through the vector and find the request entry for this pointer
for (auto& memInfo : memInfoVec)
{
if (memInfo.pMatrixPtr == pMatrixPtr)
{
memInfo.SetReleaseStep(m_stepCounter);
break;
}
}
m_stepCounter++;
}
template <class ElemType>
shared_ptr<Matrix<ElemType>> Request(DEVICEID_TYPE deviceId)
void RequestAllocate(DEVICEID_TYPE deviceId, shared_ptr<Matrix<ElemType>>*pMatrixPtr, size_t matrixSize, bool mbScale)
{
vector<shared_ptr<Matrix<ElemType>>>& releasedMatrices = GetReleasedMatrices<ElemType>();
shared_ptr<Matrix<ElemType>> matrixPtr;
if (releasedMatrices.empty())
vector<MemRequestInfo<ElemType>>& memInfoVec = GetMemRequestInfoVec<ElemType>();
MemRequestInfo<ElemType> memInfo(deviceId, pMatrixPtr, matrixSize, mbScale, m_stepCounter);
memInfoVec.push_back(memInfo);
m_deviceIDSet.insert(deviceId);
m_stepCounter++;
// assign a temporary pointer; it will be replaced later unless the matrix is sparse
*pMatrixPtr = make_shared<Matrix<ElemType>>(deviceId);
}
void OptimizedMemoryAllocation()
{
// MatrixPool is not templated, so we call both float and double versions here
OptimizedMemoryAllocationFunc<float>();
OptimizedMemoryAllocationFunc<double>();
return;
}
private:
bool CheckOverlap(pair<int, int> occ, vector<pair<int, int>>& occVec)
{
bool bRet = false;
for (auto& o : occVec)
{
matrixPtr = make_shared<Matrix<ElemType>>(deviceId);
if (occ.first <= o.second && occ.second >= o.first)
{
bRet = true;
break;
}
}
else
//#define SUPRESS_MEMSHARING // #define this to disable memory sharing by always returning true
// TODO: Make this a runtime option.
#ifdef SUPRESS_MEMSHARING
bRet = true;
#endif
return bRet;
}
template <class ElemType>
void OptimizedMemoryAllocationFunc()
{
vector<MemRequestInfo<ElemType>>& memInfoVec = GetMemRequestInfoVec<ElemType>();
if (memInfoVec.empty())
return;
// remove all requests that have been marked as sparse matrices; those will not participate in memory sharing
for (auto iter = memInfoVec.begin(); iter != memInfoVec.end(); )
{
matrixPtr = releasedMatrices.back();
releasedMatrices.pop_back();
if ((*(iter->pMatrixPtr))->GetMatrixType() == SPARSE)
memInfoVec.erase(iter);
else
iter++;
}
if (!matrixPtr) // this can't really happen
LogicError("MatrixPool::Request: failed to get a valid matrix.");
// sort the memory request from largest size to smallest
std::sort(memInfoVec.begin(), memInfoVec.end(), greater_than_mem_req_size<ElemType>());
return matrixPtr;
for (auto& devId : m_deviceIDSet)
{
// memAllocInfoVec is a sorted list of memory allocations from smallest to largest in memory size
vector<MemAllocInfo> memAllocInfoVec;
int memoryCounter = 0;
            // we start with memory requests that scale with the minibatch size (usually those require larger memory)
for (auto& memInfo : memInfoVec)
{
// check if it's the proper device
if (memInfo.deviceId != devId || !memInfo.mbScale)
continue;
if (!memAllocInfoVec.empty())
{
// since we assign from highest memory to lowest, every memory that has been allocated can accommodate the
// current memory request, unless there is a conflict (overlap)
auto iter = memAllocInfoVec.begin();
while (iter != memAllocInfoVec.end() && CheckOverlap(make_pair(memInfo.allocStep, memInfo.releaseStep), iter->occupancy))
iter++;
if (iter == memAllocInfoVec.end())
{
// no current memory can be assigned, need to create a new one
vector<pair<int, int>> occ;
occ.push_back(make_pair(memInfo.allocStep, memInfo.releaseStep));
MemAllocInfo ma(memoryCounter, memInfo.matrixSize, occ);
// insert in the front of the vector to maintain sorted order
memAllocInfoVec.insert(memAllocInfoVec.begin(), ma);
memInfo.SetMemoryId(memoryCounter);
memoryCounter++;
}
else
{
iter->occupancy.push_back(make_pair(memInfo.allocStep, memInfo.releaseStep));
memInfo.SetMemoryId(iter->memoryId);
}
}
else
{
vector<pair<int, int>> occ;
occ.push_back(make_pair(memInfo.allocStep, memInfo.releaseStep));
MemAllocInfo ma(memoryCounter, memInfo.matrixSize, occ);
memAllocInfoVec.push_back(ma);
memInfo.SetMemoryId(memoryCounter);
memoryCounter++;
}
}
            // rescan the request list, and this time allocate for those that don't depend on the minibatch size
for (auto& memInfo : memInfoVec)
{
// check if it's the proper device
if (memInfo.deviceId != devId || memInfo.mbScale)
continue;
if (!memAllocInfoVec.empty())
{
// the memory allocation vector is sorted by size. We find the largest available buffer that doesn't have time overlap
auto workingAlloc = memAllocInfoVec.end();
for (auto iter = memAllocInfoVec.begin(); iter != memAllocInfoVec.end(); iter++)
{
if (!CheckOverlap(make_pair(memInfo.allocStep, memInfo.releaseStep), iter->occupancy))
workingAlloc = iter;
}
if (workingAlloc == memAllocInfoVec.end()) // nothing works
{
vector<pair<int, int>> occ;
occ.push_back(make_pair(memInfo.allocStep, memInfo.releaseStep));
MemAllocInfo ma(memoryCounter, memInfo.matrixSize, occ);
memAllocInfoVec.push_back(ma); // add as the last one
memInfo.SetMemoryId(memoryCounter);
memoryCounter++;
}
else
{
workingAlloc->occupancy.push_back(make_pair(memInfo.allocStep, memInfo.releaseStep));
memInfo.SetMemoryId(workingAlloc->memoryId);
}
}
else
{
vector<pair<int, int>> occ;
occ.push_back(make_pair(memInfo.allocStep, memInfo.releaseStep));
MemAllocInfo ma(memoryCounter, memInfo.matrixSize, occ);
memAllocInfoVec.push_back(ma);
memInfo.SetMemoryId(memoryCounter);
memoryCounter++;
}
}
// now assign the actual pointers
for (int i = 0; i < memoryCounter; i++)
{
auto matrixPtr = make_shared<Matrix<ElemType>>(devId);
if (!matrixPtr) // this can't really happen, because we haven't started allocating memory yet
LogicError("MatrixPool: failed to get a valid matrix.");
for (auto& memInfo : memInfoVec)
{
if (memInfo.deviceId == devId && memInfo.memoryId == i)
*memInfo.pMatrixPtr = matrixPtr;
}
}
}
}
};
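
The sharing decision above reduces to interval overlap on [allocStep, releaseStep] lifetimes: two requests may share one physical buffer iff their lifetimes do not overlap. A minimal self-contained sketch of the same greedy assignment (illustrative only, not part of this commit):

#include <cstdio>
#include <utility>
#include <vector>

using Interval = std::pair<int, int>; // [allocStep, releaseStep]

static bool Overlaps(Interval a, const std::vector<Interval>& occupied)
{
    for (const auto& o : occupied)
        if (a.first <= o.second && a.second >= o.first) // same test as MatrixPool::CheckOverlap
            return true;
    return false;
}

int main()
{
    // lifetimes of four matrix requests, assumed already sorted by decreasing size
    std::vector<Interval> requests = { { 0, 5 }, { 1, 3 }, { 6, 9 }, { 4, 8 } };
    std::vector<std::vector<Interval>> buffers; // occupancy list per physical buffer

    for (const auto& req : requests)
    {
        size_t b = 0;
        while (b < buffers.size() && Overlaps(req, buffers[b]))
            b++;
        if (b == buffers.size())
            buffers.emplace_back(); // no compatible buffer, create a new one
        buffers[b].push_back(req);
        std::printf("request [%d, %d] -> buffer %zu\n", req.first, req.second, b);
    }
    std::printf("%zu buffers serve %zu requests\n", buffers.size(), requests.size());
    return 0;
}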

View file

@ -149,6 +149,7 @@ DeclareUnaryElementWiseWithOpCodeNode(Floor, Floor, None,
DeclareUnaryElementWiseWithOpCodeNode(Log, Log, ElementwiseProductWithLogDerivativeFromOutput, binaryWithOutputGradient);
DeclareUnaryElementWiseWithOpCodeNode(Negate, Negate, Negate, unaryGradient);
DeclareUnaryElementWiseWithOpCodeNode(Pass, Copy, Copy, unaryGradient);
DeclareUnaryElementWiseWithOpCodeNode(LabelsToGraph, Copy, Copy, unaryGradient);
DeclareUnaryElementWiseWithOpCodeNode(Reciprocal, Reciprocal, ElementwiseProductWithReciprocalDerivative, binaryWithOutputGradient);
DeclareUnaryElementWiseWithOpCodeNode(RectifiedLinear, LinearRectifier, ElementwiseProductWithLinearRectifierDerivativeFromOutput, binaryWithOutputGradient);
DeclareUnaryElementWiseWithOpCodeNode(Sigmoid, Sigmoid, ElementwiseProductWithSigmoidDerivativeFromOutput, binaryWithOutputGradient);

View file

@ -75,11 +75,9 @@ public:
ReleaseMatrixToPool(m_transposedOutput, matrixPool);
ReleaseMatrixToPool(m_transposedDInput, matrixPool);
ReleaseMatrixToPool(m_transposedDOutput, matrixPool);
#if 0
ReleaseMatrixToPool(m_reserve, matrixPool);
ReleaseMatrixToPool(m_workspace, matrixPool);
ReleaseMatrixToPool(m_packingIndex, matrixPool);
#endif
}
virtual bool OutputUsedInComputingInputNodesGradients() const { return false; }

View file

@ -7,6 +7,7 @@
#include "Basics.h"
#include "ComputationNode.h"
#include "gammacalculation.h"
#include "NonlinearityNodes.h"
#include <map>
#include <string>
@ -611,7 +612,7 @@ public:
RequestMatrixFromPool(m_gammaFromLattice, matrixPool);
}
// request matrices needed to do node function value evaluation
// release gradient and temp matrices that are no longer needed after all the children's gradients are computed.
virtual void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool)
{
Base::ReleaseMatricesAfterBackprop(matrixPool);
@ -722,10 +723,7 @@ public:
}
}
virtual bool OutputUsedInComputingInputNodesGradients() const override
{
return false;
}
virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; }
virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override
{
@ -765,4 +763,192 @@ public:
template class DummyCriterionNode<float>;
template class DummyCriterionNode<double>;
// -----------------------------------------------------------------------
// ForwardBackwardNode (graph, prediction, delayConstraint)
// CTC training criterion, primarily based on the paper "Connectionist Temporal Classification: Labelling Unsegmented
// Sequence Data with Recurrent Neural Networks", ftp://ftp.idsia.ch/pub/juergen/icml2006.pdf
//
// delayConstraint -- label output delay constraint introduced during training that allows shorter delay during inference. It uses the original time information to enforce that CTC tokens only get aligned within a time margin.
//                    Setting this parameter smaller will result in shorter delay between label outputs during decoding, but may hurt accuracy.
//                    delayConstraint=-1 means no constraint
// -----------------------------------------------------------------------
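// For reference, a restatement (assuming the standard formulation of the paper cited above) of the forward
// recursion this node evaluates in the log domain, over the blank-augmented label sequence l' of length 2|l|+1,
// with network output y^t_k at frame t:
//
//   \alpha_t(s) = \begin{cases}
//       (\alpha_{t-1}(s) + \alpha_{t-1}(s-1)) \, y^t_{l'_s}                      & \text{if } l'_s = \text{blank or } l'_s = l'_{s-2} \\
//       (\alpha_{t-1}(s) + \alpha_{t-1}(s-1) + \alpha_{t-1}(s-2)) \, y^t_{l'_s}  & \text{otherwise}
//   \end{cases}
//
// The beta recursion is the time-reversed analogue, and the criterion value is -log p(l|x) with
// p(l|x) = \alpha_T(2|l|+1) + \alpha_T(2|l|).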
template<class ElemType>
class ForwardBackwardNode : public ComputationNodeNonLooping<ElemType>, public NumInputs<2>
{
typedef ComputationNodeNonLooping<ElemType> Base;
UsingComputationNodeMembersBoilerplate;
static const std::wstring TypeName()
{
return L"ForwardBackward";
}
public:
DeclareConstructorFromConfigWithNumInputs(ForwardBackwardNode);
ForwardBackwardNode(DEVICEID_TYPE deviceId, const wstring & name, int blankTokenId=INT_MIN, int delayConstraint=-1) :
Base(deviceId, name), m_blankTokenId(blankTokenId), m_delayConstraint(delayConstraint)
{
}
// Compute gradients to input observations, the weights to the observations, and the class log posterior probabilities
virtual void BackpropToNonLooping(size_t inputIndex) override
{
// Left node must be a scalar
if (inputIndex == 0) //left derivative
{
BackpropToLeft(*m_logSoftmaxOfRight, InputRef(inputIndex).Gradient(), Gradient());
}
else if (inputIndex == 1)
{
FrameRange frameRange(InputRef(0).GetMBLayout());
BackpropToRight(*m_softmaxOfRight, InputRef(inputIndex).Gradient(), Gradient(), *m_CTCposterior);
InputRef(inputIndex).MaskMissingGradientColumnsToZero(frameRange);
}
else
RuntimeError("ForwardBackwardNode criterion expects only two inputs: labels and network output.");
}
void BackpropToLeft(const Matrix<ElemType>& logSoftmaxOfRight, Matrix<ElemType>& inputGradientValues,
const Matrix<ElemType>& gradientValues)
{
#if DUMPOUTPUT
logSoftmaxOfRight.Print("ForwardBackwardNode Partial-logSoftmaxOfRight");
gradientValues.Print("ForwardBackwardNode Partial-gradientValues");
inputGradientValues.Print("ForwardBackwardNode Partial-Left-in");
#endif
Matrix<ElemType>::ScaleAndAdd(-gradientValues.Get00Element(), logSoftmaxOfRight, inputGradientValues);
#if DUMPOUTPUT
inputGradientValues.Print("ForwardBackwardNode Partial-Left-out");
#endif
}
void BackpropToRight(const Matrix<ElemType>& softmaxOfRight, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues,
const Matrix<ElemType> &CTCposterior)
{
#if DUMPOUTPUT
softmaxOfRight.Print("ForwardBackwardNode Partial-softmaxOfRight");
inputFunctionValues.Print("ForwardBackwardNode Partial-inputFunctionValues");
gradientValues.Print("ForwardBackwardNode Partial-gradientValues");
inputGradientValues.Print("ForwardBackwardNode Partial-Right-in");
#endif
// inputGradientValues+= gradientValues*(softmaxOfRight - CTCposterior)
Matrix<ElemType>::AddScaledDifference(gradientValues, softmaxOfRight, CTCposterior, inputGradientValues);
#if DUMPOUTPUT
inputGradientValues.Print("ForwardBackwardNode Partial-Right");
#endif
}
virtual bool OutputUsedInComputingInputNodesGradients() const override
{
return false;
}
virtual void ForwardPropNonLooping() override
{
m_logSoftmaxOfRight->AssignLogSoftmaxOf(InputRef(1).Value(), true);
m_softmaxOfRight->SetValue(*m_logSoftmaxOfRight);
m_softmaxOfRight->InplaceExp();
m_CTCposterior->SwitchToMatrixType(m_softmaxOfRight->GetMatrixType(), m_softmaxOfRight->GetFormat(), false);
m_CTCposterior->Resize(m_softmaxOfRight->GetNumRows(), m_softmaxOfRight->GetNumCols());
FrameRange fr(InputRef(0).GetMBLayout());
InputRef(0).ValueFor(fr).VectorMax(*m_maxIndexes, *m_maxValues, true);
// compute CTC score
m_GammaCal.doCTC(Value(), *m_logSoftmaxOfRight, *m_maxIndexes, *m_maxValues, *m_CTCposterior, InputRef(0).GetMBLayout(), m_blankTokenId, m_delayConstraint);
#if NANCHECK
functionValues.HasNan("ForwardBackwardNode");
#endif
#if DUMPOUTPUT
functionValues.Print("ForwardBackwardNode");
#endif
}
virtual void /*ComputationNodeBase::*/Validate(bool isFinalValidationPass) override
{
Base::Validate(isFinalValidationPass);
m_pMBLayout = nullptr; // no layout
if (isFinalValidationPass)
{
if (!(Input(0)->GetSampleMatrixNumRows() == Input(1)->GetSampleMatrixNumRows() && // match vector dimension
Input(0)->HasMBLayout() &&
Input(0)->GetMBLayout() == Input(1)->GetMBLayout()))
{
LogicError("The Matrix dimension in the ForwardBackwardNode operation does not match.");
}
auto leftNode = dynamic_pointer_cast<LabelsToGraphNode<ElemType>>(Input(0));
if (!leftNode)
LogicError("ForwardBackwardNode: Please pass LabelsToGraph(labels) as the first argument.");
}
SetDims(TensorShape(1), false);
}
virtual void CopyTo(const ComputationNodePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const
{
Base::CopyTo(nodeP, newName, flags);
if (flags & CopyNodeFlags::copyNodeValue)
{
auto node = dynamic_pointer_cast<ForwardBackwardNode<ElemType>>(nodeP);
node->m_logSoftmaxOfRight->SetValue(*m_logSoftmaxOfRight);
node->m_softmaxOfRight->SetValue(*m_softmaxOfRight);
node->m_CTCposterior->SetValue(*m_CTCposterior);
node->m_maxIndexes->SetValue(*m_maxIndexes);
node->m_maxValues->SetValue(*m_maxValues);
node->m_delayConstraint = m_delayConstraint;
}
}
// request matrices needed to do node function value evaluation
virtual void RequestMatricesBeforeForwardProp(MatrixPool& matrixPool)
{
Base::RequestMatricesBeforeForwardProp(matrixPool);
RequestMatrixFromPool(m_logSoftmaxOfRight, matrixPool);
RequestMatrixFromPool(m_softmaxOfRight, matrixPool);
RequestMatrixFromPool(m_CTCposterior, matrixPool);
RequestMatrixFromPool(m_maxIndexes, matrixPool);
RequestMatrixFromPool(m_maxValues, matrixPool);
}
virtual void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool)
{
Base::ReleaseMatricesAfterBackprop(matrixPool);
ReleaseMatrixToPool(m_logSoftmaxOfRight, matrixPool);
ReleaseMatrixToPool(m_softmaxOfRight, matrixPool);
ReleaseMatrixToPool(m_CTCposterior, matrixPool);
ReleaseMatrixToPool(m_maxIndexes, matrixPool);
ReleaseMatrixToPool(m_maxValues, matrixPool);
}
virtual void UpdateFunctionMBSize() override
{
Base::UpdateFunctionMBSize();
size_t cols = Input(0)->Value().GetNumCols();
m_maxIndexes->Resize(1, cols);
m_maxValues->Resize(1, cols);
}
protected:
virtual bool NodeDoesItsOwnCustomizedMissingColumnsMasking() { return true; }
shared_ptr<Matrix<ElemType>> m_logSoftmaxOfRight;
shared_ptr<Matrix<ElemType>> m_softmaxOfRight;
shared_ptr<Matrix<ElemType>> m_CTCposterior;
shared_ptr<Matrix<ElemType>> m_maxIndexes;
shared_ptr<Matrix<ElemType>> m_maxValues;
msra::lattices::GammaCalculation<ElemType> m_GammaCal;
int m_blankTokenId;
int m_delayConstraint;
};
template class ForwardBackwardNode<float>;
template class ForwardBackwardNode<double>;
} } }

View file

@ -219,6 +219,14 @@ public:
RequestMatrixFromPool(m_softmaxOfRight, matrixPool);
}
// release gradient and temp matrices that are no longer needed after all the children's gradients are computed.
virtual void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool)
{
Base::ReleaseMatricesAfterBackprop(matrixPool);
ReleaseMatrixToPool(m_logSoftmaxOfRight, matrixPool);
ReleaseMatrixToPool(m_softmaxOfRight, matrixPool);
}
protected:
shared_ptr<Matrix<ElemType>> m_logSoftmaxOfRight;
shared_ptr<Matrix<ElemType>> m_softmaxOfRight;

View file

@ -41,7 +41,6 @@ void CNTKEvalBase<ElemType>::Init(const std::string& config)
CPUMatrix<ElemType>::SetNumThreads(nThreads);
Globals::SetShareNodeValueMatrices(m_config(L"shareNodeValueMatrices", true));
Globals::SetHyperCompressMemory(m_config(L"hyperCompressMemory", false));
}

View file

@ -60,8 +60,8 @@
</ClCompile>
<Link>
<AdditionalLibraryDirectories>$(OutDir)</AdditionalLibraryDirectories>
<AdditionalDependencies>EvalDLL.lib;Common.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<DelayLoadDLLs>EvalDll.dll</DelayLoadDLLs>
<AdditionalDependencies>EvalDLL.lib;Math.lib;Common.lib;$(MSMPI_LIB64)msmpi.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<DelayLoadDLLs>EvalDll.dll;Math.dll</DelayLoadDLLs>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(DebugBuild)">

View file

@ -5873,6 +5873,166 @@ void CPUMatrix<ElemType>::RCRFBackwardCompute(const CPUMatrix<ElemType>& alpha,
}
};
template<class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignCTCScore(
const CPUMatrix<ElemType>& prob, CPUMatrix<ElemType>& alpha, CPUMatrix<ElemType>& beta,
const CPUMatrix<ElemType>& phoneSeq, const CPUMatrix<ElemType>& phoneBoundary, ElemType &totalScore, const std::vector<size_t>& uttMap, const std::vector<size_t> & uttBeginFrame, const std::vector<size_t> & uttFrameNum,
const std::vector<size_t> & uttPhoneNum, const size_t samplesInRecurrentStep, const size_t maxFrameNum, const int delayConstraint, const bool isColWise)
{
// Column wise representation of sequences in input matrices (each column is one sequence/utterance)
if (isColWise)
{
vector<size_t> curPhoneSeq;
auto &us = *this;
size_t s, s2;
size_t senoneid, t;
ElemType ascore;
double x, y;
size_t senonenum, frameNum;
for (size_t uttId = 0; uttId < uttFrameNum.size(); uttId++) {
senonenum = uttPhoneNum[uttId];
frameNum = uttFrameNum[uttId];
// Populate the utterance's phone sequence
// Using a loop instead of memcpy for clarity
curPhoneSeq.clear(); // reset between utterances, otherwise phone IDs accumulate across iterations
curPhoneSeq.reserve(senonenum);
for (size_t i = 0; i < senonenum; i++)
curPhoneSeq.push_back((size_t)phoneSeq(i, uttId));
if (frameNum > 1)
{
//initialize alpha
for (s = 1; s < 3; s++)
{
senoneid = curPhoneSeq[s];
alpha(s, 0) = prob(senoneid, 0);
}
alpha(senonenum - 1, 0) = LZERO;
//initialize beta
for (s = senonenum - 3; s < senonenum - 1; s++)
{
senoneid = curPhoneSeq[s];
beta(s, frameNum - 1) = prob(senoneid, frameNum - 1);
}
beta(senonenum - 1, frameNum - 1) = LZERO;
//cal alpha
for (t = 1; t < frameNum; t++)
{
for (s = 1; s < senonenum - 1; s++)
{
senoneid = curPhoneSeq[s];
x = LZERO;
for (s2 = s - 1; s2 <= s; s2++)
{
if (s2 > 0)
{
y = alpha(s2, t - 1);
x = LogAddD(x, y);
}
}
if (senoneid != prob.GetNumRows() - 1 && s - 2 > 0 && senoneid != curPhoneSeq[s - 2])
{
y = alpha(s - 2, t - 1);
x = LogAddD(x, y);
}
if (senoneid != SIZE_MAX)
ascore = prob(senoneid, t);
else
ascore = 0;
alpha(s, t) = (float)x + ascore;
}
}
//exit senone
x = LZERO;
for (s2 = senonenum - 3; s2 < senonenum - 1; s2++)
{
y = alpha(s2, frameNum - 1);
x = LogAddD(x, y);
}
alpha(senonenum - 1, frameNum - 1) = (float)x;
totalScore = -alpha(senonenum - 1, frameNum - 1);
//cal beta
for (t = frameNum - 2; t >= 0; t--)
{
for (s = 1; s < senonenum - 1; s++)
{
senoneid = curPhoneSeq[s];
x = LZERO;
for (s2 = s; s2 <= s + 1; s2++)
{
if (s2 < senonenum - 1)
{
y = beta(s2, t + 1);
x = LogAddD(x, y);
}
}
if (senoneid != prob.GetNumRows() - 1 && s + 2 < senonenum - 1 && senoneid != curPhoneSeq[s + 2])
{
y = beta(s + 2, t + 1);
x = LogAddD(x, y);
}
if (senoneid != SIZE_MAX)
ascore = prob(senoneid, t);
else
ascore = 0;
beta(s, t) = (float)x + ascore;
}
if (t == 0)
break;
}
//entry senone
x = LZERO;
for (s2 = 1; s2 < 3; s2++)
{
y = beta(s2, 0);
x = LogAddD(x, y);
}
beta(0, 0) = (float)x;
for (t = 0; t < frameNum; t++)
{
//cal zt
double Zt = LZERO;
for (s = 1; s < senonenum - 1; s++)
{
senoneid = curPhoneSeq[s];
Zt = LogAddD(Zt, (alpha(s, t) + beta(s, t) - prob(senoneid, t)));
}
for (s = 1; s < senonenum - 1; s++)
{
senoneid = curPhoneSeq[s];
if (senoneid != SIZE_MAX)
{
ElemType logoccu = alpha(s, t) + beta(s, t) - prob(senoneid, t) - (float)Zt;
if (logoccu < LOG_OF_EPS_IN_LOG)
us(senoneid, t) += 0.0f;
else
us(senoneid, t) += exp(logoccu);
}
}
}
}
}
return *this;
}
else {
LogicError("Only ColWise minibatch layout is supported.");
}
return *this;
}
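
Both the recursion above and the CUDA kernels later in this commit accumulate probabilities with a numerically stable log-add (LogAddD / logaddk). A self-contained sketch of the identity they compute (the helper name here is illustrative, not CNTK's):

#include <algorithm>
#include <cmath>

// log(exp(x) + exp(y)) computed without overflow/underflow by factoring out the larger term:
// log(exp(x) + exp(y)) = max(x, y) + log(1 + exp(-|x - y|))
double LogAddSketch(double x, double y)
{
    if (x < y)
        std::swap(x, y); // ensure x >= y, so the exponent below is <= 0
    return x + std::log1p(std::exp(y - x));
}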
/// the kernel function for RCRF backward computation
template <class ElemType>
void CPUMatrix<ElemType>::_rcrfBackwardCompute(size_t t, size_t k, const CPUMatrix<ElemType>& alpha,

View file

@ -231,6 +231,7 @@ public:
// sequence training
CPUMatrix<ElemType>& DropFrame(const CPUMatrix<ElemType>& label, const CPUMatrix<ElemType>& gamma, const ElemType& threshhold);
CPUMatrix<ElemType>& AssignSequenceError(const ElemType hsmoothingWeight, const CPUMatrix<ElemType>& label, const CPUMatrix<ElemType>& dnnoutput, const CPUMatrix<ElemType>& gamma, ElemType alpha);
CPUMatrix<ElemType>& AssignCTCScore(const CPUMatrix<ElemType>& prob, CPUMatrix<ElemType>& alpha, CPUMatrix<ElemType>& beta, const CPUMatrix<ElemType>& phoneSeq, const CPUMatrix<ElemType>& phoneBoundary, ElemType &totalScore, const vector<size_t>& uttMap, const vector<size_t> & uttBeginFrame, const vector<size_t> & uttFrameNum, const vector<size_t> & uttPhoneNum, const size_t samplesInRecurrentStep, const size_t maxFrameNum, const int delayConstraint, const bool isColWise);
CPUMatrix<ElemType>& InplaceSqrt();
CPUMatrix<ElemType>& AssignSqrtOf(const CPUMatrix<ElemType>& a);

View file

@ -41,8 +41,6 @@ typedef unsigned char byte;
#define GPUSPARSE_INDEX_TYPE int // cuSparse only supports int array indexes
#define CPUSPARSE_INDEX_TYPE int // to be consistent with cuSparse but limited the possible size of the matrix.
#define MEM_MAX_LIMIT_TIMES 2 // The maximum multiple of a requested size that a cached memory block may have and still be reused for that request
namespace Microsoft { namespace MSR { namespace CNTK {
MATH_API void SetMathLibTraceLevel(int traceLevel);
@ -214,158 +212,6 @@ enum MatrixFlags
matrixFlagSetValueOnDevice = 1 << bitPosSetValueOnDevice, // SetValue() call has a buffer that is already on the device
};
// -----------------------------------------------------------------------
// BufferManagement -- to control the allocation and release of memory
//
// 1. The goal of buffer management
// The best way to save memory is to release it as soon as it is no longer used in the rest of the mini-batch, but
// that adds extra memory-operation cost and slows things down. One option is to build static links between all nodes
// in a pre-computation pass and reuse memory at runtime, known in CNTK as shared node value matrices. The other
// option is a buffer pool that takes over allocation and release requests: unlike physical memory operations, the
// logical ones cost nearly nothing. Since the second option, implemented as BufferManagement below, controls all
// memory operations, including trivial ones such as the workspace in convolutions, and is more flexible (it
// allocates by size and makes new algorithms easy to implement), it is usually more powerful than the first method.
// 2. How it works
// BufferManagement is called from the Resize function, where RequestBuffer and LogicalReleaseBuffer replace the
// original request and release calls. BufferManagement is a singleton per deviceId; obtain it via
// GetManagerInstance. Resize also has a flag named growOnly, which only reallocates when the size increases, to
// save allocation cost. Since allocation from the buffer pool is nearly free, growth-only is disabled in
// BufferManagement mode.
// -----------------------------------------------------------------------
class BufferManagement
{
private:
BufferManagement() = default;
// Disable all copy & move functions to keep the singleton instances safe
DISABLE_COPY_AND_MOVE(BufferManagement);
public:
static BufferManagement& GetManagerInstance(DEVICEID_TYPE deviceId)
{
    static std::mutex instanceLock;
    auto instance = m_instances.find(deviceId);
    if (instance == m_instances.end())
    {
        std::lock_guard<std::mutex> lock(instanceLock);
        instance = m_instances.find(deviceId); // re-check under the lock; another thread may have inserted it
        if (instance == m_instances.end())
        {
            instance = m_instances.insert(std::make_pair(deviceId, std::unique_ptr<BufferManagement>(
                new BufferManagement()))).first;
            instance->second->m_deviceId = deviceId;
            instance->second->m_totalManageSize = 0;
            instance->second->m_totalAllocSize = 0;
        }
    }
    return *(instance->second);
}
// for a request, look in the buffer container first; if that fails, allocate a new buffer
// when served from the buffer, size is updated to the real buffer size
template<class ElemType>
ElemType* RequestBuffer(size_t& size)
{
ElemType* bufferPtr = nullptr;
auto& bufferContainer = BufferContainer<ElemType>();
// simple size-based lookup; a more efficient and complex algorithm could be implemented here
auto bufferHint = bufferContainer.lower_bound(size);
if (bufferHint != bufferContainer.end() && bufferHint->first < size * MEM_MAX_LIMIT_TIMES)
{
bufferPtr = bufferHint->second;
size = bufferHint->first;
m_totalManageSize -= size;
bufferContainer.erase(bufferHint);
return bufferPtr;
}
if (m_deviceId >= 0) {
#ifndef CPUONLY
auto deviceSize = TracingGPUMemoryAllocator::GetFreeAndTotalMemoryInMBs(m_deviceId);
float freeMemoryRatio = (float)deviceSize.first / deviceSize.second;
if (freeMemoryRatio < 0.05f || (deviceSize.first << 20) / sizeof(ElemType) < size)
{
PhysicalReleaseAllBuffer<ElemType>();
}
bufferPtr = TracingGPUMemoryAllocator::Allocate<ElemType>(m_deviceId, size);
m_totalAllocSize += size;
#endif
}
else
{
// first, try no-throw allocation.
// if failed, empty the buffer and re-try a throwing allocation
// if failed again, let system throw the bad_alloc exception
bufferPtr = new (std::nothrow) ElemType[size];
if (!bufferPtr)
{
PhysicalReleaseAllBuffer<ElemType>();
bufferPtr = new ElemType[size];
}
m_totalAllocSize += size;
}
return bufferPtr;
}
// insert the released buffer into the buffer container
template<class ElemType>
void LogicalReleaseBuffer(ElemType* buffer, size_t size)
{
auto& bufferContainer = BufferContainer<ElemType>();
bufferContainer.insert(std::make_pair(size, buffer));
m_totalManageSize += size;
}
// physically release the buffer
template<class ElemType>
void PhysicalReleaseBuffer(ElemType* buffer)
{
if (m_deviceId >= 0)
{
#ifndef CPUONLY
TracingGPUMemoryAllocator::Free<ElemType>(m_deviceId, buffer, false);
#endif
}
else {
delete[] buffer;
}
}
// empty all the cached buffers
template<class ElemType>
void PhysicalReleaseAllBuffer()
{
auto& bufferContainer = BufferContainer<ElemType>();
for (auto& iter : bufferContainer)
{
PhysicalReleaseBuffer<ElemType>(iter.second);
}
bufferContainer.clear();
m_totalManageSize = 0;
}
private:
static std::unordered_map<DEVICEID_TYPE, std::unique_ptr<BufferManagement>> m_instances;
template <class ElemType>
std::multimap<size_t, ElemType*>& BufferContainer();
DEVICEID_TYPE m_deviceId;
size_t m_totalManageSize;
size_t m_totalAllocSize;
// maps to store all the temp buffer handles
std::multimap<size_t, float*> m_bufferFloatContainer;
std::multimap<size_t, double*> m_bufferDoubleContainer;
std::multimap<size_t, char*> m_bufferCharContainer;
std::multimap<size_t, short*> m_bufferShortContainer;
std::multimap<size_t, int*> m_bufferIntContainer;
};
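
A minimal usage sketch of this (now removed) pool; hypothetical driver code, with deviceId -1 selecting the CPU path:

void BufferPoolExample()
{
    auto& pool = BufferManagement::GetManagerInstance(/*deviceId=*/-1);

    size_t size = 1024; // element count; RequestBuffer may round this up to a cached block's size
    float* p = pool.RequestBuffer<float>(size);
    pool.LogicalReleaseBuffer(p, size); // returns the block to the cache, no physical free

    size_t size2 = 1000; // within MEM_MAX_LIMIT_TIMES of the cached block, so the cached block is reused
    float* q = pool.RequestBuffer<float>(size2); // size2 is updated to the real buffer size (1024)
    pool.LogicalReleaseBuffer(q, size2);

    pool.PhysicalReleaseAllBuffer<float>(); // actually free everything still cached
}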
// -----------------------------------------------------------------------
// BaseMatrixStorage -- base class for all matrix types (CPU, GPU) x (dense, sparse)
// -----------------------------------------------------------------------

View file

@ -260,8 +260,6 @@ protected:
}
// Only supported when MatrixPool is enabled
// NOTE: it's unnecessary to keep the workspace.
workspace.Resize(0, 0);
CUDNN_CALL(err);
}
@ -304,7 +302,6 @@ protected:
if (CUDNN_STATUS_SUCCESS == err2)
err = CUDNN_STATUS_SUCCESS;
}
workspace.Resize(0, 0);
CUDNN_CALL(err);
}
@ -347,7 +344,6 @@ protected:
if (CUDNN_STATUS_SUCCESS == err2)
err = CUDNN_STATUS_SUCCESS;
}
workspace.Resize(0, 0);
CUDNN_CALL(err);
}

View file

@ -1531,42 +1531,35 @@ void GPUMatrix<ElemType>::Reshape(const size_t numRows, const size_t numCols)
}
template <class ElemType>
void GPUMatrix<ElemType>::RequireSize(const size_t numRows, const size_t numCols, bool growOnly, bool cachedResize)
void GPUMatrix<ElemType>::RequireSize(const size_t numRows, const size_t numCols, bool growOnly)
{
if (GetNumRows() != numRows || GetNumCols() != numCols)
Resize(numRows, numCols, growOnly, cachedResize);
Resize(numRows, numCols, growOnly);
}
template <class ElemType>
void GPUMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, bool growOnly, bool cachedResize)
void GPUMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, bool growOnly)
{
if (GetNumRows() == numRows && GetNumCols() == numCols)
return;
VerifyResizable(__FUNCTION__);
bool isForceResize = (!growOnly) || cachedResize;
size_t numElements = numRows * numCols;
if (numElements > GetSizeAllocated() || // grow allocation
(isForceResize && numElements != GetSizeAllocated())) // shrink allocation if not growOnly
(!growOnly && numElements != GetSizeAllocated())) // shrink allocation if not growOnly
{
// If the buffer exists, free it before allocate
if (Buffer())
{
if (cachedResize)
BufferManagement::GetManagerInstance(GetComputeDeviceId()).LogicalReleaseBuffer<ElemType>(Buffer(), GetSizeAllocated());
else
TracingGPUMemoryAllocator::Free<ElemType>(GetComputeDeviceId(), Buffer());
TracingGPUMemoryAllocator::Free<ElemType>(GetComputeDeviceId(), Buffer());
}
// reallocate buffer if numElements > 0
ElemType* pArray = nullptr;
if (numElements > 0)
{
if (cachedResize)
pArray = BufferManagement::GetManagerInstance(GetComputeDeviceId()).RequestBuffer<ElemType>(numElements);
else
pArray = TracingGPUMemoryAllocator::Allocate<ElemType>(GetComputeDeviceId(), numRows, numCols);
pArray = TracingGPUMemoryAllocator::Allocate<ElemType>(GetComputeDeviceId(), numRows, numCols);
}
SetBuffer(pArray, numElements * sizeof(ElemType));
@ -2374,7 +2367,9 @@ ElemType GPUMatrix<ElemType>::AbsoluteMax() const
int resInd = 0;
cublasIdamax(cuHandle, (CUDA_LONG)GetNumElements(), reinterpret_cast<double*>(Data()), 1, &resInd);
resInd--;
CUDA_CALL(cudaMemcpy(reinterpret_cast<double*>(&res), Data() + resInd, sizeof(double), cudaMemcpyDeviceToHost));
return res;
}
}
@ -2951,7 +2946,30 @@ void GPUMatrix<ElemType>::Print(const char* /*matrixName*/, size_t /*rowStart*/,
template <class ElemType>
void GPUMatrix<ElemType>::Print(const char* matrixName /*=nullptr*/) const
{
Print(matrixName, 0, GetNumRows() - 1, 0, GetNumCols() - 1);
size_t elemCount = GetNumRows() * GetNumCols();
vector<ElemType> localCopy(elemCount);
cudaMemcpy(localCopy.data(), Data(), elemCount * sizeof(ElemType), cudaMemcpyDeviceToHost);
fprintf(stderr, "\n###### ");
if (matrixName != nullptr)
fprintf(stderr, "%s ", matrixName);
fprintf(stderr, "(%lu, %lu) ######\n\n", (unsigned long)GetNumRows(), (unsigned long)GetNumCols());
if (IsEmpty())
{
fprintf(stderr, "(empty)\n");
return;
}
// CNTK is using column-major storage
for (size_t i = 0; i < GetNumRows(); i++)
{
for (size_t j = 0; j < GetNumCols(); j++)
{
fprintf(stderr, "%.10f\t", localCopy[i + j * GetNumRows()]);
}
fprintf(stderr, "\n");
}
}
//helpfer function used for convolution neural network
@ -4253,6 +4271,117 @@ GPUMatrix<ElemType>& GPUMatrix<ElemType>::GetARowByIndex(const GPUMatrix<ElemTyp
return *this;
}
// Calculate CTC score
// prob (input): the posterior output from the network
// alpha, beta (output): alpha and beta for forward-backward calculation.
// phoneSeq (input): phone ID sequence for each utterance in this minibatch, each col is one utterance
// phoneBoundary (input): phone boundary (frame index) of each phone for each utterance in this minibatch, each col is one utterance
// totalScore (output): total CTC score
// uttToChanInd (input): map from utterance ID to minibatch channel ID. We need this because each channel may contain more than one utterance.
// uttBeginFrame (input): the position of the first frame of each utterance in the minibatch channel. We need this because each channel may contain more than one utterance.
// uttFrameNum (input): the frame number of each utterance. The size of this vector = the number of all utterances in this minibatch
// uttPhoneNum (input): the phone number of each utterance. The size of this vector = the number of all utterances in this minibatch
// numParallelSequences (input): channel number in this minibatch
// maxFrameNum (input): the maximum channel frame number
// delayConstraint -- label output delay constraint introduced during training that allows shorter delay during inference.
//                    Alpha and Beta scores outside of the delay boundary are set to zero.
//                    Setting this parameter smaller will result in shorter delay between label outputs during decoding, but may hurt accuracy.
// delayConstraint=-1 means no constraint
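// For illustration (hypothetical numbers): with numParallelSequences = 2 channels packed as channel 0 = [A | C]
// and channel 1 = [B], where A, B, C have 50, 30 and 40 frames respectively, the metadata would be
// uttToChanInd = {0, 1, 0}, uttBeginFrame = {0, 0, 50}, uttFrameNum = {50, 30, 40}, and maxFrameNum = 90
// (the frame count of the longest channel).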
template<class ElemType>
GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignCTCScore(const GPUMatrix<ElemType>& prob,
GPUMatrix<ElemType>& alpha,
GPUMatrix<ElemType>& beta,
const GPUMatrix<ElemType> phoneSeq,
const GPUMatrix<ElemType> phoneBoundary,
ElemType &totalScore,
const std::vector<size_t>& uttToChanInd,
const std::vector<size_t> & uttBeginFrame,
const std::vector<size_t> & uttFrameNum,
const std::vector<size_t> & uttPhoneNum,
const size_t numParallelSequences,
const size_t maxFrameNum, const int delayConstraint, const bool isColWise)
{
if (isColWise)
{
PrepareDevice();
// Total number of phones
long totalPhoneNum = prob.GetNumRows();
size_t uttNum = uttFrameNum.size();
// Max number of phones in utterances in this minibatch
size_t maxPhoneNum = phoneSeq.GetNumRows();
size_t *gpuFrameNum;
CUDA_CALL(cudaMalloc((void **)&gpuFrameNum, uttNum * sizeof(size_t)));
CUDA_CALL(cudaMemcpy(gpuFrameNum, uttFrameNum.data(), uttNum * sizeof(size_t), cudaMemcpyHostToDevice));
size_t *gpuPhoneNum;
CUDA_CALL(cudaMalloc((void **)&gpuPhoneNum, uttNum * sizeof(size_t)));
CUDA_CALL(cudaMemcpy(gpuPhoneNum, uttPhoneNum.data(), uttNum * sizeof(size_t), cudaMemcpyHostToDevice));
size_t *gpuBeginFrame;
CUDA_CALL(cudaMalloc((void **)&gpuBeginFrame, uttNum * sizeof(size_t)));
CUDA_CALL(cudaMemcpy(gpuBeginFrame, uttBeginFrame.data(), uttNum * sizeof(size_t), cudaMemcpyHostToDevice));
size_t *gpuUttToChanInd;
CUDA_CALL(cudaMalloc((void **)&gpuUttToChanInd, uttNum * sizeof(size_t)));
CUDA_CALL(cudaMemcpy(gpuUttToChanInd, uttToChanInd.data(), uttNum * sizeof(size_t), cudaMemcpyHostToDevice));
ElemType *gpuScores;
CUDA_CALL(cudaMalloc((void **)&gpuScores, uttNum * sizeof(ElemType)));
cudaEvent_t done = nullptr;
CUDA_CALL(cudaEventCreate(&done));
dim3 thread_tail(DEFAULT_THREAD_PER_DIM, DEFAULT_THREAD_PER_DIM);
// x dimension is for utterances
// y dimension is for the phone sequence in each utterance
// Ensure that we allocate correct number of blocks for given number of utterances and max number of phones in those utterances
dim3 block_tail((uttNum + DEFAULT_THREAD_PER_DIM - 1) / DEFAULT_THREAD_PER_DIM, (maxPhoneNum + DEFAULT_THREAD_PER_DIM - 1) / DEFAULT_THREAD_PER_DIM);
for (long t = 0; t < maxFrameNum; t++)
{
    _assignAlphaScore<<<block_tail, thread_tail, 0, t_stream>>>(prob.Data(), alpha.Data(), phoneSeq.Data(), phoneBoundary.Data(), gpuUttToChanInd,
        gpuFrameNum, gpuBeginFrame, gpuPhoneNum, numParallelSequences, uttNum, t, maxPhoneNum, totalPhoneNum, delayConstraint);
}
for (long t = maxFrameNum - 1; t >= 0; t--)
{
    _assignBetaScore<<<block_tail, thread_tail, 0, t_stream>>>(prob.Data(), beta.Data(), phoneSeq.Data(), phoneBoundary.Data(), gpuUttToChanInd,
        gpuFrameNum, gpuBeginFrame, gpuPhoneNum, numParallelSequences, uttNum, t, maxPhoneNum, totalPhoneNum, delayConstraint);
}
_assignTotalScore<<<uttNum, 1, 0, t_stream>>>(beta.Data(), gpuScores, uttNum, gpuUttToChanInd, gpuBeginFrame, numParallelSequences, maxPhoneNum);
dim3 block_tail_2((uttNum + DEFAULT_THREAD_PER_DIM - 1) / DEFAULT_THREAD_PER_DIM, (maxFrameNum + DEFAULT_THREAD_PER_DIM - 1) / DEFAULT_THREAD_PER_DIM);
_assignCTCScore<<<block_tail_2, thread_tail, 0, t_stream>>>(Data(), prob.Data(), alpha.Data(), beta.Data(), phoneSeq.Data(), uttNum, gpuUttToChanInd,
    gpuBeginFrame, gpuPhoneNum, gpuFrameNum, numParallelSequences, maxPhoneNum, totalPhoneNum);
vector<ElemType> scores(uttNum);
CUDA_CALL(cudaMemcpyAsync(scores.data(), gpuScores, sizeof(ElemType) * uttNum, cudaMemcpyDeviceToHost, t_stream));
for (size_t utt = 0; utt < uttFrameNum.size(); utt++)
{
totalScore += scores[utt];
}
CUDA_CALL(cudaFree(gpuFrameNum));
CUDA_CALL(cudaFree(gpuPhoneNum));
CUDA_CALL(cudaFree(gpuBeginFrame));
CUDA_CALL(cudaFree(gpuUttToChanInd));
CUDA_CALL(cudaFree(gpuScores));
CUDA_CALL(cudaEventRecord(done));
CUDA_CALL(cudaEventSynchronize(done));
CUDA_CALL(cudaEventDestroy(done));
}
else
{
NOT_IMPLEMENTED;
}
return *this;
}
template <class ElemType>
void GPUMatrix<ElemType>::ConductRowElementMultiplyWithShift(const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, GPUMatrix<ElemType>& c, const size_t shift, const bool isafixed)
{
@ -4613,8 +4742,8 @@ template GPUMatrix<char>::GPUMatrix(const GPUMatrix<char>&);
template GPUMatrix<char>::GPUMatrix(GPUMatrix<char>&&);
template char* GPUMatrix<char>::CopyToArray() const;
template void GPUMatrix<char>::ChangeDeviceTo(int);
template void GPUMatrix<char>::Resize(size_t, size_t, bool, bool);
template void GPUMatrix<char>::RequireSize(size_t, size_t, bool, bool);
template void GPUMatrix<char>::Resize(size_t, size_t, bool);
template void GPUMatrix<char>::RequireSize(size_t, size_t, bool);
template GPUMatrix<char>::~GPUMatrix();
template GPUMatrix<char> GPUMatrix<char>::ColumnSlice(size_t startColumn, size_t numCols) const;
@ -4638,8 +4767,8 @@ template GPUMatrix<short>::GPUMatrix(const GPUMatrix<short>&);
template GPUMatrix<short>::GPUMatrix(GPUMatrix<short>&&);
template short* GPUMatrix<short>::CopyToArray() const;
template void GPUMatrix<short>::ChangeDeviceTo(int);
template void GPUMatrix<short>::Resize(size_t, size_t, bool, bool);
template void GPUMatrix<short>::RequireSize(size_t, size_t, bool, bool);
template void GPUMatrix<short>::Resize(size_t, size_t, bool);
template void GPUMatrix<short>::RequireSize(size_t, size_t, bool);
template GPUMatrix<short>::~GPUMatrix();
template GPUMatrix<short> GPUMatrix<short>::ColumnSlice(size_t startColumn, size_t numCols) const;

View file

@ -244,12 +244,12 @@ public:
// RequireSize is now the new preferred method of ensuring the correct size inside of the Matrix class. Since Resize will fail if the storage object has
// multiple views, RequireSize will first check to see if Resize is required. If it is not, then it short-circuits and is a noop. Otherwise, RequireSize
// will call Resize, which may fail if the matrix has multiple views.
void RequireSize(const size_t numRows, const size_t numCols, bool growOnly = true, bool cachedResize = false); // by default we only reallocate if need to grow
void RequireSize(const GPUMatrix<ElemType>& like, bool growOnly = true, bool cachedResize = false) { RequireSize(like.GetNumRows(), like.GetNumCols(), growOnly, cachedResize); }
void RequireSize(const size_t numRows, const size_t numCols, bool growOnly = true); // by default we only reallocate if need to grow
void RequireSize(const GPUMatrix<ElemType>& like, bool growOnly = true) { RequireSize(like.GetNumRows(), like.GetNumCols(), growOnly); }
// Resize first checks to ensure that the caller has the authority to call Resize (i.e., it checks to ensure the underlying data is owned by only this matrix), and then
// actually resizes the underlying matrix, doing any allocation as required.
void Resize(const size_t numRows, const size_t numCols, bool growOnly = true, bool cachedResize = false); // by default we only reallocate if need to grow
void Resize(const size_t numRows, const size_t numCols, bool growOnly = true); // by default we only reallocate if need to grow
ElemType& operator()(const size_t /*row*/, const size_t /*col*/) { LogicError("GPUMatrix doesn't support operator(,) on the CPU."); }
const ElemType& operator()(const size_t /*row*/, const size_t /*col*/) const { LogicError("GPUMatrix doesn't support operator(,) on the CPU."); }
@ -349,6 +349,10 @@ public:
GPUMatrix<ElemType>& DropFrame(const GPUMatrix<ElemType>& label, const GPUMatrix<ElemType>& gamma, const ElemType& threshhold);
GPUMatrix<ElemType>& AssignSequenceError(const ElemType hsmoothingWeight, const GPUMatrix<ElemType>& label, const GPUMatrix<ElemType>& dnnoutput, const GPUMatrix<ElemType>& gamma, ElemType alpha);
GPUMatrix<ElemType>& AssignCTCScore(const GPUMatrix<ElemType>& prob, GPUMatrix<ElemType>& alpha, GPUMatrix<ElemType>& beta,
const GPUMatrix<ElemType> phoneSeq, const GPUMatrix<ElemType> phoneBoundary, ElemType &totalScore, const vector<size_t>& uttMap, const vector<size_t> & uttBeginFrame, const vector<size_t> & uttFrameNum,
const vector<size_t> & uttPhoneNum, const size_t samplesInRecurrentStep, const size_t maxFrameNum, const int delayConstraint, const bool isColWise);
GPUMatrix<ElemType>& InplaceSqrt();
GPUMatrix<ElemType>& AssignSqrtOf(const GPUMatrix<ElemType>& a);

View file

@ -5192,6 +5192,292 @@ __global__ void _adam4BlockSparseCol(CUDA_LONG size,
val[idx] -= g;
}
}
// Calculate alpha in forward-backward calculation. equation (6), (7) in http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf
// GPU x dimension corresponds to utterances, y dimension corresponds to phone sequence in each utterance
// prob (input): the posterior output from the network
// alpha (output): alpha for forward-backward calculation.
// phoneSeq (input): phone ID sequence for each utterance in this minibatch, each col is one utterance
// phoneBound (input): phone boundary (frame index) of each phone for each utterance in this minibatch, each col is one utterance
// uttToChanInd (input): map from utterance ID to minibatch channel ID. We need this because each channel may contain more than one utterance.
// uttFrameNum (input): the frame number of each utterance. The size of this vector = the number of all utterances in this minibatch
// uttBeginFrame (input): the position of the first frame of each utterance in the minibatch channel. We need this because each channel may contain more than one utterance.
// uttPhoneNum (input): the phone number of each utterance. The size of this vector = the number of all utterances in this minibatch
// numChannels (input): channel number in this minibatch
// uttNum (input): number of utterances
// t (input): time stamp to process
// maxPhoneNum (input): the max number of phones between utterances
// totalPhoneNum (input): the total number of phones of all utterances
// delayConstraint -- label output delay constraint introduced during training that allows shorter delay during inference.
//                    Alpha and Beta scores outside of the delay boundary are set to zero.
//                    Setting this parameter smaller will result in shorter delay between label outputs during decoding.
// delayConstraint=-1 means no constraint
template<class ElemType>
__global__ void _assignAlphaScore(
const ElemType *prob,
ElemType *alphaScore,
ElemType *phoneSeq,
ElemType *phoneBound,
const size_t *uttToChanInd,
const size_t *uttFrameNum,
const size_t *uttBeginFrame,
const size_t *uttPhoneNum,
size_t numChannels,
const size_t uttNum,
const size_t t,
const size_t maxPhoneNum, // Maximum length of utterance in this MB
const size_t totalPhoneNum, // Total number of phones
const int delayConstraint)
{
LONG64 uttId = blockDim.x * blockIdx.x + threadIdx.x;
// Index of the label in the sequence
LONG64 phoneSeqId = blockDim.y * blockIdx.y + threadIdx.y;
// Number of phones and frames in this utterance
LONG64 phoneNum = uttPhoneNum[uttId];
LONG64 frameNum = uttFrameNum[uttId];
if (uttId >= uttNum || phoneSeqId >= phoneNum - 1 || t >= frameNum || phoneSeqId == 0) return;
// Current and previous phone indices in phoneSeq matrix
LONG64 labelid = uttId*maxPhoneNum + phoneSeqId;
LONG64 labelid_2 = labelid - 2;
// Actual current phone label
LONG64 phoneId = (LONG64)(phoneSeq[labelid]);
// Index of the current frame in minibatch
LONG64 timeId = (t + uttBeginFrame[uttId])*numChannels + uttToChanInd[uttId];
// Index of probability of observing phoneId at frame timeId
LONG64 probId = timeId*totalPhoneNum + phoneId;
LONG64 alphaId = maxPhoneNum* timeId + phoneSeqId; // alpha_t(s)
// Previous time frame
LONG64 timeId_1 = timeId - numChannels; // Index corresponding to (t-1)
LONG64 alphaId_0 = maxPhoneNum* timeId_1 + phoneSeqId; // alpha_{t-1}(s)
LONG64 alphaId_1 = alphaId_0 - 1; // alpha_{t-1}(s-1)
LONG64 alphaId_2 = alphaId_0 - 2; // alpha_{t-1}(s-2)
if (t == 0)
{
// Initialize recursion
if (phoneSeqId == 1 || phoneSeqId == 2)
{
alphaScore[alphaId] = prob[probId];
}
}
else
{
if (phoneSeqId >= 1)
{
ElemType x = LZERO;
ElemType ascore;
if (phoneSeqId > 2)
{
// if current label is not blank and not equal prev non-blank label
if ((LONG64)(phoneSeq[labelid]) != totalPhoneNum - 1 && phoneId != (LONG64)(phoneSeq[labelid_2]))
{
x = logaddk(x, alphaScore[alphaId_2]);
}
}
if (phoneSeqId > 1)
{
x = logaddk(x, alphaScore[alphaId_1]);
}
x = logaddk(x, alphaScore[alphaId_0]);
if (phoneId != SIZE_MAX)
ascore = prob[probId]; // Probability of observing given label at given time
else
ascore = 0;
alphaScore[alphaId] = (ElemType)x + ascore;
if (delayConstraint != -1)
{
LONG64 labelid_r = labelid + 2;
LONG64 phoneBoundId_r = (LONG64)(phoneBound[labelid_r]);
if (phoneId == totalPhoneNum - 1)
{
// only constraint right side
if (t > phoneBoundId_r + delayConstraint - 1)
alphaScore[alphaId] = LZERO;
}
else if (phoneId != totalPhoneNum - 1)
{
if (t > phoneBoundId_r + delayConstraint)
alphaScore[alphaId] = LZERO;
}
}
}
}
}
// Calculate beta in forward-backward calculation, equation (10), (11) in http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf
// See _assignAlphaScore for the explanation of parameters
template<class ElemType>
__global__ void _assignBetaScore(
const ElemType *prob,
ElemType *betaScore,
ElemType *phoneSeq,
ElemType *phoneBound,
const size_t *uttToChanInd,
const size_t *uttFrameNum,
const size_t *uttBeginFrame,
const size_t *uttPhoneNum,
const size_t numChannels,
const size_t uttNum,
const size_t t,
const size_t maxPhoneNum,
const size_t totalPhoneNum,
const int delayConstraint)
{
LONG64 uttId = blockDim.x * blockIdx.x + threadIdx.x;
// Index of the label in the sequence
LONG64 phoneSeqId = blockDim.y * blockIdx.y + threadIdx.y;
LONG64 phoneNum = uttPhoneNum[uttId];
LONG64 frameNum = uttFrameNum[uttId];
if (uttId >= uttNum || phoneSeqId >= phoneNum - 1 || t >= frameNum || phoneSeqId == 0) return;
LONG64 labelid = uttId*maxPhoneNum + phoneSeqId;
LONG64 labelid_2 = labelid + 2;
LONG64 phoneId = (LONG64)(phoneSeq[labelid]);
LONG64 timeId = (t + uttBeginFrame[uttId])*numChannels + uttToChanInd[uttId];
LONG64 probId = timeId*totalPhoneNum + phoneId;
LONG64 betaid = maxPhoneNum* timeId + phoneSeqId;
LONG64 timeId_1 = timeId + numChannels;
LONG64 betaid_0 = maxPhoneNum* timeId_1 + phoneSeqId;
LONG64 betaid_1 = betaid_0 + 1;
LONG64 betaid_2 = betaid_0 + 2;
if (t == frameNum - 1)
{
if (phoneSeqId == phoneNum - 3 || phoneSeqId == phoneNum - 2)
{
betaScore[betaid] = prob[probId];
}
}
else
{
if (phoneSeqId >= 1)
{
ElemType x = LZERO;
ElemType ascore;
if (phoneSeqId < phoneNum - 3)
{
if (phoneSeq[labelid] != totalPhoneNum - 1 && phoneId != phoneSeq[labelid_2])
{
x = logaddk(x, betaScore[betaid_2]);
}
}
if (phoneSeqId < phoneNum - 2)
{
x = logaddk(x, betaScore[betaid_1]);
}
x = logaddk(x, betaScore[betaid_0]);
if (phoneId != SIZE_MAX)
ascore = prob[probId];
else
ascore = 0;
betaScore[betaid] = (ElemType)x + ascore;
if (delayConstraint != -1)
{
LONG64 phoneBoundId_r = (LONG64)(phoneBound[labelid_2]);
if (phoneId == totalPhoneNum - 1)
{
if (t > phoneBoundId_r + delayConstraint - 1)
betaScore[betaid] = LZERO;
}
else if (phoneId != totalPhoneNum - 1)
{
if (t > phoneBoundId_r + delayConstraint)
betaScore[betaid] = LZERO;
}
}
}
}
}
// Calculate derivative, equation (15) in http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf
// See _assignAlphaScore for the explanation of parameters
template<class ElemType>
__global__ void _assignCTCScore(
ElemType *CTCscore,
ElemType *prob,
ElemType *alphaScore,
ElemType *betaScore,
ElemType *phoneSeq,
const size_t uttNum,
const size_t *uttToChanInd,
const size_t *uttBeginFrame,
const size_t *uttPhoneNum,
const size_t *uttFrameNum,
const long numChannels,
const long maxPhoneNum,
const long totalPhoneNum)
{
LONG64 uttId = blockDim.x * blockIdx.x + threadIdx.x;
LONG64 t = blockDim.y * blockIdx.y + threadIdx.y;
if (uttId < uttNum && t < uttFrameNum[uttId])
{
LONG64 phoneNum = uttPhoneNum[uttId];
LONG64 alphaId_0 = (uttBeginFrame[uttId] * numChannels + uttToChanInd[uttId]) * maxPhoneNum;
LONG64 timeId = (t + uttBeginFrame[uttId])*numChannels + uttToChanInd[uttId];
ElemType P_lx = betaScore[alphaId_0];
for (int s = 1; s < phoneNum - 1; s++)
{
long phoneId = phoneSeq[uttId*maxPhoneNum + s];
LONG64 alphaId = maxPhoneNum* timeId + s;
LONG64 probId = timeId*totalPhoneNum + phoneId;
if (phoneId != SIZE_MAX)
{
ElemType logoccu = alphaScore[alphaId] + betaScore[alphaId] - prob[probId] - (ElemType)P_lx;
CTCscore[probId] = logaddk(CTCscore[probId], logoccu);
}
}
for (int s = 0; s < totalPhoneNum; s++)
{
LONG64 probId = timeId*totalPhoneNum + s;
ElemType logoccu = CTCscore[probId];
if (logoccu < LZERO)
CTCscore[probId] = 0.0f;
else
CTCscore[probId] = exp(logoccu);
}
}
}
// Calculate CTC score. equation (8) in http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf
template<class ElemType>
__global__ void _assignTotalScore(ElemType *betaScore,
ElemType *totalScore,
const size_t uttNum,
const size_t *uttToChanInd,
const size_t *uttBeginFrame,
const size_t numChannels,
const size_t maxPhoneNum)
{
LONG64 uttId = blockIdx.x;
if (uttId < uttNum)
{
LONG64 alphaId_0 = (uttBeginFrame[uttId] * numChannels + uttToChanInd[uttId]) * maxPhoneNum;
betaScore[alphaId_0] = logaddk(betaScore[alphaId_0 + 1], betaScore[alphaId_0 + 2]);
totalScore[uttId] = betaScore[alphaId_0];
}
}
}
}
}
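
To make the indexing in these kernels concrete: alpha and beta are laid out as (maxPhoneNum x T*numChannels) column-major buffers, so alphaId = maxPhoneNum * timeId + phoneSeqId addresses alpha_t(s) for minibatch column timeId = (t + uttBeginFrame[uttId]) * numChannels + uttToChanInd[uttId]. For example, with numChannels = 2, frame t of an utterance in channel 1 starting at frame 0 lives in column 2*t + 1.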

View file

@ -158,23 +158,6 @@ int GetMathLibTraceLevel()
MatrixBase::~MatrixBase() { }
#pragma region BufferManagement
std::unordered_map<DEVICEID_TYPE, std::unique_ptr<BufferManagement>> BufferManagement::m_instances;
template <>
std::multimap<size_t, float*>& BufferManagement::BufferContainer<float>() { return m_bufferFloatContainer; }
template <>
std::multimap<size_t, double*>& BufferManagement::BufferContainer<double>() { return m_bufferDoubleContainer; }
template <>
std::multimap<size_t, char*>& BufferManagement::BufferContainer<char>() { return m_bufferCharContainer; }
template <>
std::multimap<size_t, short*>& BufferManagement::BufferContainer<short>() { return m_bufferShortContainer; }
template <>
std::multimap<size_t, int*>& BufferManagement::BufferContainer<int>() { return m_bufferIntContainer; }
#pragma endregion
#pragma region Constructors, destructors and other static matrix builders
@ -184,10 +167,6 @@ std::multimap<size_t, int*>& BufferManagement::BufferContainer<int>() { return m
// { GPU code },
// ...
// By default, the CachedMatrixBuffer is disabled
template <class ElemType>
bool Matrix<ElemType>::m_useCachedResize = false;
// Initialize members
template <class ElemType>
void Matrix<ElemType>::Init(DEVICEID_TYPE deviceId)
@ -301,9 +280,6 @@ void Matrix<ElemType>::SetDataLocation(CurrentDataLocation location, MatrixType
LogicError("SetDataLocation: New m_baseMatrix must not be NULL.");
}
template <class ElemType>
void Matrix<ElemType>::UseCachedResizeOrNot(bool useCachedResize) { m_useCachedResize = useCachedResize; }
//this is a private constructor only used internally to initialize a blank matrix
template <class ElemType>
Matrix<ElemType>::Matrix(const MatrixFlags matrixFlags, const MatrixType matrixType, const MatrixFormat matrixFormat, DEVICEID_TYPE deviceID)
@ -1829,7 +1805,7 @@ void Matrix<ElemType>::Resize(const size_t numRows, const size_t numCols, const
// TODO: should this function test whether the size is changing, and skip if it isn't? We have at least one explicit test for this code calling this (recurrent node)
DISPATCH_MATRIX_ON_FLAG_USEBOTH_4BOTH(this,
{ m_CPUMatrix->Resize(numRows, numCols, growOnly); },
{ m_GPUMatrix->Resize(numRows, numCols, growOnly, m_useCachedResize); },
{ m_GPUMatrix->Resize(numRows, numCols, growOnly); },
{ m_CPUSparseMatrix->RequireSizeAndAllocate(numRows, numCols, numNZElemToReserve, growOnly, false); },
{ m_GPUSparseMatrix->RequireSizeAndAllocate(numRows, numCols, numNZElemToReserve, growOnly, false); });
#ifdef _DEBUG
@ -5736,6 +5712,51 @@ Matrix<ElemType>& Matrix<ElemType>::AssignSequenceError(const ElemType hsmoothin
NOT_IMPLEMENTED);
return *this;
}
// Calculate CTC score
// prob (input): the posterior output from the network
// alpha, beta (output): alpha and beta for forward-backward calculation.
// phoneSeq (input): phone ID sequence for each utterance in this minibatch, each col is one utterance
// phoneBound (input): phone boundary (frame index) of each phone for each utterance in this minibatch, each col is one utterance
// totalScore (output): total CTC score
// uttToChanInd (input): map from utterance ID to minibatch channel ID. We need this because each channel may contain more than one utterance.
// uttBeginFrame (input): the position of the first frame of each utterance in the minibatch channel. We need this because each channel may contain more than one utterance.
// uttFrameNum (input): the frame number of each utterance. The size of this vector = the number of all utterances in this minibatch
// uttPhoneNum (input): the phone number of each utterance. The size of this vector = the number of all utterances in this minibatch
// numParallelSequences (input): num of parallel sequences
// mbsize (input): the maximum channel frame number
// delayConstraint -- label output delay constraint introduced during training that allows shorter delay during inference. It uses the original time information to enforce that CTC tokens only get aligned within a time margin.
//                    Setting this parameter smaller will result in shorter delay between label outputs during decoding, but may hurt accuracy.
// delayConstraint=-1 means no constraint
template<class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignCTCScore(const Matrix<ElemType>& prob, Matrix<ElemType>& alpha, Matrix<ElemType>& beta,
const Matrix<ElemType>& phoneSeq, const Matrix<ElemType>& phoneBound, ElemType &totalScore, const std::vector<size_t> & uttToChanInd,
const std::vector<size_t> & uttBeginFrame, const std::vector<size_t> & uttFrameNum, const std::vector<size_t> & uttPhoneNum,
const size_t numParallelSequences, const size_t mbsize, const int delayConstraint, const bool isColWise)
{
DecideAndMoveToRightDevice(prob, *this);
alpha.Resize(phoneSeq.GetNumRows(), prob.GetNumCols());
beta.Resize(phoneSeq.GetNumRows(), prob.GetNumCols());
Resize(prob.GetNumRows(), prob.GetNumCols());
alpha.SetValue(LZERO);
beta.SetValue(LZERO);
SetValue(LZERO);
SwitchToMatrixType(prob.GetMatrixType(), prob.GetFormat(), false);
DISPATCH_MATRIX_ON_FLAG(&prob,
this,
this->m_CPUMatrix->AssignCTCScore(*prob.m_CPUMatrix, *alpha.m_CPUMatrix, *beta.m_CPUMatrix, *phoneSeq.m_CPUMatrix, *phoneBound.m_CPUMatrix, totalScore,
uttToChanInd, uttBeginFrame, uttFrameNum, uttPhoneNum, numParallelSequences, mbsize, delayConstraint, isColWise),
this->m_GPUMatrix->AssignCTCScore(*prob.m_GPUMatrix, *alpha.m_GPUMatrix, *beta.m_GPUMatrix, *phoneSeq.m_GPUMatrix, *phoneBound.m_GPUMatrix, totalScore,
uttToChanInd, uttBeginFrame, uttFrameNum, uttPhoneNum, numParallelSequences, mbsize, delayConstraint, isColWise),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED
);
return *this;
}
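
A hypothetical call site (illustrative only, not from this commit), showing the shape and metadata contract the dispatch above expects; alpha and beta are resized internally to (maxPhoneNum x T*channels):

template <class ElemType>
ElemType ComputeCTC(Matrix<ElemType>& ctcPosterior,        // output: per-frame CTC posteriors
                    const Matrix<ElemType>& logProb,       // log posteriors from the network
                    const Matrix<ElemType>& phoneSeq,      // blank-augmented labels, one column per utterance
                    const Matrix<ElemType>& phoneBoundary, // per-phone start frames, one column per utterance
                    const std::vector<size_t>& uttToChanInd,
                    const std::vector<size_t>& uttBeginFrame,
                    const std::vector<size_t>& uttFrameNum,
                    const std::vector<size_t>& uttPhoneNum,
                    size_t numParallelSequences, size_t mbSize)
{
    Matrix<ElemType> alpha(logProb.GetDeviceId()), beta(logProb.GetDeviceId());
    ElemType totalScore = 0;
    ctcPosterior.AssignCTCScore(logProb, alpha, beta, phoneSeq, phoneBoundary, totalScore,
                                uttToChanInd, uttBeginFrame, uttFrameNum, uttPhoneNum,
                                numParallelSequences, mbSize, /*delayConstraint=*/-1, /*isColWise=*/true);
    return totalScore;
}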
#pragma endregion Static BLAS Functions
// TensorView currently does not interface with sparse matrices. For now, we just catch this and throw.

View file

@ -87,9 +87,6 @@ private:
mutable size_t m_numTimesMatrixTypeChanged;
mutable int m_devicesTransferedTo[2]; // TODO: what is this for? Seems only diagnostics
// whether to use cached memory Resize() or not
static bool m_useCachedResize;
// Moves matrix from device id_from to device with id_to. This method doesn't change preferred device Id
void _transferFromDeviceToDevice(int id_from, int id_to, bool isBeingMoved = true, bool emptyTransfer = false) const;
// Moves matrix from current device to device with id_to. This method doesn't change preferred device Id
@ -143,8 +140,6 @@ public:
SetDataLocation(GetDeviceId() < 0 ? CurrentDataLocation::CPU : CurrentDataLocation::GPU, GetMatrixType());
}
static void UseCachedResizeOrNot(bool useCachedResize);
private:
Matrix(const MatrixFlags matrixFlags, const MatrixType matrixType, const MatrixFormat matrixFormat, DEVICEID_TYPE deviceID); // only used internally to initialize a blank matrix
Matrix(const MatrixFlags matrixFlags, const MatrixType matrixType, DEVICEID_TYPE deviceID); // only used internally to initialize a blank matrix
@ -382,6 +377,11 @@ public:
// sequence training
Matrix<ElemType>& DropFrame(const Matrix<ElemType>& label, const Matrix<ElemType>& gamma, const ElemType& threshhold);
Matrix<ElemType>& AssignSequenceError(const ElemType hsmoothingWeight, const Matrix<ElemType>& label, const Matrix<ElemType>& dnnoutput, const Matrix<ElemType>& gamma, ElemType alpha);
Matrix<ElemType>& AssignCTCScore(const Matrix<ElemType>& prob, Matrix<ElemType>& alpha, Matrix<ElemType>& beta, const Matrix<ElemType>& phoneSeq, const Matrix<ElemType>& phoneBound, ElemType &totalScore,
const vector<size_t> & extraUttMap, const vector<size_t> & uttBeginFrame, const vector<size_t> & uttFrameNum, const vector<size_t> & uttPhoneNum, const size_t samplesInRecurrentStep,
const size_t mbSize, const int delayConstraint, const bool isColWise);
Matrix<ElemType>& InplaceSqrt();
Matrix<ElemType>& AssignSqrtOf(const Matrix<ElemType>& a);

View file

@ -26,10 +26,8 @@ NcclComm::NcclComm(int deviceId, const MPIWrapperPtr& mpi)
return;
size_t numRanks = mpi->NumNodesInUse();
MPI_Comm mpiComm = mpi->Communicator();
std::vector<int> allDevs(numRanks);
MPI_Allgather(&deviceId, 1, MPI_INT, allDevs.data(), 1, MPI_INT, mpiComm)
|| MpiFail("NcclComm: MPI_Allgather");
mpi->Allgather(&deviceId, 1, MPI_INT, allDevs.data(), 1, MPI_INT);
for (size_t r = 0; r<numRanks; r++)
{
@ -53,8 +51,7 @@ NcclComm::NcclComm(int deviceId, const MPIWrapperPtr& mpi)
if (res != ncclSuccess)
RuntimeError("NcclComm failed to obtain ncclUniqueId: %s", ncclGetErrorString(res));
MPI_Bcast(&ncclId, NCCL_UNIQUE_ID_BYTES, MPI_CHAR, 0, mpiComm)
|| MpiFail("NcclComm: MPI_Bcase");
mpi->Bcast(&ncclId, NCCL_UNIQUE_ID_BYTES, MPI_CHAR, 0);
PrepareDevice(deviceId);
res = ncclCommInitRank(&m_ncclComm, numRanks, ncclId, mpi->CurrentNodeRank());

Some files were not shown because too many files changed in this diff.