merged from master
This commit is contained in:
Коммит
65bf17f4f4
|
@ -119,6 +119,12 @@
|
|||
<LinkIncremental>$(DebugBuild)</LinkIncremental>
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemDefinitionGroup>
|
||||
<ClCompile>
|
||||
<PreprocessorDefinitions>HAS_MPI=1</PreprocessorDefinitions>
|
||||
</ClCompile>
|
||||
</ItemDefinitionGroup>
|
||||
|
||||
<ItemDefinitionGroup Condition="'$(ConfigurationType)' == 'StaticLibrary'">
|
||||
<ClCompile>
|
||||
<WarningLevel>Level4</WarningLevel>
|
||||
|
|
2
CNTK.sln
2
CNTK.sln
|
@ -1484,7 +1484,7 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "BrainScript", "BrainScript"
|
|||
Examples\Image\Classification\GoogLeNet\BN-Inception\BrainScript\InceptionLayers.bs = Examples\Image\Classification\GoogLeNet\BN-Inception\BrainScript\InceptionLayers.bs
|
||||
EndProjectSection
|
||||
EndProject
|
||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTKLibraryCPPEvalExamplesTest", "Tests\EndToEndTests\EvalClientTests\CNTKLibraryCPPEvalExamplesTest\CNTKLibraryCPPEvalExamplesTest.vcxproj.vcxproj", "{D771A06D-CC25-4582-B5CD-D2A4782BB005}"
|
||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTKLibraryCPPEvalExamplesTest", "Tests\EndToEndTests\EvalClientTests\CNTKLibraryCPPEvalExamplesTest\CNTKLibraryCPPEvalExamplesTest.vcxproj", "{D771A06D-CC25-4582-B5CD-D2A4782BB005}"
|
||||
ProjectSection(ProjectDependencies) = postProject
|
||||
{91973E60-A7BE-4C86-8FDB-59C88A0B3715} = {91973E60-A7BE-4C86-8FDB-59C88A0B3715}
|
||||
{E5606ECE-48CA-4464-BB12-09D81D02B9EF} = {E5606ECE-48CA-4464-BB12-09D81D02B9EF}
|
||||
|
|
|
@ -0,0 +1,20 @@
|
|||
//
|
||||
// Copyright (c) Microsoft. All rights reserved.
|
||||
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
|
||||
//
|
||||
// CNTKLibraryCPPEvalCPUOnlyExamples.cpp : Sample application shows how to evaluate a model using CNTK V2 API.
|
||||
//
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
void MultiThreadsEvaluation(bool);
|
||||
|
||||
int main()
|
||||
{
|
||||
|
||||
fprintf(stderr, "\n##### Run CNTKLibraryCPPEvalCPUOnlyExamples on CPU. #####\n");
|
||||
MultiThreadsEvaluation(false);
|
||||
|
||||
fprintf(stderr, "Evaluation complete.\n");
|
||||
fflush(stderr);
|
||||
}
|
|
@ -1,20 +1,27 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|x64">
|
||||
<Configuration>Debug</Configuration>
|
||||
<Platform>x64</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Release|x64">
|
||||
<Configuration>Release</Configuration>
|
||||
<Platform>x64</Platform>
|
||||
</ProjectConfiguration>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClCompile Include="CNTKLibraryCPPEvalExamples.cpp" />
|
||||
<ClCompile Include="CNTKLibraryCPPEvalCPUOnlyExamples.cpp" />
|
||||
<ClCompile Include="EvalMultithreads.cpp" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<None Include="packages.config" />
|
||||
</ItemGroup>
|
||||
<PropertyGroup Label="Globals">
|
||||
<ProjectGuid>{D771A06D-CC25-4582-B5CD-D2A4782BB005}</ProjectGuid>
|
||||
<Keyword>Win32Proj</Keyword>
|
||||
<RootNamespace>CNTKLibraryCPPEvalExamples</RootNamespace>
|
||||
<ProjectName>CNTKLibraryCPPEvalExamples</ProjectName>
|
||||
<RootNamespace>CNTKLibraryCPPEvalCPUOnlyExamples</RootNamespace>
|
||||
<ProjectName>CNTKLibraryCPPEvalCPUOnlyExamples</ProjectName>
|
||||
</PropertyGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
|
||||
|
@ -24,6 +31,13 @@
|
|||
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>true</UseDebugLibraries>
|
||||
<PlatformToolset>v140</PlatformToolset>
|
||||
<WholeProgramOptimization>false</WholeProgramOptimization>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||
<ImportGroup Label="ExtensionSettings">
|
||||
</ImportGroup>
|
||||
|
@ -31,12 +45,14 @@
|
|||
<PropertyGroup>
|
||||
<OutDir>$(SolutionDir)..\..\$(Platform)\$(ProjectName).$(Configuration)\</OutDir>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
</PropertyGroup>
|
||||
<ItemDefinitionGroup>
|
||||
<ClCompile>
|
||||
<WarningLevel>Level4</WarningLevel>
|
||||
<PrecompiledHeader>NotUsing</PrecompiledHeader>
|
||||
<PreprocessorDefinitions>WIN32;_CONSOLE;UNICODE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(SolutionDir)..\..\Include</AdditionalIncludeDirectories>
|
||||
<TreatWarningAsError>true</TreatWarningAsError>
|
||||
<SDLCheck>true</SDLCheck>
|
||||
<MultiProcessorCompilation>true</MultiProcessorCompilation>
|
||||
|
@ -45,8 +61,6 @@
|
|||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<AdditionalLibraryDirectories>$(SolutionDir)\..\..\cntk</AdditionalLibraryDirectories>
|
||||
<AdditionalDependencies>CNTKLibrary-2.0.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||
<OutputFile>$(OutDir)$(TargetName)$(TargetExt)</OutputFile>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
|
@ -55,7 +69,6 @@
|
|||
<Optimization>MaxSpeed</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>NDEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
|
||||
<AdditionalOptions>/d2Zi+ %(AdditionalOptions)</AdditionalOptions>
|
||||
<RuntimeLibrary Condition="'$(Configuration)|$(Platform)'=='Release|x64'">MultiThreaded</RuntimeLibrary>
|
||||
|
@ -66,7 +79,32 @@
|
|||
<OptimizeReferences>true</OptimizeReferences>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<ClCompile>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
|
||||
<AdditionalOptions>/d2Zi+ %(AdditionalOptions)</AdditionalOptions>
|
||||
<RuntimeLibrary Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">MultiThreadedDebug</RuntimeLibrary>
|
||||
<BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
|
||||
<MinimalRebuild>false</MinimalRebuild>
|
||||
<DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
||||
<OptimizeReferences>true</OptimizeReferences>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
<Import Project="..\packages\CNTK.CPUOnly.2.0-beta11\build\native\CNTK.CPUOnly.targets" Condition="Exists('..\packages\CNTK.CPUOnly.2.0-beta11\build\native\CNTK.CPUOnly.targets')" />
|
||||
</ImportGroup>
|
||||
<Target Name="EnsureNuGetPackageBuildImports" BeforeTargets="PrepareForBuild">
|
||||
<PropertyGroup>
|
||||
<ErrorText>This project references NuGet package(s) that are missing on this computer. Use NuGet Package Restore to download them. For more information, see http://go.microsoft.com/fwlink/?LinkID=322105. The missing file is {0}.</ErrorText>
|
||||
</PropertyGroup>
|
||||
<Error Condition="!Exists('..\packages\CNTK.CPUOnly.2.0-beta11\build\native\CNTK.CPUOnly.targets')" Text="$([System.String]::Format('$(ErrorText)', '..\packages\CNTK.CPUOnly.2.0-beta11\build\native\CNTK.CPUOnly.targets'))" />
|
||||
</Target>
|
||||
</Project>
|
|
@ -15,11 +15,14 @@
|
|||
</Filter>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClCompile Include="CNTKLibraryCPPEvalExamples.cpp">
|
||||
<ClCompile Include="CNTKLibraryCPPEvalCPUOnlyExamples.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="EvalMultithreads.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<None Include="packages.config" />
|
||||
</ItemGroup>
|
||||
</Project>
|
|
@ -0,0 +1,4 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<packages>
|
||||
<package id="CNTK.CPUOnly" version="2.0-beta11" targetFramework="native" />
|
||||
</packages>
|
|
@ -1,30 +0,0 @@
|
|||
//
|
||||
// Copyright (c) Microsoft. All rights reserved.
|
||||
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
|
||||
//
|
||||
// CNTKLibraryCPPevalExamples.cpp : Sample application shows how to evaluate a model using CNTK V2 API.
|
||||
//
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
// define GPU_AVAILABLE, if you want to run evaluation on a GPU device. You also need CNTK GPU binaries.
|
||||
// undefine GPU_AVAILABLE, if you want to run evaluation on a CPU device.
|
||||
// #define GPU_AVAILABLE
|
||||
|
||||
void MultiThreadsEvaluation(bool);
|
||||
|
||||
int main()
|
||||
{
|
||||
|
||||
#ifdef GPU_AVAILABLE
|
||||
fprintf(stderr, "\n##### Run eval on GPU device. #####\n");
|
||||
MultiThreadsEvaluation(true);
|
||||
#else
|
||||
fprintf(stderr, "\n##### Run eval on CPU device. #####\n");
|
||||
MultiThreadsEvaluation(false);
|
||||
#endif
|
||||
|
||||
fprintf(stderr, "Evaluation complete.\n");
|
||||
|
||||
fflush(stderr);
|
||||
}
|
|
@ -0,0 +1,20 @@
|
|||
//
|
||||
// Copyright (c) Microsoft. All rights reserved.
|
||||
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
|
||||
//
|
||||
// CNTKLibraryCPPEvalGPUExamples.cpp : Sample application shows how to evaluate a model using CNTK V2 API.
|
||||
//
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
void MultiThreadsEvaluation(bool);
|
||||
|
||||
int main()
|
||||
{
|
||||
|
||||
fprintf(stderr, "\n##### Run CNTKLibraryCPPEvalGPUExamples on CPU and GPU. #####\n");
|
||||
MultiThreadsEvaluation(true);
|
||||
|
||||
fprintf(stderr, "Evaluation complete.\n");
|
||||
fflush(stderr);
|
||||
}
|
|
@ -0,0 +1,110 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|x64">
|
||||
<Configuration>Debug</Configuration>
|
||||
<Platform>x64</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Release|x64">
|
||||
<Configuration>Release</Configuration>
|
||||
<Platform>x64</Platform>
|
||||
</ProjectConfiguration>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClCompile Include="..\CNTKLibraryCPPEvalCPUOnlyExamples\EvalMultithreads.cpp" />
|
||||
<ClCompile Include="CNTKLibraryCPPEvalGPUExamples.cpp" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<None Include="packages.config" />
|
||||
</ItemGroup>
|
||||
<PropertyGroup Label="Globals">
|
||||
<ProjectGuid>{13489884-3A6A-4023-8CF1-D8C78DDAF952}</ProjectGuid>
|
||||
<Keyword>Win32Proj</Keyword>
|
||||
<RootNamespace>CNTKLibraryCPPEvalGPUExamples</RootNamespace>
|
||||
<ProjectName>CNTKLibraryCPPEvalGPUExamples</ProjectName>
|
||||
</PropertyGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>false</UseDebugLibraries>
|
||||
<PlatformToolset>v140</PlatformToolset>
|
||||
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>true</UseDebugLibraries>
|
||||
<PlatformToolset>v140</PlatformToolset>
|
||||
<WholeProgramOptimization>false</WholeProgramOptimization>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||
<ImportGroup Label="ExtensionSettings">
|
||||
</ImportGroup>
|
||||
<PropertyGroup Label="UserMacros" />
|
||||
<PropertyGroup>
|
||||
<OutDir>$(SolutionDir)..\..\$(Platform)\$(ProjectName).$(Configuration)\</OutDir>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
</PropertyGroup>
|
||||
<ItemDefinitionGroup>
|
||||
<ClCompile>
|
||||
<WarningLevel>Level4</WarningLevel>
|
||||
<PrecompiledHeader>NotUsing</PrecompiledHeader>
|
||||
<PreprocessorDefinitions>WIN32;_CONSOLE;UNICODE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<TreatWarningAsError>true</TreatWarningAsError>
|
||||
<SDLCheck>true</SDLCheck>
|
||||
<MultiProcessorCompilation>true</MultiProcessorCompilation>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
<OpenMPSupport>true</OpenMPSupport>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<OutputFile>$(OutDir)$(TargetName)$(TargetExt)</OutputFile>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<ClCompile>
|
||||
<Optimization>MaxSpeed</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
|
||||
<AdditionalOptions>/d2Zi+ %(AdditionalOptions)</AdditionalOptions>
|
||||
<RuntimeLibrary Condition="'$(Configuration)|$(Platform)'=='Release|x64'">MultiThreaded</RuntimeLibrary>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
||||
<OptimizeReferences>true</OptimizeReferences>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<ClCompile>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
|
||||
<AdditionalOptions>/d2Zi+ %(AdditionalOptions)</AdditionalOptions>
|
||||
<RuntimeLibrary Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">MultiThreadedDebug</RuntimeLibrary>
|
||||
<BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
|
||||
<MinimalRebuild>false</MinimalRebuild>
|
||||
<DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
||||
<OptimizeReferences>true</OptimizeReferences>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
<Import Project="..\packages\CNTK.GPU.2.0-beta11\build\native\CNTK.GPU.targets" Condition="Exists('..\packages\CNTK.GPU.2.0-beta11\build\native\CNTK.GPU.targets')" />
|
||||
</ImportGroup>
|
||||
<Target Name="EnsureNuGetPackageBuildImports" BeforeTargets="PrepareForBuild">
|
||||
<PropertyGroup>
|
||||
<ErrorText>This project references NuGet package(s) that are missing on this computer. Use NuGet Package Restore to download them. For more information, see http://go.microsoft.com/fwlink/?LinkID=322105. The missing file is {0}.</ErrorText>
|
||||
</PropertyGroup>
|
||||
<Error Condition="!Exists('..\packages\CNTK.GPU.2.0-beta11\build\native\CNTK.GPU.targets')" Text="$([System.String]::Format('$(ErrorText)', '..\packages\CNTK.GPU.2.0-beta11\build\native\CNTK.GPU.targets'))" />
|
||||
</Target>
|
||||
</Project>
|
|
@ -0,0 +1,28 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup>
|
||||
<Filter Include="Source Files">
|
||||
<UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
|
||||
<Extensions>cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
|
||||
</Filter>
|
||||
<Filter Include="Header Files">
|
||||
<UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
|
||||
<Extensions>h;hh;hpp;hxx;hm;inl;inc;xsd</Extensions>
|
||||
</Filter>
|
||||
<Filter Include="Resource Files">
|
||||
<UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
|
||||
<Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
|
||||
</Filter>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClCompile Include="..\CNTKLibraryCPPEvalCPUOnlyExamples\EvalMultithreads.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="CNTKLibraryCPPEvalGPUExamples.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<None Include="packages.config" />
|
||||
</ItemGroup>
|
||||
</Project>
|
|
@ -0,0 +1,4 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<packages>
|
||||
<package id="CNTK.GPU" version="2.0-beta11" targetFramework="native" />
|
||||
</packages>
|
|
@ -39,7 +39,7 @@
|
|||
<CodeAnalysisRuleSet>MinimumRecommendedRules.ruleset</CodeAnalysisRuleSet>
|
||||
</PropertyGroup>
|
||||
<ItemGroup>
|
||||
<Reference Include="CNTKLibraryManaged-2.0, Version=1.0.0.0, Culture=neutral, processorArchitecture=AMD64">
|
||||
<Reference Include="CNTKLibraryManaged-2.0, Version=1.0.0.0, Culture=neutral, PublicKeyToken=21fff2ec8197defe, processorArchitecture=AMD64">
|
||||
<HintPath>..\packages\CNTK.CPUOnly.2.0-beta11\lib\net45\x64\CNTKLibraryManaged-2.0.dll</HintPath>
|
||||
<Private>True</Private>
|
||||
</Reference>
|
||||
|
@ -72,4 +72,4 @@
|
|||
<Target Name="AfterBuild">
|
||||
</Target>
|
||||
-->
|
||||
</Project>
|
||||
</Project>
|
|
@ -1,4 +1,4 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<packages>
|
||||
<package id="CNTK.CPUOnly" version="2.0-beta11" targetFramework="net45" />
|
||||
</packages>
|
||||
</packages>
|
|
@ -39,7 +39,7 @@
|
|||
<CodeAnalysisRuleSet>MinimumRecommendedRules.ruleset</CodeAnalysisRuleSet>
|
||||
</PropertyGroup>
|
||||
<ItemGroup>
|
||||
<Reference Include="CNTKLibraryManaged-2.0, Version=1.0.0.0, Culture=neutral, processorArchitecture=AMD64">
|
||||
<Reference Include="CNTKLibraryManaged-2.0, Version=1.0.0.0, Culture=neutral, PublicKeyToken=a82c1f3f67b62253, processorArchitecture=AMD64">
|
||||
<HintPath>..\packages\CNTK.GPU.2.0-beta11\lib\net45\x64\CNTKLibraryManaged-2.0.dll</HintPath>
|
||||
<Private>True</Private>
|
||||
</Reference>
|
||||
|
@ -76,4 +76,4 @@
|
|||
<Target Name="AfterBuild">
|
||||
</Target>
|
||||
-->
|
||||
</Project>
|
||||
</Project>
|
|
@ -1,4 +1,4 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<packages>
|
||||
<package id="CNTK.GPU" version="2.0-beta11" targetFramework="net45" />
|
||||
</packages>
|
||||
</packages>
|
|
@ -3,19 +3,22 @@ Microsoft Visual Studio Solution File, Format Version 12.00
|
|||
# Visual Studio 14
|
||||
VisualStudioVersion = 14.0.25420.1
|
||||
MinimumVisualStudioVersion = 10.0.40219.1
|
||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTKLibraryCPPEvalExamples", "CNTKLibraryCPPEvalExamples\CNTKLibraryCPPEvalExamples.vcxproj", "{D771A06D-CC25-4582-B5CD-D2A4782BB005}"
|
||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTKLibraryCPPEvalCPUOnlyExamples", "CNTKLibraryCPPEvalCPUOnlyExamples\CNTKLibraryCPPEvalCPUOnlyExamples.vcxproj", "{D771A06D-CC25-4582-B5CD-D2A4782BB005}"
|
||||
EndProject
|
||||
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "CNTKLibraryCSEvalCPUOnlyExamples", "CNTKLibraryCSEvalCPUOnlyExamples\CNTKLibraryCSEvalCPUOnlyExamples.csproj", "{8AAD7322-10B1-48C3-9BC7-005A7910C5E6}"
|
||||
EndProject
|
||||
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "CNTKLibraryCSEvalGPUExamples", "CNTKLibraryCSEvalGPUExamples\CNTKLibraryCSEvalGPUExamples.csproj", "{307E5BAC-DA03-45D2-ADEC-FE6620090109}"
|
||||
EndProject
|
||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTKLibraryCPPEvalGPUExamples", "CNTKLibraryCPPEvalGPUExamples\CNTKLibraryCPPEvalGPUExamples.vcxproj", "{13489884-3A6A-4023-8CF1-D8C78DDAF952}"
|
||||
EndProject
|
||||
Global
|
||||
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||
Debug|x64 = Debug|x64
|
||||
Release|x64 = Release|x64
|
||||
EndGlobalSection
|
||||
GlobalSection(ProjectConfigurationPlatforms) = postSolution
|
||||
{D771A06D-CC25-4582-B5CD-D2A4782BB005}.Debug|x64.ActiveCfg = Release|x64
|
||||
{D771A06D-CC25-4582-B5CD-D2A4782BB005}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{D771A06D-CC25-4582-B5CD-D2A4782BB005}.Debug|x64.Build.0 = Debug|x64
|
||||
{D771A06D-CC25-4582-B5CD-D2A4782BB005}.Release|x64.ActiveCfg = Release|x64
|
||||
{D771A06D-CC25-4582-B5CD-D2A4782BB005}.Release|x64.Build.0 = Release|x64
|
||||
{8AAD7322-10B1-48C3-9BC7-005A7910C5E6}.Debug|x64.ActiveCfg = Debug|x64
|
||||
|
@ -26,6 +29,10 @@ Global
|
|||
{307E5BAC-DA03-45D2-ADEC-FE6620090109}.Debug|x64.Build.0 = Debug|x64
|
||||
{307E5BAC-DA03-45D2-ADEC-FE6620090109}.Release|x64.ActiveCfg = Release|x64
|
||||
{307E5BAC-DA03-45D2-ADEC-FE6620090109}.Release|x64.Build.0 = Release|x64
|
||||
{13489884-3A6A-4023-8CF1-D8C78DDAF952}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{13489884-3A6A-4023-8CF1-D8C78DDAF952}.Debug|x64.Build.0 = Debug|x64
|
||||
{13489884-3A6A-4023-8CF1-D8C78DDAF952}.Release|x64.ActiveCfg = Release|x64
|
||||
{13489884-3A6A-4023-8CF1-D8C78DDAF952}.Release|x64.Build.0 = Release|x64
|
||||
EndGlobalSection
|
||||
GlobalSection(SolutionProperties) = preSolution
|
||||
HideSolutionNode = FALSE
|
||||
|
|
|
@ -1,6 +1,10 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|x64">
|
||||
<Configuration>Debug</Configuration>
|
||||
<Platform>x64</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Release|x64">
|
||||
<Configuration>Release</Configuration>
|
||||
<Platform>x64</Platform>
|
||||
|
@ -20,17 +24,30 @@
|
|||
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Label="Configuration" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>true</UseDebugLibraries>
|
||||
<PlatformToolset>v140</PlatformToolset>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||
<ImportGroup Label="ExtensionSettings">
|
||||
</ImportGroup>
|
||||
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<PropertyGroup Label="UserMacros" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<OutDir>$(SolutionDir)..\..\$(Platform)\$(ProjectName).$(Configuration)\</OutDir>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<OutDir>$(SolutionDir)..\..\$(Platform)\$(ProjectName).$(Configuration)\</OutDir>
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
</PropertyGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<ClCompile>
|
||||
<WarningLevel>Level4</WarningLevel>
|
||||
|
@ -38,29 +55,65 @@
|
|||
<Optimization>MaxSpeed</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(SolutionDir)..\..\Include</AdditionalIncludeDirectories>
|
||||
<PreprocessorDefinitions>WIN32;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<TreatWarningAsError>true</TreatWarningAsError>
|
||||
<SDLCheck>true</SDLCheck>
|
||||
<MultiProcessorCompilation>true</MultiProcessorCompilation>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
<OpenMPSupport>true</OpenMPSupport>
|
||||
<UseDebugLibraries>false</UseDebugLibraries>
|
||||
<RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
||||
<OptimizeReferences>true</OptimizeReferences>
|
||||
<AdditionalLibraryDirectories>$(SolutionDir)\..\..\cntk</AdditionalLibraryDirectories>
|
||||
<AdditionalDependencies>EvalDll.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||
<OutputFile>$(OutDir)$(TargetName)$(TargetExt)</OutputFile>
|
||||
<Profile>true</Profile>
|
||||
<Profile>false</Profile>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<ClCompile>
|
||||
<WarningLevel>Level4</WarningLevel>
|
||||
<PrecompiledHeader>NotUsing</PrecompiledHeader>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<TreatWarningAsError>true</TreatWarningAsError>
|
||||
<SDLCheck>true</SDLCheck>
|
||||
<MultiProcessorCompilation>true</MultiProcessorCompilation>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
<OpenMPSupport>true</OpenMPSupport>
|
||||
<UseDebugLibraries>true</UseDebugLibraries>
|
||||
<RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
|
||||
<BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
|
||||
<DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
||||
<OptimizeReferences>true</OptimizeReferences>
|
||||
<OutputFile>$(OutDir)$(TargetName)$(TargetExt)</OutputFile>
|
||||
<Profile>false</Profile>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemGroup>
|
||||
<ClCompile Include="CPPEvalClient.cpp" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<None Include="packages.config" />
|
||||
</ItemGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
<Import Project="..\packages\Microsoft.Research.CNTK.CpuEval-mkl.2.0-beta11\build\native\Microsoft.Research.CNTK.CpuEval-mkl.targets" Condition="Exists('..\packages\Microsoft.Research.CNTK.CpuEval-mkl.2.0-beta11\build\native\Microsoft.Research.CNTK.CpuEval-mkl.targets')" />
|
||||
</ImportGroup>
|
||||
</Project>
|
||||
<Target Name="EnsureNuGetPackageBuildImports" BeforeTargets="PrepareForBuild">
|
||||
<PropertyGroup>
|
||||
<ErrorText>This project references NuGet package(s) that are missing on this computer. Use NuGet Package Restore to download them. For more information, see http://go.microsoft.com/fwlink/?LinkID=322105. The missing file is {0}.</ErrorText>
|
||||
</PropertyGroup>
|
||||
<Error Condition="!Exists('..\packages\Microsoft.Research.CNTK.CpuEval-mkl.2.0-beta11\build\native\Microsoft.Research.CNTK.CpuEval-mkl.targets')" Text="$([System.String]::Format('$(ErrorText)', '..\packages\Microsoft.Research.CNTK.CpuEval-mkl.2.0-beta11\build\native\Microsoft.Research.CNTK.CpuEval-mkl.targets'))" />
|
||||
</Target>
|
||||
</Project>
|
|
@ -19,4 +19,7 @@
|
|||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
</ItemGroup>
|
||||
</Project>
|
||||
<ItemGroup>
|
||||
<None Include="packages.config" />
|
||||
</ItemGroup>
|
||||
</Project>
|
|
@ -0,0 +1,4 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<packages>
|
||||
<package id="Microsoft.Research.CNTK.CpuEval-mkl" version="2.0-beta11" targetFramework="native" />
|
||||
</packages>
|
|
@ -1,6 +1,10 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|x64">
|
||||
<Configuration>Debug</Configuration>
|
||||
<Platform>x64</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Release|x64">
|
||||
<Configuration>Release</Configuration>
|
||||
<Platform>x64</Platform>
|
||||
|
@ -20,17 +24,30 @@
|
|||
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>true</UseDebugLibraries>
|
||||
<PlatformToolset>v140</PlatformToolset>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||
<ImportGroup Label="ExtensionSettings">
|
||||
</ImportGroup>
|
||||
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<PropertyGroup Label="UserMacros" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<OutDir>$(SolutionDir)..\..\$(Platform)\$(ProjectName).$(Configuration)\</OutDir>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<OutDir>$(SolutionDir)..\..\$(Platform)\$(ProjectName).$(Configuration)\</OutDir>
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
</PropertyGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<ClCompile>
|
||||
<WarningLevel>Level4</WarningLevel>
|
||||
|
@ -38,29 +55,63 @@
|
|||
<Optimization>MaxSpeed</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(SolutionDir)..\..\Include</AdditionalIncludeDirectories>
|
||||
<PreprocessorDefinitions>WIN32;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<TreatWarningAsError>true</TreatWarningAsError>
|
||||
<SDLCheck>true</SDLCheck>
|
||||
<MultiProcessorCompilation>true</MultiProcessorCompilation>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
<OpenMPSupport>true</OpenMPSupport>
|
||||
<RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
||||
<OptimizeReferences>true</OptimizeReferences>
|
||||
<AdditionalLibraryDirectories>$(SolutionDir)\..\..\cntk</AdditionalLibraryDirectories>
|
||||
<AdditionalDependencies>EvalDll.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||
<OutputFile>$(OutDir)$(TargetName)$(TargetExt)</OutputFile>
|
||||
<Profile>true</Profile>
|
||||
<Profile>false</Profile>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<ClCompile>
|
||||
<WarningLevel>Level4</WarningLevel>
|
||||
<PrecompiledHeader>NotUsing</PrecompiledHeader>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<TreatWarningAsError>true</TreatWarningAsError>
|
||||
<SDLCheck>true</SDLCheck>
|
||||
<MultiProcessorCompilation>true</MultiProcessorCompilation>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
<OpenMPSupport>true</OpenMPSupport>
|
||||
<RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
|
||||
<BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
|
||||
<DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
||||
<OptimizeReferences>true</OptimizeReferences>
|
||||
<OutputFile>$(OutDir)$(TargetName)$(TargetExt)</OutputFile>
|
||||
<Profile>false</Profile>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemGroup>
|
||||
<ClCompile Include="CPPEvalExtendedClient.cpp" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<None Include="packages.config" />
|
||||
</ItemGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
<Import Project="..\packages\Microsoft.Research.CNTK.CpuEval-mkl.2.0-beta11\build\native\Microsoft.Research.CNTK.CpuEval-mkl.targets" Condition="Exists('..\packages\Microsoft.Research.CNTK.CpuEval-mkl.2.0-beta11\build\native\Microsoft.Research.CNTK.CpuEval-mkl.targets')" />
|
||||
</ImportGroup>
|
||||
</Project>
|
||||
<Target Name="EnsureNuGetPackageBuildImports" BeforeTargets="PrepareForBuild">
|
||||
<PropertyGroup>
|
||||
<ErrorText>This project references NuGet package(s) that are missing on this computer. Use NuGet Package Restore to download them. For more information, see http://go.microsoft.com/fwlink/?LinkID=322105. The missing file is {0}.</ErrorText>
|
||||
</PropertyGroup>
|
||||
<Error Condition="!Exists('..\packages\Microsoft.Research.CNTK.CpuEval-mkl.2.0-beta11\build\native\Microsoft.Research.CNTK.CpuEval-mkl.targets')" Text="$([System.String]::Format('$(ErrorText)', '..\packages\Microsoft.Research.CNTK.CpuEval-mkl.2.0-beta11\build\native\Microsoft.Research.CNTK.CpuEval-mkl.targets'))" />
|
||||
</Target>
|
||||
</Project>
|
|
@ -19,4 +19,7 @@
|
|||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<None Include="packages.config" />
|
||||
</ItemGroup>
|
||||
</Project>
|
|
@ -0,0 +1,4 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<packages>
|
||||
<package id="Microsoft.Research.CNTK.CpuEval-mkl" version="2.0-beta11" targetFramework="native" />
|
||||
</packages>
|
|
@ -95,4 +95,4 @@
|
|||
</PropertyGroup>
|
||||
<Error Condition="!Exists('..\packages\Microsoft.Research.CNTK.CpuEval-mkl.2.0-beta11\build\net45\Microsoft.Research.CNTK.CpuEval-mkl.targets')" Text="$([System.String]::Format('$(ErrorText)', '..\packages\Microsoft.Research.CNTK.CpuEval-mkl.2.0-beta11\build\net45\Microsoft.Research.CNTK.CpuEval-mkl.targets'))" />
|
||||
</Target>
|
||||
</Project>
|
||||
</Project>
|
|
@ -1,4 +1,4 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<packages>
|
||||
<package id="Microsoft.Research.CNTK.CpuEval-mkl" version="2.0-beta11" targetFramework="net45" />
|
||||
</packages>
|
||||
</packages>
|
|
@ -15,14 +15,16 @@ Global
|
|||
Release|x64 = Release|x64
|
||||
EndGlobalSection
|
||||
GlobalSection(ProjectConfigurationPlatforms) = postSolution
|
||||
{C81CE839-184C-42C7-BB1C-9D0ABA17078D}.Debug|x64.ActiveCfg = Release|x64
|
||||
{C81CE839-184C-42C7-BB1C-9D0ABA17078D}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{C81CE839-184C-42C7-BB1C-9D0ABA17078D}.Debug|x64.Build.0 = Debug|x64
|
||||
{C81CE839-184C-42C7-BB1C-9D0ABA17078D}.Release|x64.ActiveCfg = Release|x64
|
||||
{C81CE839-184C-42C7-BB1C-9D0ABA17078D}.Release|x64.Build.0 = Release|x64
|
||||
{92CCF4B9-BFED-4914-901A-CF1327B1A02D}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{92CCF4B9-BFED-4914-901A-CF1327B1A02D}.Debug|x64.Build.0 = Debug|x64
|
||||
{92CCF4B9-BFED-4914-901A-CF1327B1A02D}.Release|x64.ActiveCfg = Release|x64
|
||||
{92CCF4B9-BFED-4914-901A-CF1327B1A02D}.Release|x64.Build.0 = Release|x64
|
||||
{93ECB70B-FDDD-44B4-BD6A-D63E094C704B}.Debug|x64.ActiveCfg = Release|x64
|
||||
{93ECB70B-FDDD-44B4-BD6A-D63E094C704B}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{93ECB70B-FDDD-44B4-BD6A-D63E094C704B}.Debug|x64.Build.0 = Debug|x64
|
||||
{93ECB70B-FDDD-44B4-BD6A-D63E094C704B}.Release|x64.ActiveCfg = Release|x64
|
||||
{93ECB70B-FDDD-44B4-BD6A-D63E094C704B}.Release|x64.Build.0 = Release|x64
|
||||
EndGlobalSection
|
||||
|
|
|
@ -16,6 +16,7 @@ from cntk.utils import *
|
|||
from cntk.ops import *
|
||||
from cntk.distributed import data_parallel_distributed_learner, Communicator
|
||||
from cntk.io import ImageDeserializer, MinibatchSource, StreamDef, StreamDefs, FULL_DATA_SWEEP
|
||||
import cntk.io.transforms as xforms
|
||||
from cntk.layers import Placeholder, Block, Convolution2D, Activation, MaxPooling, Dense, Dropout, default_options, Sequential
|
||||
from cntk.initializer import normal
|
||||
|
||||
|
@ -41,15 +42,15 @@ def create_image_mb_source(map_file, is_training, total_number_of_samples):
|
|||
transforms = []
|
||||
if is_training:
|
||||
transforms += [
|
||||
ImageDeserializer.crop(crop_type='randomside', side_ratio=0.88671875, jitter_type='uniratio') # train uses jitter
|
||||
xforms.crop(crop_type='randomside', side_ratio=0.88671875, jitter_type='uniratio') # train uses jitter
|
||||
]
|
||||
else:
|
||||
else:
|
||||
transforms += [
|
||||
ImageDeserializer.crop(crop_type='center', side_ratio=0.88671875) # test has no jitter
|
||||
xforms.crop(crop_type='center', side_ratio=0.88671875) # test has no jitter
|
||||
]
|
||||
|
||||
transforms += [
|
||||
ImageDeserializer.scale(width=image_width, height=image_height, channels=num_channels, interpolations='linear'),
|
||||
xforms.scale(width=image_width, height=image_height, channels=num_channels, interpolations='linear'),
|
||||
]
|
||||
|
||||
# deserializer
|
||||
|
@ -57,27 +58,27 @@ def create_image_mb_source(map_file, is_training, total_number_of_samples):
|
|||
ImageDeserializer(map_file, StreamDefs(
|
||||
features = StreamDef(field='image', transforms=transforms), # first column in map file is referred to as 'image'
|
||||
labels = StreamDef(field='label', shape=num_classes))), # and second as 'label'
|
||||
randomize = is_training,
|
||||
randomize = is_training,
|
||||
epoch_size=total_number_of_samples,
|
||||
multithreaded_deserializer = True)
|
||||
|
||||
# Local Response Normalization layer. See Section 3.3 of the paper:
|
||||
# https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf
|
||||
# The mathematical equation is:
|
||||
# Local Response Normalization layer. See Section 3.3 of the paper:
|
||||
# https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf
|
||||
# The mathematical equation is:
|
||||
# b_{x,y}^i=a_{x,y}^i/(k+\alpha\sum_{j=max(0,i-n)}^{min(N-1, i+n)}(a_{x,y}^j)^2)^\beta
|
||||
# where a_{x,y}^i is the activity of a neuron computed by applying kernel i at position (x,y)
|
||||
# N is the total number of kernels, n is half normalization width.
|
||||
def LocalResponseNormalization(k, n, alpha, beta, name=''):
|
||||
x = cntk.blocks.Placeholder(name='lrn_arg')
|
||||
x2 = cntk.ops.square(x)
|
||||
# reshape to insert a fake singleton reduction dimension after the 3rd axis (channel axis). Note Python axis order and BrainScript are reversed.
|
||||
# N is the total number of kernels, n is half normalization width.
|
||||
def LocalResponseNormalization(k, n, alpha, beta, name=''):
|
||||
x = cntk.blocks.Placeholder(name='lrn_arg')
|
||||
x2 = cntk.ops.square(x)
|
||||
# reshape to insert a fake singleton reduction dimension after the 3rd axis (channel axis). Note Python axis order and BrainScript are reversed.
|
||||
x2s = cntk.ops.reshape(x2, (1, cntk.InferredDimension), 0, 1)
|
||||
W = cntk.ops.constant(alpha/(2*n+1), (1,2*n+1,1,1), name='W')
|
||||
# 3D convolution with a filter that has a non 1-size only in the 3rd axis, and does not reduce since the reduction dimension is fake and 1
|
||||
y = cntk.ops.convolution (W, x2s)
|
||||
# reshape back to remove the fake singleton reduction dimension
|
||||
b = cntk.ops.reshape(y, cntk.InferredDimension, 0, 2)
|
||||
den = cntk.ops.exp(beta * cntk.ops.log(k + b))
|
||||
den = cntk.ops.exp(beta * cntk.ops.log(k + b))
|
||||
apply_x = cntk.ops.element_divide(x, den)
|
||||
return cntk.blocks.Block(apply_x, 'LocalResponseNormalization', name, make_block=True)
|
||||
|
||||
|
@ -89,35 +90,35 @@ def create_alexnet():
|
|||
label_var = input_variable((num_classes))
|
||||
|
||||
# apply model to input
|
||||
# remove mean value
|
||||
# remove mean value
|
||||
input = minus(feature_var, constant(114), name='mean_removed_input')
|
||||
|
||||
|
||||
with default_options(activation=None, pad=True, bias=True):
|
||||
z = Sequential([
|
||||
# we separate Convolution and ReLU to name the output for feature extraction (usually before ReLU)
|
||||
Convolution2D((11,11), 96, init=normal(0.01), pad=False, strides=(4,4), name='conv1'),
|
||||
Activation(activation=relu, name='relu1'),
|
||||
# we separate Convolution and ReLU to name the output for feature extraction (usually before ReLU)
|
||||
Convolution2D((11,11), 96, init=normal(0.01), pad=False, strides=(4,4), name='conv1'),
|
||||
Activation(activation=relu, name='relu1'),
|
||||
LocalResponseNormalization(1.0, 2, 0.0001, 0.75, name='norm1'),
|
||||
MaxPooling((3,3), (2,2), name='pool1'),
|
||||
|
||||
Convolution2D((5,5), 192, init=normal(0.01), init_bias=0.1, name='conv2'),
|
||||
Activation(activation=relu, name='relu2'),
|
||||
|
||||
Convolution2D((5,5), 192, init=normal(0.01), init_bias=0.1, name='conv2'),
|
||||
Activation(activation=relu, name='relu2'),
|
||||
LocalResponseNormalization(1.0, 2, 0.0001, 0.75, name='norm2'),
|
||||
MaxPooling((3,3), (2,2), name='pool2'),
|
||||
|
||||
Convolution2D((3,3), 384, init=normal(0.01), name='conv3'),
|
||||
Activation(activation=relu, name='relu3'),
|
||||
Convolution2D((3,3), 384, init=normal(0.01), init_bias=0.1, name='conv4'),
|
||||
Activation(activation=relu, name='relu4'),
|
||||
Convolution2D((3,3), 256, init=normal(0.01), init_bias=0.1, name='conv5'),
|
||||
Activation(activation=relu, name='relu5'),
|
||||
MaxPooling((3,3), (2,2), name='pool5'),
|
||||
|
||||
Dense(4096, init=normal(0.005), init_bias=0.1, name='fc6'),
|
||||
Activation(activation=relu, name='relu6'),
|
||||
Dropout(0.5, name='drop6'),
|
||||
Dense(4096, init=normal(0.005), init_bias=0.1, name='fc7'),
|
||||
Activation(activation=relu, name='relu7'),
|
||||
|
||||
Convolution2D((3,3), 384, init=normal(0.01), name='conv3'),
|
||||
Activation(activation=relu, name='relu3'),
|
||||
Convolution2D((3,3), 384, init=normal(0.01), init_bias=0.1, name='conv4'),
|
||||
Activation(activation=relu, name='relu4'),
|
||||
Convolution2D((3,3), 256, init=normal(0.01), init_bias=0.1, name='conv5'),
|
||||
Activation(activation=relu, name='relu5'),
|
||||
MaxPooling((3,3), (2,2), name='pool5'),
|
||||
|
||||
Dense(4096, init=normal(0.005), init_bias=0.1, name='fc6'),
|
||||
Activation(activation=relu, name='relu6'),
|
||||
Dropout(0.5, name='drop6'),
|
||||
Dense(4096, init=normal(0.005), init_bias=0.1, name='fc7'),
|
||||
Activation(activation=relu, name='relu7'),
|
||||
Dropout(0.5, name='drop7'),
|
||||
Dense(num_classes, init=normal(0.01), name='fc8')
|
||||
])(input)
|
||||
|
@ -134,7 +135,7 @@ def create_alexnet():
|
|||
'label': label_var,
|
||||
'ce' : ce,
|
||||
'pe' : pe,
|
||||
'pe5': pe5,
|
||||
'pe5': pe5,
|
||||
'output': z
|
||||
}
|
||||
|
||||
|
@ -145,10 +146,10 @@ def create_trainer(network, epoch_size, num_quantization_bits):
|
|||
lr_schedule = cntk.learning_rate_schedule(lr_per_mb, unit=cntk.learner.UnitType.minibatch, epoch_size=epoch_size)
|
||||
mm_schedule = cntk.learner.momentum_schedule(0.9)
|
||||
l2_reg_weight = 0.0005 # CNTK L2 regularization is per sample, thus same as Caffe
|
||||
|
||||
|
||||
# Create learner
|
||||
local_learner = cntk.learner.momentum_sgd(network['output'].parameters, lr_schedule, mm_schedule, unit_gain=False, l2_regularization_weight=l2_reg_weight)
|
||||
# Since we reuse parameter settings (learning rate, momentum) from Caffe, we set unit_gain to False to ensure consistency
|
||||
# Since we reuse parameter settings (learning rate, momentum) from Caffe, we set unit_gain to False to ensure consistency
|
||||
parameter_learner = data_parallel_distributed_learner(
|
||||
local_learner,
|
||||
num_quantization_bits=num_quantization_bits,
|
||||
|
@ -167,25 +168,25 @@ def train_and_test(network, trainer, train_source, test_source, progress_printer
|
|||
}
|
||||
|
||||
training_session = cntk.training_session(
|
||||
training_minibatch_source = train_source,
|
||||
training_minibatch_source = train_source,
|
||||
trainer = trainer,
|
||||
model_inputs_to_mb_source_mapping = input_map,
|
||||
mb_size_schedule = cntk.minibatch_size_schedule(minibatch_size),
|
||||
progress_printer = progress_printer,
|
||||
model_inputs_to_mb_source_mapping = input_map,
|
||||
mb_size_schedule = cntk.minibatch_size_schedule(minibatch_size),
|
||||
progress_printer = progress_printer,
|
||||
# checkpoint_frequency = epoch_size,
|
||||
checkpoint_filename = os.path.join(model_path, model_name),
|
||||
checkpoint_filename = os.path.join(model_path, model_name),
|
||||
# save_all_checkpoints = True,
|
||||
progress_frequency = epoch_size,
|
||||
cv_source = test_source,
|
||||
progress_frequency = epoch_size,
|
||||
cv_source = test_source,
|
||||
cv_mb_size_schedule = cntk.minibatch_size_schedule(minibatch_size),
|
||||
# cv_frequency = epoch_size,
|
||||
restore = restore)
|
||||
|
||||
# Train all minibatches
|
||||
# Train all minibatches
|
||||
training_session.train()
|
||||
|
||||
# Train and evaluate the network.
|
||||
def alexnet_train_and_eval(train_data, test_data, num_quantization_bits=32, minibatch_size=256, epoch_size = 1281167, max_epochs=112,
|
||||
def alexnet_train_and_eval(train_data, test_data, num_quantization_bits=32, minibatch_size=256, epoch_size = 1281167, max_epochs=112,
|
||||
restore=True, log_to_file=None, num_mbs_per_log=None, gen_heartbeat=True):
|
||||
_cntk_py.set_computation_network_trace_level(0)
|
||||
|
||||
|
@ -202,10 +203,10 @@ def alexnet_train_and_eval(train_data, test_data, num_quantization_bits=32, mini
|
|||
train_source = create_image_mb_source(train_data, True, total_number_of_samples=max_epochs * epoch_size)
|
||||
test_source = create_image_mb_source(test_data, False, total_number_of_samples=FULL_DATA_SWEEP)
|
||||
train_and_test(network, trainer, train_source, test_source, progress_printer, minibatch_size, epoch_size, restore)
|
||||
|
||||
|
||||
|
||||
if __name__=='__main__':
|
||||
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
parser.add_argument('-datadir', '--datadir', help='Data directory where the ImageNet dataset is located', required=False, default=data_path)
|
||||
|
@ -233,8 +234,8 @@ if __name__=='__main__':
|
|||
test_data=os.path.join(data_path, 'val_map.txt')
|
||||
|
||||
try:
|
||||
alexnet_train_and_eval(train_data, test_data,
|
||||
minibatch_size=args['minibatch_size'],
|
||||
alexnet_train_and_eval(train_data, test_data,
|
||||
minibatch_size=args['minibatch_size'],
|
||||
epoch_size=args['epoch_size'],
|
||||
num_quantization_bits=args['quantized_bits'],
|
||||
max_epochs=args['num_epochs'],
|
||||
|
@ -243,4 +244,4 @@ if __name__=='__main__':
|
|||
num_mbs_per_log=200,
|
||||
gen_heartbeat=True)
|
||||
finally:
|
||||
cntk.distributed.Communicator.finalize()
|
||||
cntk.distributed.Communicator.finalize()
|
||||
|
|
|
@ -32,7 +32,7 @@ TrainConvNet = {
|
|||
x2s = SplitDimension(x2, 3, 1)
|
||||
# 3D convolution with a filter that has a non 1-size only in the 3rd axis, and does not reduce since the reduction dimension is fake and 1
|
||||
W = ParameterTensor{(1:1:2*n+1:1), learningRateMultiplier = 0, initValue = alpha/(2*n+1)}
|
||||
y = Convolution (W, x2s, (1:1:2*n+1), mapDims = 1, stride = 1, sharing = true, autoPadding = true, lowerPad = 0, upperPad = 0, transpose = false, maxTempMemSizeInSamples = 0)
|
||||
y = Convolution (W, x2s, (1:1:2*n+1), mapDims = 1, stride = 1, sharing = true, autoPadding = true, lowerPad = 0, upperPad = 0, maxTempMemSizeInSamples = 0)
|
||||
# reshape back to remove the fake singleton reduction dimension
|
||||
b = FlattenDimensions(y, 3, 2)
|
||||
den = Exp (beta .* Log(k + b))
|
||||
|
|
|
@ -10,8 +10,9 @@ import math
|
|||
import numpy as np
|
||||
import cntk
|
||||
import _cntk_py
|
||||
import cntk.io.transforms as xforms
|
||||
|
||||
# Paths relative to current python file.
|
||||
# Paths relative to current python file.
|
||||
abs_path = os.path.dirname(os.path.abspath(__file__))
|
||||
data_path = os.path.join(abs_path, "..", "..", "..", "DataSets", "CIFAR-10")
|
||||
model_path = os.path.join(abs_path, "Models")
|
||||
|
@ -32,11 +33,11 @@ def create_reader(map_file, mean_file, is_training):
|
|||
transforms = []
|
||||
if is_training:
|
||||
transforms += [
|
||||
cntk.io.ImageDeserializer.crop(crop_type='RandomSide', side_ratio=0.8, jitter_type='uniRatio') # train uses jitter
|
||||
xforms.crop(crop_type='RandomSide', side_ratio=0.8, jitter_type='uniRatio') # train uses jitter
|
||||
]
|
||||
transforms += [
|
||||
cntk.io.ImageDeserializer.scale(width=image_width, height=image_height, channels=num_channels, interpolations='linear'),
|
||||
cntk.io.ImageDeserializer.mean(mean_file)
|
||||
xforms.scale(width=image_width, height=image_height, channels=num_channels, interpolations='linear'),
|
||||
xforms.mean(mean_file)
|
||||
]
|
||||
# deserializer
|
||||
return cntk.io.MinibatchSource(cntk.io.ImageDeserializer(map_file, cntk.io.StreamDefs(
|
||||
|
@ -44,23 +45,23 @@ def create_reader(map_file, mean_file, is_training):
|
|||
labels = cntk.io.StreamDef(field='label', shape=num_classes))), # and second as 'label'
|
||||
randomize=is_training)
|
||||
|
||||
# Local Response Normalization layer. See Section 3.3 of the paper:
|
||||
# https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf
|
||||
# The mathematical equation is:
|
||||
# Local Response Normalization layer. See Section 3.3 of the paper:
|
||||
# https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf
|
||||
# The mathematical equation is:
|
||||
# b_{x,y}^i=a_{x,y}^i/(k+\alpha\sum_{j=max(0,i-n)}^{min(N-1, i+n)}(a_{x,y}^j)^2)^\beta
|
||||
# where a_{x,y}^i is the activity of a neuron computed by applying kernel i at position (x,y)
|
||||
# N is the total number of kernels, n is half normalization width.
|
||||
def LocalResponseNormalization(k, n, alpha, beta, name=''):
|
||||
x = cntk.blocks.Placeholder(name='lrn_arg')
|
||||
x2 = cntk.ops.square(x)
|
||||
# reshape to insert a fake singleton reduction dimension after the 3rd axis (channel axis). Note Python axis order and BrainScript are reversed.
|
||||
# N is the total number of kernels, n is half normalization width.
|
||||
def LocalResponseNormalization(k, n, alpha, beta, name=''):
|
||||
x = cntk.blocks.Placeholder(name='lrn_arg')
|
||||
x2 = cntk.ops.square(x)
|
||||
# reshape to insert a fake singleton reduction dimension after the 3rd axis (channel axis). Note Python axis order and BrainScript are reversed.
|
||||
x2s = cntk.ops.reshape(x2, (1, cntk.InferredDimension), 0, 1)
|
||||
W = cntk.ops.constant(alpha/(2*n+1), (1,2*n+1,1,1), name='W')
|
||||
# 3D convolution with a filter that has a non 1-size only in the 3rd axis, and does not reduce since the reduction dimension is fake and 1
|
||||
y = cntk.ops.convolution (W, x2s)
|
||||
# reshape back to remove the fake singleton reduction dimension
|
||||
b = cntk.ops.reshape(y, cntk.InferredDimension, 0, 2)
|
||||
den = cntk.ops.exp(beta * cntk.ops.log(k + b))
|
||||
den = cntk.ops.exp(beta * cntk.ops.log(k + b))
|
||||
apply_x = cntk.ops.element_divide(x, den)
|
||||
return cntk.blocks.Block(apply_x, 'LocalResponseNormalization', name, make_block=True)
|
||||
|
||||
|
@ -75,18 +76,18 @@ def convnetlrn_cifar10_dataaug(reader_train, reader_test, epoch_size=50000, max_
|
|||
# apply model to input
|
||||
scaled_input = cntk.ops.element_times(cntk.ops.constant(0.00390625), input_var)
|
||||
|
||||
with cntk.layers.default_options (activation=cntk.ops.relu, pad=True):
|
||||
with cntk.layers.default_options (activation=cntk.ops.relu, pad=True):
|
||||
z = cntk.models.Sequential([
|
||||
cntk.models.For(range(2), lambda : [
|
||||
cntk.layers.Convolution2D((3,3), 64),
|
||||
cntk.layers.Convolution2D((3,3), 64),
|
||||
cntk.layers.Convolution2D((3,3), 64),
|
||||
cntk.layers.Convolution2D((3,3), 64),
|
||||
LocalResponseNormalization (1.0, 4, 0.001, 0.75),
|
||||
cntk.layers.MaxPooling((3,3), (2,2))
|
||||
]),
|
||||
]),
|
||||
cntk.models.For(range(2), lambda i: [
|
||||
cntk.layers.Dense([256,128][i]),
|
||||
cntk.layers.Dense([256,128][i]),
|
||||
cntk.layers.Dropout(0.5)
|
||||
]),
|
||||
]),
|
||||
cntk.layers.Dense(num_classes, activation=None)
|
||||
])(scaled_input)
|
||||
|
||||
|
@ -103,7 +104,7 @@ def convnetlrn_cifar10_dataaug(reader_train, reader_test, epoch_size=50000, max_
|
|||
mm_time_constant = [0]*20 + [600]*20 + [1200]
|
||||
mm_schedule = cntk.learner.momentum_as_time_constant_schedule(mm_time_constant, epoch_size=epoch_size)
|
||||
l2_reg_weight = 0.002
|
||||
|
||||
|
||||
# trainer object
|
||||
learner = cntk.learner.momentum_sgd(z.parameters, lr_schedule, mm_schedule,
|
||||
unit_gain = True,
|
||||
|
@ -117,7 +118,7 @@ def convnetlrn_cifar10_dataaug(reader_train, reader_test, epoch_size=50000, max_
|
|||
}
|
||||
|
||||
cntk.utils.log_number_of_parameters(z) ; print()
|
||||
progress_printer = cntk.utils.ProgressPrinter(tag='Training')
|
||||
progress_printer = cntk.utils.ProgressPrinter(tag='Training', num_epochs=max_epochs)
|
||||
|
||||
# perform model training
|
||||
for epoch in range(max_epochs): # loop over epochs
|
||||
|
@ -130,7 +131,7 @@ def convnetlrn_cifar10_dataaug(reader_train, reader_test, epoch_size=50000, max_
|
|||
|
||||
progress_printer.epoch_summary(with_metric=True)
|
||||
z.save(os.path.join(model_path, "ConvNet_CIFAR10_DataAug_{}.dnn".format(epoch)))
|
||||
|
||||
|
||||
### Evaluation action
|
||||
epoch_size = 10000
|
||||
minibatch_size = 16
|
||||
|
|
|
@ -84,10 +84,10 @@ def convnet_cifar10(debug_output=False):
|
|||
}
|
||||
|
||||
cntk.utils.log_number_of_parameters(z) ; print()
|
||||
progress_printer = cntk.utils.ProgressPrinter(tag='Training')
|
||||
max_epochs = 30
|
||||
progress_printer = cntk.utils.ProgressPrinter(tag='Training', num_epochs=max_epochs)
|
||||
|
||||
# Get minibatches of images to train with and perform model training
|
||||
max_epochs = 30
|
||||
for epoch in range(max_epochs): # loop over epochs
|
||||
sample_count = 0
|
||||
while sample_count < epoch_size: # loop over minibatches in the epoch
|
||||
|
|
|
@ -8,6 +8,9 @@ from __future__ import print_function
|
|||
import os
|
||||
import math
|
||||
import numpy as np
|
||||
import cntk
|
||||
import _cntk_py
|
||||
import cntk.io.transforms as xforms
|
||||
|
||||
from cntk.layers import Convolution2D, MaxPooling, AveragePooling, Dropout, BatchNormalization, Dense, default_options, Placeholder, identity, Sequential, For
|
||||
from cntk.layers.typing import *
|
||||
|
@ -47,11 +50,11 @@ def create_reader(map_file, mean_file, is_training):
|
|||
transforms = []
|
||||
if is_training:
|
||||
transforms += [
|
||||
ImageDeserializer.crop(crop_type='randomside', side_ratio=0.8, jitter_type='uniratio') # train uses jitter
|
||||
xforms.crop(crop_type='randomside', side_ratio=0.8, jitter_type='uniratio') # train uses jitter
|
||||
]
|
||||
transforms += [
|
||||
ImageDeserializer.scale(width=image_width, height=image_height, channels=num_channels, interpolations='linear'),
|
||||
ImageDeserializer.mean(mean_file)
|
||||
xforms.scale(width=image_width, height=image_height, channels=num_channels, interpolations='linear'),
|
||||
xforms.mean(mean_file)
|
||||
]
|
||||
# deserializer
|
||||
return MinibatchSource(ImageDeserializer(map_file, StreamDefs(
|
||||
|
@ -142,6 +145,10 @@ def train_and_evaluate(reader, reader_test, model, epoch_size=50000, max_epochs=
|
|||
# TODO: we should be done here
|
||||
#return metric_numer/metric_denom
|
||||
|
||||
progress_printer.epoch_summary(with_metric=True)
|
||||
z.save(os.path.join(model_path, "ConvNet_CIFAR10_DataAug_{}.dnn".format(epoch)))
|
||||
|
||||
### Evaluation action
|
||||
|
||||
# evaluate with current Trainer instance; just to make sure we save and load the model correctly and BN works now --TODO: delete once confirmed
|
||||
epoch_size = 10000
|
||||
|
|
|
@ -11,6 +11,7 @@ import argparse
|
|||
import numpy as np
|
||||
import cntk
|
||||
import _cntk_py
|
||||
import cntk.io.transforms as xforms
|
||||
|
||||
# default Paths relative to current python file.
|
||||
abs_path = os.path.dirname(os.path.abspath(__file__))
|
||||
|
@ -32,12 +33,12 @@ def create_image_mb_source(map_file, mean_file, train, total_number_of_samples):
|
|||
transforms = []
|
||||
if train:
|
||||
transforms += [
|
||||
cntk.io.ImageDeserializer.crop(crop_type='randomside', side_ratio=0.8, jitter_type='uniratio') # train uses jitter
|
||||
xforms.crop(crop_type='randomside', side_ratio=0.8, jitter_type='uniratio') # train uses jitter
|
||||
]
|
||||
|
||||
transforms += [
|
||||
cntk.io.ImageDeserializer.scale(width=image_width, height=image_height, channels=num_channels, interpolations='linear'),
|
||||
cntk.io.ImageDeserializer.mean(mean_file)
|
||||
xforms.scale(width=image_width, height=image_height, channels=num_channels, interpolations='linear'),
|
||||
xforms.mean(mean_file)
|
||||
]
|
||||
|
||||
# deserializer
|
||||
|
@ -45,7 +46,7 @@ def create_image_mb_source(map_file, mean_file, train, total_number_of_samples):
|
|||
cntk.io.ImageDeserializer(map_file, cntk.io.StreamDefs(
|
||||
features = cntk.io.StreamDef(field='image', transforms=transforms), # first column in map file is referred to as 'image'
|
||||
labels = cntk.io.StreamDef(field='label', shape=num_classes))), # and second as 'label'
|
||||
randomize=train,
|
||||
randomize=train,
|
||||
epoch_size=total_number_of_samples,
|
||||
multithreaded_deserializer = True)
|
||||
|
||||
|
@ -58,18 +59,18 @@ def create_conv_network():
|
|||
|
||||
# apply model to input
|
||||
scaled_input = cntk.ops.element_times(cntk.ops.constant(0.00390625), feature_var)
|
||||
|
||||
|
||||
with cntk.layers.default_options(activation=cntk.ops.relu, pad=True):
|
||||
z = cntk.models.Sequential([
|
||||
cntk.models.For(range(2), lambda : [
|
||||
cntk.layers.Convolution2D((3,3), 64),
|
||||
cntk.layers.Convolution2D((3,3), 64),
|
||||
cntk.layers.MaxPooling((3,3), (2,2))
|
||||
]),
|
||||
]),
|
||||
cntk.models.For(range(2), lambda i: [
|
||||
cntk.layers.Dense([256,128][i]),
|
||||
cntk.layers.Dense([256,128][i]),
|
||||
cntk.layers.Dropout(0.5)
|
||||
]),
|
||||
]),
|
||||
cntk.layers.Dense(num_classes, activation=None)
|
||||
])(scaled_input)
|
||||
|
||||
|
@ -96,13 +97,13 @@ def create_trainer(network, epoch_size, num_quantization_bits, block_size, warm_
|
|||
mm_time_constant = [0]*20 + [600]*20 + [1200]
|
||||
mm_schedule = cntk.learner.momentum_as_time_constant_schedule(mm_time_constant, epoch_size=epoch_size)
|
||||
l2_reg_weight = 0.002
|
||||
|
||||
|
||||
# Create learner
|
||||
if block_size != None and num_quantization_bits != 32:
|
||||
raise RuntimeError("Block momentum cannot be used with quantization, please remove quantized_bits option.")
|
||||
|
||||
local_learner = cntk.learner.momentum_sgd(network['output'].parameters,
|
||||
lr_schedule, mm_schedule,
|
||||
local_learner = cntk.learner.momentum_sgd(network['output'].parameters,
|
||||
lr_schedule, mm_schedule,
|
||||
l2_regularization_weight=l2_reg_weight)
|
||||
|
||||
if block_size != None:
|
||||
|
@ -125,12 +126,12 @@ def train_and_test(network, trainer, train_source, test_source, progress_printer
|
|||
training_session = cntk.training_session(
|
||||
training_minibatch_source = train_source,
|
||||
trainer = trainer,
|
||||
model_inputs_to_mb_source_mapping = input_map,
|
||||
model_inputs_to_mb_source_mapping = input_map,
|
||||
mb_size_schedule = cntk.minibatch_size_schedule(minibatch_size),
|
||||
progress_printer = progress_printer,
|
||||
checkpoint_frequency = epoch_size,
|
||||
progress_printer = progress_printer,
|
||||
checkpoint_frequency = epoch_size,
|
||||
checkpoint_filename = os.path.join(model_path, "ConvNet_CIFAR10_DataAug"),
|
||||
# save_all_checkpoints = False,
|
||||
# save_all_checkpoints = False,
|
||||
progress_frequency=epoch_size,
|
||||
cv_source = test_source,
|
||||
cv_mb_size_schedule=cntk.minibatch_size_schedule(minibatch_size),
|
||||
|
@ -147,8 +148,8 @@ def train_and_test(network, trainer, train_source, test_source, progress_printer
|
|||
cntk.stop_profiler()
|
||||
|
||||
# Train and evaluate the network.
|
||||
def convnet_cifar10_dataaug(train_data, test_data, mean_data, minibatch_size=64, epoch_size=50000, num_quantization_bits=32,
|
||||
block_size=3200, warm_up=0, max_epochs=2, restore=False, log_to_file=None,
|
||||
def convnet_cifar10_dataaug(train_data, test_data, mean_data, minibatch_size=64, epoch_size=50000, num_quantization_bits=32,
|
||||
block_size=3200, warm_up=0, max_epochs=2, restore=False, log_to_file=None,
|
||||
num_mbs_per_log=None, gen_heartbeat=False, profiling=False):
|
||||
_cntk_py.set_computation_network_trace_level(0)
|
||||
|
||||
|
@ -165,10 +166,10 @@ def convnet_cifar10_dataaug(train_data, test_data, mean_data, minibatch_size=64,
|
|||
train_source = create_image_mb_source(train_data, mean_data, train=True, total_number_of_samples=max_epochs * epoch_size)
|
||||
test_source = create_image_mb_source(test_data, mean_data, train=False, total_number_of_samples=cntk.io.FULL_DATA_SWEEP)
|
||||
train_and_test(network, trainer, train_source, test_source, progress_printer, minibatch_size, epoch_size, restore, profiling)
|
||||
|
||||
|
||||
|
||||
if __name__=='__main__':
|
||||
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
data_path = os.path.join(abs_path, "..", "..", "..", "DataSets", "CIFAR-10")
|
||||
|
||||
|
@ -201,8 +202,8 @@ if __name__=='__main__':
|
|||
test_data=os.path.join(data_path, 'test_map.txt')
|
||||
|
||||
try:
|
||||
convnet_cifar10_dataaug(train_data, test_data, mean_data,
|
||||
minibatch_size=args['minibatch_size'],
|
||||
convnet_cifar10_dataaug(train_data, test_data, mean_data,
|
||||
minibatch_size=args['minibatch_size'],
|
||||
epoch_size=args['epoch_size'],
|
||||
num_quantization_bits=args['quantized_bits'],
|
||||
block_size=args['block_samples'],
|
||||
|
|
|
@ -74,10 +74,10 @@ def convnet_mnist(debug_output=False):
|
|||
}
|
||||
|
||||
cntk.utils.log_number_of_parameters(z) ; print()
|
||||
progress_printer = cntk.utils.ProgressPrinter(tag='Training')
|
||||
max_epochs = 40
|
||||
progress_printer = cntk.utils.ProgressPrinter(tag='Training', num_epochs=max_epochs)
|
||||
|
||||
# Get minibatches of images to train with and perform model training
|
||||
max_epochs = 40
|
||||
for epoch in range(max_epochs): # loop over epochs
|
||||
sample_count = 0
|
||||
while sample_count < epoch_size: # loop over minibatches in the epoch
|
||||
|
|
|
@ -13,7 +13,6 @@ modelPath = "$outputDir$/Models/ResNet_101"
|
|||
stderr = "$outputDir$/ResNet_101_BS_out"
|
||||
|
||||
parallelTrain = true
|
||||
hyperCompressMemory = true
|
||||
|
||||
TrainNetwork = {
|
||||
action = "train"
|
||||
|
|
|
@ -13,7 +13,6 @@ modelPath = "$outputDir$/Models/ResNet_152"
|
|||
stderr = "$outputDir$/ResNet_152_BS_out"
|
||||
|
||||
parallelTrain = true
|
||||
hyperCompressMemory = true
|
||||
|
||||
TrainNetwork = {
|
||||
action = "train"
|
||||
|
|
|
@ -13,6 +13,7 @@ import numpy as np
|
|||
from cntk.utils import *
|
||||
from cntk.ops import input_variable, cross_entropy_with_softmax, classification_error
|
||||
from cntk.io import MinibatchSource, ImageDeserializer, StreamDef, StreamDefs
|
||||
import cntk.io.transforms as xforms
|
||||
from cntk import Trainer, cntk_py
|
||||
from cntk.learner import momentum_sgd, learning_rate_schedule, momentum_as_time_constant_schedule, UnitType
|
||||
from _cntk_py import set_computation_network_trace_level
|
||||
|
@ -40,11 +41,11 @@ def create_reader(map_file, mean_file, train):
|
|||
transforms = []
|
||||
if train:
|
||||
transforms += [
|
||||
ImageDeserializer.crop(crop_type='randomside', side_ratio=0.8, jitter_type='uniratio') # train uses jitter
|
||||
xforms.crop(crop_type='randomside', side_ratio=0.8, jitter_type='uniratio') # train uses jitter
|
||||
]
|
||||
transforms += [
|
||||
ImageDeserializer.scale(width=image_width, height=image_height, channels=num_channels, interpolations='linear'),
|
||||
ImageDeserializer.mean(mean_file)
|
||||
xforms.scale(width=image_width, height=image_height, channels=num_channels, interpolations='linear'),
|
||||
xforms.mean(mean_file)
|
||||
]
|
||||
# deserializer
|
||||
return MinibatchSource(ImageDeserializer(map_file, StreamDefs(
|
||||
|
@ -61,21 +62,21 @@ def train_and_evaluate(reader_train, reader_test, network_name, epoch_size, max_
|
|||
input_var = input_variable((num_channels, image_height, image_width))
|
||||
label_var = input_variable((num_classes))
|
||||
|
||||
# create model, and configure learning parameters
|
||||
if network_name == 'resnet20':
|
||||
# create model, and configure learning parameters
|
||||
if network_name == 'resnet20':
|
||||
z = create_cifar10_model(input_var, 3, num_classes)
|
||||
lr_per_mb = [1.0]*80+[0.1]*40+[0.01]
|
||||
elif network_name == 'resnet110':
|
||||
elif network_name == 'resnet110':
|
||||
z = create_cifar10_model(input_var, 18, num_classes)
|
||||
lr_per_mb = [0.1]*1+[1.0]*80+[0.1]*40+[0.01]
|
||||
else:
|
||||
else:
|
||||
return RuntimeError("Unknown model name!")
|
||||
|
||||
# loss and metric
|
||||
ce = cross_entropy_with_softmax(z, label_var)
|
||||
pe = classification_error(z, label_var)
|
||||
|
||||
# shared training parameters
|
||||
# shared training parameters
|
||||
minibatch_size = 128
|
||||
momentum_time_constant = -minibatch_size/np.log(0.9)
|
||||
l2_reg_weight = 0.0001
|
||||
|
@ -84,7 +85,7 @@ def train_and_evaluate(reader_train, reader_test, network_name, epoch_size, max_
|
|||
lr_per_sample = [lr/minibatch_size for lr in lr_per_mb]
|
||||
lr_schedule = learning_rate_schedule(lr_per_sample, epoch_size=epoch_size, unit=UnitType.sample)
|
||||
mm_schedule = momentum_as_time_constant_schedule(momentum_time_constant)
|
||||
|
||||
|
||||
# trainer object
|
||||
learner = momentum_sgd(z.parameters, lr_schedule, mm_schedule,
|
||||
l2_regularization_weight = l2_reg_weight)
|
||||
|
@ -97,13 +98,13 @@ def train_and_evaluate(reader_train, reader_test, network_name, epoch_size, max_
|
|||
}
|
||||
|
||||
log_number_of_parameters(z) ; print()
|
||||
progress_printer = ProgressPrinter(tag='Training')
|
||||
progress_printer = ProgressPrinter(tag='Training', num_epochs=max_epochs)
|
||||
|
||||
# perform model training
|
||||
|
||||
|
||||
if profiler_dir:
|
||||
start_profiler(profiler_dir, True)
|
||||
|
||||
|
||||
for epoch in range(max_epochs): # loop over epochs
|
||||
sample_count = 0
|
||||
while sample_count < epoch_size: # loop over minibatches in the epoch
|
||||
|
@ -114,10 +115,10 @@ def train_and_evaluate(reader_train, reader_test, network_name, epoch_size, max_
|
|||
progress_printer.epoch_summary(with_metric=True)
|
||||
z.save(os.path.join(model_path, network_name + "_{}.dnn".format(epoch)))
|
||||
enable_profiler() # begin to collect profiler data after first epoch
|
||||
|
||||
|
||||
if profiler_dir:
|
||||
stop_profiler()
|
||||
|
||||
|
||||
# Evaluation parameters
|
||||
test_epoch_size = 10000
|
||||
minibatch_size = 16
|
||||
|
@ -154,7 +155,7 @@ if __name__=='__main__':
|
|||
args = vars(parser.parse_args())
|
||||
epochs = int(args['epochs'])
|
||||
network_name = args['network']
|
||||
|
||||
|
||||
reader_train = create_reader(os.path.join(data_path, 'train_map.txt'), os.path.join(data_path, 'CIFAR-10_mean.xml'), True)
|
||||
reader_test = create_reader(os.path.join(data_path, 'test_map.txt'), os.path.join(data_path, 'CIFAR-10_mean.xml'), False)
|
||||
|
||||
|
|
|
@ -26,7 +26,6 @@ ImageC = 3
|
|||
NumLabels = 1000
|
||||
|
||||
parallelTrain = true
|
||||
hyperCompressMemory = true
|
||||
|
||||
################################
|
||||
Train = {
|
||||
|
|
|
@ -26,7 +26,6 @@ ImageC = 3
|
|||
NumLabels = 1000
|
||||
|
||||
parallelTrain = true
|
||||
hyperCompressMemory = true
|
||||
|
||||
################################
|
||||
Train = {
|
||||
|
|
|
@ -32,8 +32,6 @@ num_channels = 3 # RGB
|
|||
num_classes = 1000
|
||||
model_name = "VGG16.model"
|
||||
|
||||
cntk.cntk_py.enable_hyper_memory_compress()
|
||||
|
||||
# Create a minibatch source.
|
||||
def create_image_mb_source(map_file, is_training, total_number_of_samples):
|
||||
if not os.path.exists(map_file):
|
||||
|
|
|
@ -32,8 +32,6 @@ num_channels = 3 # RGB
|
|||
num_classes = 1000
|
||||
model_name = "VGG19.model"
|
||||
|
||||
cntk.cntk_py.enable_hyper_memory_compress()
|
||||
|
||||
# Create a minibatch source.
|
||||
def create_image_mb_source(map_file, is_training, total_number_of_samples):
|
||||
if not os.path.exists(map_file):
|
||||
|
|
|
@ -7,6 +7,9 @@
|
|||
from __future__ import print_function
|
||||
import zipfile
|
||||
import os
|
||||
from sys import platform
|
||||
import shutil
|
||||
|
||||
try:
|
||||
from urllib.request import urlretrieve
|
||||
except ImportError:
|
||||
|
@ -26,6 +29,15 @@ def download_grocery_data():
|
|||
print('Extracting ' + filename + '...')
|
||||
with zipfile.ZipFile(filename) as myzip:
|
||||
myzip.extractall(dataset_folder)
|
||||
if platform != "win32":
|
||||
testfile = os.path.join(dataset_folder, "grocery", "test.txt")
|
||||
unixfile = os.path.join(dataset_folder, "grocery", "test_unix.txt")
|
||||
out = open(unixfile, 'w')
|
||||
with open(testfile) as f:
|
||||
for line in f:
|
||||
out.write(line.replace('\\', '/'))
|
||||
out.close()
|
||||
shutil.move(unixfile, testfile)
|
||||
finally:
|
||||
os.remove(filename)
|
||||
print('Done.')
|
||||
|
@ -34,4 +46,4 @@ def download_grocery_data():
|
|||
|
||||
if __name__ == "__main__":
|
||||
download_grocery_data()
|
||||
|
||||
|
||||
|
|
|
@ -9,18 +9,15 @@ import os
|
|||
import numpy as np
|
||||
from cntk import load_model, graph
|
||||
from cntk.ops import combine
|
||||
from cntk.io import MinibatchSource, ImageDeserializer, StreamDef, StreamDefs
|
||||
from cntk import graph
|
||||
from cntk.graph import get_node_outputs
|
||||
|
||||
import cntk.io.transforms as xforms
|
||||
|
||||
def create_mb_source(image_height, image_width, num_channels, map_file):
|
||||
transforms = [ImageDeserializer.scale(width=image_width, height=image_height, channels=num_channels, interpolations='linear')]
|
||||
image_source = ImageDeserializer(map_file)
|
||||
image_source.ignore_labels()
|
||||
image_source.map_features('features', transforms)
|
||||
|
||||
return MinibatchSource(image_source, randomize=False)
|
||||
transforms = [xforms.scale(width=image_width, height=image_height, channels=num_channels, interpolations='linear')]
|
||||
return MinibatchSource(ImageDeserializer(map_file,
|
||||
StreamDefs(features=StreamDef(field='image', transforms=transforms))), # first column in map file is referred to as 'image'
|
||||
randomize=False) # second column is labels and is ignored
|
||||
|
||||
|
||||
def eval_and_write(model_file, node_name, output_file, minibatch_source, num_objects):
|
||||
|
|
|
@ -12,7 +12,8 @@ from cntk.device import set_default_device, gpu
|
|||
from cntk import load_model, Trainer, UnitType
|
||||
from cntk.layers import Placeholder, Constant
|
||||
from cntk.graph import find_by_name, get_node_outputs
|
||||
from cntk.io import MinibatchSource, ImageDeserializer
|
||||
from cntk.io import MinibatchSource, ImageDeserializer, StreamDefs, StreamDef
|
||||
import cntk.io.transforms as xforms
|
||||
from cntk.layers import Dense
|
||||
from cntk.learner import momentum_sgd, learning_rate_schedule, momentum_schedule
|
||||
from cntk.ops import input_variable, cross_entropy_with_softmax, classification_error, combine, softmax
|
||||
|
@ -58,11 +59,11 @@ _num_classes = 102
|
|||
|
||||
# Creates a minibatch source for training or testing
|
||||
def create_mb_source(map_file, image_width, image_height, num_channels, num_classes, randomize=True):
|
||||
transforms = [ImageDeserializer.scale(width=image_width, height=image_height, channels=num_channels, interpolations='linear')]
|
||||
image_source = ImageDeserializer(map_file)
|
||||
image_source.map_features(features_stream_name, transforms)
|
||||
image_source.map_labels(label_stream_name, num_classes)
|
||||
return MinibatchSource(image_source, randomize=randomize)
|
||||
transforms = [xforms.scale(width=image_width, height=image_height, channels=num_channels, interpolations='linear')]
|
||||
return MinibatchSource(ImageDeserializer(map_file, StreamDefs(
|
||||
features =StreamDef(field='image', transforms=transforms),
|
||||
labels =StreamDef(field='label', shape=num_classes))),
|
||||
randomize=randomize)
|
||||
|
||||
|
||||
# Creates the network model for transfer learning
|
||||
|
|
|
@ -18,6 +18,10 @@ sys.path.append(os.path.join(base_folder, "..", "DataSets", "Animals"))
|
|||
from install_animals import download_animals_data
|
||||
download_animals_data()
|
||||
|
||||
sys.path.append(os.path.join(base_folder, "..", "DataSets", "Grocery"))
|
||||
from install_grocery import download_grocery_data
|
||||
download_grocery_data()
|
||||
|
||||
sys.path.append(os.path.join(base_folder, "..", "PretrainedModels"))
|
||||
from models_util import download_model_by_name
|
||||
download_model_by_name("ResNet_18")
|
||||
|
|
|
@ -19,8 +19,7 @@ from cntk.ops import cross_entropy_with_softmax, classification_error, splice, r
|
|||
# variables and stuff #
|
||||
########################
|
||||
|
||||
cntk_dir = os.path.dirname(os.path.abspath(__file__)) + "/../../../.." # data resides in the CNTK folder
|
||||
data_dir = cntk_dir + "/Examples/LanguageUnderstanding/ATIS/Data" # under Examples/LanguageUnderstanding/ATIS
|
||||
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "Data")
|
||||
vocab_size = 943 ; num_labels = 129 ; num_intents = 26 # number of words in vocab, slot labels, and intent labels
|
||||
|
||||
model_dir = "./Models"
|
||||
|
|
|
@ -139,7 +139,7 @@ def create_inputs(vocab_dim):
|
|||
return input_sequence, label_sequence
|
||||
|
||||
# Creates and trains a character-level language model
|
||||
def train_lm(training_file, max_num_minibatches):
|
||||
def train_lm(training_file, epochs, max_num_minibatches):
|
||||
|
||||
# load the data and vocab
|
||||
data, char_to_ix, ix_to_char, data_size, vocab_dim = load_data_and_vocab(training_file)
|
||||
|
@ -168,46 +168,34 @@ def train_lm(training_file, max_num_minibatches):
|
|||
trainer = Trainer(z, (ce, errs), learner)
|
||||
|
||||
sample_freq = 1000
|
||||
epochs = 50
|
||||
minibatches_per_epoch = int((data_size / minibatch_size))
|
||||
minibatches = min(epochs * minibatches_per_epoch, max_num_minibatches)
|
||||
minibatches_per_epoch = min(data_size // minibatch_size, max_num_minibatches // epochs)
|
||||
|
||||
# print out some useful training information
|
||||
log_number_of_parameters(z) ; print()
|
||||
progress_printer = ProgressPrinter(freq=100, tag='Training')
|
||||
log_number_of_parameters(z)
|
||||
print ("Running %d epochs with %d minibatches per epoch" % (epochs, minibatches_per_epoch))
|
||||
print()
|
||||
|
||||
e = 0
|
||||
p = 0
|
||||
for i in range(0, minibatches):
|
||||
|
||||
if p + minibatch_size+1 >= data_size:
|
||||
p = 0
|
||||
e += 1
|
||||
model_filename = "models/shakespeare_epoch%d.dnn" % e
|
||||
z.save(model_filename)
|
||||
print("Saved model to '%s'" % model_filename)
|
||||
|
||||
# get the data
|
||||
features, labels = get_data(p, minibatch_size, data, char_to_ix, vocab_dim)
|
||||
progress_printer = ProgressPrinter(freq=100, tag='Training')
|
||||
|
||||
for e in range(0, epochs):
|
||||
# Specify the mapping of input variables in the model to actual minibatch data to be trained with
|
||||
# If it's the start of the data, we specify that we are looking at a new sequence (True)
|
||||
mask = [False]
|
||||
if p == 0:
|
||||
mask = [True]
|
||||
arguments = ({input_sequence : features, label_sequence : labels}, mask)
|
||||
trainer.train_minibatch(arguments)
|
||||
mask = [True]
|
||||
for b in range(0, minibatches_per_epoch):
|
||||
# get the data
|
||||
features, labels = get_data(b, minibatch_size, data, char_to_ix, vocab_dim)
|
||||
arguments = ({input_sequence : features, label_sequence : labels}, mask)
|
||||
mask = [False]
|
||||
trainer.train_minibatch(arguments)
|
||||
|
||||
progress_printer.update_with_trainer(trainer, with_metric=True) # log progress
|
||||
|
||||
if i % sample_freq == 0:
|
||||
print(sample(z, ix_to_char, vocab_dim, char_to_ix))
|
||||
progress_printer.update_with_trainer(trainer, with_metric=True) # log progress
|
||||
global_minibatch = e*minibatches_per_epoch + b
|
||||
if global_minibatch % sample_freq == 0:
|
||||
print(sample(z, ix_to_char, vocab_dim, char_to_ix))
|
||||
|
||||
p += minibatch_size
|
||||
|
||||
# Do a final save of the model
|
||||
model_filename = "models/shakespeare_epoch%d.dnn" % e
|
||||
z.save(model_filename)
|
||||
model_filename = "models/shakespeare_epoch%d.dnn" % (e+1)
|
||||
z.save_model(model_filename)
|
||||
print("Saved model to '%s'" % model_filename)
|
||||
|
||||
|
||||
def load_and_sample(model_filename, vocab_filename, prime_text='', use_hardmax=False, length=1000, temperature=1.0):
|
||||
|
@ -223,13 +211,13 @@ def load_and_sample(model_filename, vocab_filename, prime_text='', use_hardmax=F
|
|||
|
||||
return sample(model, ix_to_char, len(chars), char_to_ix, prime_text=prime_text, use_hardmax=use_hardmax, length=length, temperature=temperature)
|
||||
|
||||
def train_and_eval_char_rnn(max_num_minibatches=sys.maxsize):
|
||||
# train the LM
|
||||
train_lm("data/tinyshakespeare.txt", max_num_minibatches)
|
||||
def train_and_eval_char_rnn(epochs=50, max_num_minibatches=sys.maxsize):
|
||||
# train the LM
|
||||
train_lm("data/tinyshakespeare.txt", epochs, max_num_minibatches)
|
||||
|
||||
# load and sample
|
||||
text = "T"
|
||||
return load_and_sample("models/shakespeare_epoch0.dnn", "data/tinyshakespeare.txt.vocab", prime_text=text, use_hardmax=False, length=100, temperature=0.95)
|
||||
return load_and_sample("models/shakespeare_epoch%d.dnn" % (epochs), "data/tinyshakespeare.txt.vocab", prime_text=text, use_hardmax=False, length=100, temperature=0.95)
|
||||
|
||||
if __name__=='__main__':
|
||||
# Specify the target device to be used for computing, if you do not want to
|
||||
|
|
|
@ -23,7 +23,7 @@ from _cntk_py import set_computation_network_trace_level
|
|||
|
||||
# Paths relative to current python file.
|
||||
abs_path = os.path.dirname(os.path.abspath(__file__))
|
||||
data_path = os.path.join(abs_path, "..", "..", "Datasets", "UCF11")
|
||||
data_path = os.path.join(abs_path, "..", "..", "DataSets", "UCF11")
|
||||
model_path = os.path.join(abs_path, "Models")
|
||||
|
||||
# Define the reader for both training and evaluation action.
|
||||
|
@ -194,14 +194,14 @@ def conv3d_ucf11(train_reader, test_reader, max_epochs=30):
|
|||
lr_per_sample = [0.01]*10+[0.001]*10+[0.0001]
|
||||
lr_schedule = learning_rate_schedule(lr_per_sample, epoch_size=epoch_size, unit=UnitType.sample)
|
||||
momentum_time_constant = 4096
|
||||
mm_schedule = momentum_as_time_constant_schedule(momentum_time_constant, epoch_size=epoch_size)
|
||||
mm_schedule = momentum_as_time_constant_schedule([momentum_time_constant], epoch_size=epoch_size)
|
||||
|
||||
# Instantiate the trainer object to drive the model training
|
||||
learner = momentum_sgd(z.parameters, lr_schedule, mm_schedule, True)
|
||||
trainer = Trainer(z, (ce, pe), learner)
|
||||
|
||||
log_number_of_parameters(z) ; print()
|
||||
progress_printer = ProgressPrinter(tag='Training')
|
||||
progress_printer = ProgressPrinter(tag='Training', num_epochs=max_epochs)
|
||||
|
||||
# Get minibatches of images to train with and perform model training
|
||||
for epoch in range(max_epochs): # loop over epochs
|
||||
|
|
40
Makefile
40
Makefile
|
@ -77,7 +77,10 @@ endif
|
|||
|
||||
# The mpic++ wrapper only adds MPI specific flags to the g++ command line.
|
||||
# The actual compiler/linker flags added can be viewed by running 'mpic++ --showme:compile' and 'mpic++ --showme:link'
|
||||
ifneq ($(HAS_MPI),0)
|
||||
CXX = $(MPI_PATH)/bin/mpic++
|
||||
endif
|
||||
|
||||
SSE_FLAGS = -msse4.1 -mssse3
|
||||
|
||||
PROTOC = $(PROTOBUF_PATH)/bin/protoc
|
||||
|
@ -90,8 +93,8 @@ SOURCEDIR:= Source
|
|||
INCLUDEPATH:= $(addprefix $(SOURCEDIR)/, Common/Include CNTKv2LibraryDll CNTKv2LibraryDll/API CNTKv2LibraryDll/proto Math CNTK ActionsLib ComputationNetworkLib SGDLib SequenceTrainingLib CNTK/BrainScript Readers/ReaderLib PerformanceProfilerDll)
|
||||
INCLUDEPATH+=$(PROTOBUF_PATH)/include
|
||||
# COMMON_FLAGS include settings that are passed both to NVCC and C++ compilers.
|
||||
COMMON_FLAGS:= -D_POSIX_SOURCE -D_XOPEN_SOURCE=600 -D__USE_XOPEN2K -std=c++11
|
||||
CPPFLAGS:=
|
||||
COMMON_FLAGS:= -DHAS_MPI=$(HAS_MPI) -D_POSIX_SOURCE -D_XOPEN_SOURCE=600 -D__USE_XOPEN2K -std=c++11
|
||||
CPPFLAGS:=
|
||||
CXXFLAGS:= $(SSE_FLAGS) -std=c++0x -fopenmp -fpermissive -fPIC -Werror -fcheck-new
|
||||
LIBPATH:=
|
||||
LIBS_LIST:=
|
||||
|
@ -270,7 +273,7 @@ RPATH=-Wl,-rpath,
|
|||
# Build info
|
||||
########################################
|
||||
|
||||
BUILDINFO:= $(SOURCEDIR)/CNTK/buildinfo.h
|
||||
BUILDINFO:= $(SOURCEDIR)/CNTKv2LibraryDll/buildinfo.h
|
||||
GENBUILD:=Tools/generate_build_info
|
||||
|
||||
BUILDINFO_OUTPUT := $(shell $(GENBUILD) $(BUILD_TOP)/Config.make && echo Success)
|
||||
|
@ -579,9 +582,16 @@ $(EVAL_EXTENDED_CLIENT): $(EVAL_EXTENDED_CLIENT_OBJ) | $(EVAL_LIB) $(READER_LIBS
|
|||
########################################
|
||||
CNTKLIBRARY_CPP_EVAL_EXAMPLES:=$(BINDIR)/CNTKLibraryCPPEvalExamples
|
||||
|
||||
#ifdef CUDA_PATH
|
||||
CNTKLIBRARY_CPP_EVAL_EXAMPLES_SRC=\
|
||||
$(SOURCEDIR)/../Examples/Evaluation/CNTKLibraryCPPEvalExamples/CNTKLibraryCPPEvalExamples.cpp \
|
||||
$(SOURCEDIR)/../Examples/Evaluation/CNTKLibraryCPPEvalExamples/EvalMultithreads.cpp
|
||||
$(SOURCEDIR)/../Examples/Evaluation/CNTKLibraryCPPEvalGPUExamples/CNTKLibraryCPPEvalGPUExamples.cpp\
|
||||
$(SOURCEDIR)/../Examples/Evaluation/CNTKLibraryCPPEvalCPUOnlyExamples/EvalMultithreads.cpp
|
||||
|
||||
#else
|
||||
CNTKLIBRARY_CPP_EVAL_EXAMPLES_SRC=\
|
||||
$(SOURCEDIR)/../Examples/Evaluation/CNTKLibraryCPPEvalCPUOnlyExamples/CNTKLibraryCPPEvalCPUOnlyExamples.cpp\
|
||||
$(SOURCEDIR)/../Examples/Evaluation/CNTKLibraryCPPEvalCPUOnlyExamples/EvalMultithreads.cpp
|
||||
#endif
|
||||
|
||||
CNTKLIBRARY_CPP_EVAL_EXAMPLES_OBJ:=$(patsubst %.cpp, $(OBJDIR)/%.o, $(CNTKLIBRARY_CPP_EVAL_EXAMPLES_SRC))
|
||||
|
||||
|
@ -594,6 +604,26 @@ $(CNTKLIBRARY_CPP_EVAL_EXAMPLES): $(CNTKLIBRARY_CPP_EVAL_EXAMPLES_OBJ) | $(CNTKL
|
|||
@echo building $(CNTKLIBRARY_CPP_EVAL_EXAMPLES) for $(ARCH) with build type $(BUILDTYPE)
|
||||
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKLIBRARY) $(L_READER_LIBS)
|
||||
|
||||
########################################
|
||||
# Eval V2 Sample test
|
||||
########################################
|
||||
CNTKLIBRARY_CPP_EVAL_TEST:=$(BINDIR)/CNTKLibraryCPPEvalExamplesTest
|
||||
|
||||
CNTKLIBRARY_CPP_EVAL_TEST_SRC=\
|
||||
$(SOURCEDIR)/../Tests/EndToEndTests/EvalClientTests/CNTKLibraryCPPEvalExamplesTest/CNTKLibraryCPPEvalExamplesTest.cpp\
|
||||
$(SOURCEDIR)/../Examples/Evaluation/CNTKLibraryCPPEvalCPUOnlyExamples/EvalMultithreads.cpp\
|
||||
$(SOURCEDIR)/../Tests/EndToEndTests/CNTKv2Library/Common/Common.cpp
|
||||
|
||||
CNTKLIBRARY_CPP_EVAL_TEST_OBJ:=$(patsubst %.cpp, $(OBJDIR)/%.o, $(CNTKLIBRARY_CPP_EVAL_TEST_SRC))
|
||||
|
||||
ALL+=$(CNTKLIBRARY_CPP_EVAL_TEST)
|
||||
SRC+=$(CNTKLIBRARY_CPP_EVAL_TEST_SRC)
|
||||
|
||||
$(CNTKLIBRARY_CPP_EVAL_TEST): $(CNTKLIBRARY_CPP_EVAL_TEST_OBJ) | $(CNTKLIBRARY_LIB) $(READER_LIBS)
|
||||
@mkdir -p $(dir $@)
|
||||
@echo building $(CNTKLIBRARY_CPP_EVAL_TEST) for $(ARCH) with build type $(BUILDTYPE)
|
||||
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKLIBRARY) $(L_READER_LIBS)
|
||||
|
||||
########################################
|
||||
# HTKMLFReader plugin
|
||||
########################################
|
||||
|
|
|
@ -256,9 +256,10 @@ void DoWriteOutput(const ConfigParameters& config)
|
|||
else if (config.Exists("outputPath"))
|
||||
{
|
||||
wstring outputPath = config(L"outputPath");
|
||||
bool writeSequenceKey = config(L"writeSequenceKey", false);
|
||||
WriteFormattingOptions formattingOptions(config);
|
||||
bool nodeUnitTest = config(L"nodeUnitTest", "false");
|
||||
writer.WriteOutput(testDataReader, mbSize[0], outputPath, outputNodeNamesVector, formattingOptions, epochSize, nodeUnitTest);
|
||||
writer.WriteOutput(testDataReader, mbSize[0], outputPath, outputNodeNamesVector, formattingOptions, epochSize, nodeUnitTest, writeSequenceKey);
|
||||
}
|
||||
else
|
||||
InvalidArgument("write command: You must specify either 'writer'or 'outputPath'");
|
||||
|
|
|
@ -164,12 +164,15 @@ bool CheckFunction(std::string& p_nodeType, bool* allowUndeterminedVariable)
|
|||
else if (EqualInsensitive(nodeType, OperationNameOf(EqualNode))) ret = true;
|
||||
else if (EqualInsensitive(nodeType, OperationNameOf(GreaterEqualNode))) ret = true;
|
||||
else if (EqualInsensitive(nodeType, OperationNameOf(GreaterNode))) ret = true;
|
||||
else if (EqualInsensitive(nodeType, OperationNameOf(ForwardBackwardNode))) ret = true;
|
||||
else if (EqualInsensitive(nodeType, OperationNameOf(LabelsToGraphNode))) ret = true;
|
||||
else if (EqualInsensitive(nodeType, OperationNameOf(LessEqualNode))) ret = true;
|
||||
else if (EqualInsensitive(nodeType, OperationNameOf(LessNode))) ret = true;
|
||||
else if (EqualInsensitive(nodeType, OperationNameOf(NotEqualNode))) ret = true;
|
||||
else if (EqualInsensitive(nodeType, OperationNameOf(ClipNode))) ret = true;
|
||||
else if (EqualInsensitive(nodeType, OperationNameOf(ConvolutionNode), L"Convolve")) ret = true;
|
||||
else if (EqualInsensitive(nodeType, OperationNameOf(CropNode))) ret = true;
|
||||
else if (EqualInsensitive(nodeType, OperationNameOf(PassNode))) ret = true;
|
||||
else if (EqualInsensitive(nodeType, OperationNameOf(PoolingNode))) ret = true;
|
||||
else if (EqualInsensitive(nodeType, OperationNameOf(CosDistanceNode), L"CosDist")) ret = true;
|
||||
else if (EqualInsensitive(nodeType, OperationNameOf(CosDistanceWithNegativeSamplesNode), L"CosWithNegSamples")) ret = true;
|
||||
|
|
|
@ -59,12 +59,19 @@ shared_ptr<C> CreateObject(const ScriptableObjects::IConfigRecord& config, const
|
|||
template <class C>
|
||||
shared_ptr<C> CreateObject(const ConfigParameters& config, const wchar_t* id)
|
||||
{
|
||||
ConfigParameters readerConfig(config(id));
|
||||
if (!readerConfig.ExistsCurrent("traceLevel")) // do not overwrite "traceLevel" if it's already present
|
||||
ConfigParameters objConfig(config(id));
|
||||
const auto& readerType = string(objConfig("readerType", ""));
|
||||
if (objConfig.ExistsCurrent("traceLevel") || // do not overwrite "traceLevel" if it's already present
|
||||
AreEqualIgnoreCase(readerType, "CNTKTextFormatReader") || // do not overwrite "traceLevel" when creating a CTF reader
|
||||
AreEqualIgnoreCase(readerType, "CNTKBinaryReader")) // do not overwrite "traceLevel" when creating a binary reader
|
||||
{
|
||||
readerConfig.Insert("traceLevel", config(L"traceLevel", "0")); // TODO: fix this by adding it to all config blocks. Easy to fix in BS as 'config with [ traceLevel = 0 ]'.
|
||||
return make_shared<C>(objConfig);
|
||||
}
|
||||
return make_shared<C>(readerConfig); // old CNTK config specifies a dictionary which then must be explicitly instantiated
|
||||
|
||||
// If the config does not specify a 'traceLevel', the following line
|
||||
// will insert it with the value of 0.
|
||||
objConfig.Insert("traceLevel", config(L"traceLevel", "0")); // TODO: fix this by adding it to all config blocks. Easy to fix in BS as 'config with [ traceLevel = 0 ]'.
|
||||
return make_shared<C>(objConfig); // old CNTK config specifies a dictionary which then must be explicitly instantiated
|
||||
}
|
||||
|
||||
template <class ConfigRecordType, typename ElemType>
|
||||
|
|
|
@ -577,6 +577,9 @@ Shift(input, fromOffset, boundaryValue, boundaryMode=-1/*context*/, dim=-1, tag=
|
|||
RowSlice(beginIndex, numRows, input, tag='') = Slice(beginIndex, beginIndex + numRows, input, axis = 1)
|
||||
RowRepeat(input, numRepeats, tag='') = new ComputationNode [ operation = 'RowRepeat' ; inputs = _AsNodes (input) /*plus the function args*/ ]
|
||||
RowStack(inputs, axis=1, tag='') = new ComputationNode [ operation = 'RowStack' /*plus the function args*/ ]
|
||||
EditDistanceError(leftInput, rightInput, subPen=0.0, delPen=0.0, insPen=0.0, squashInputs=false, tokensToIgnore=[||], tag='') = new ComputationNode [ operation = 'EditDistanceError' ; inputs = _AsNodes (leftInput : rightInput) /*plus the function args*/ ]
|
||||
ForwardBackward(graph, features, blankTokenId, delayConstraint=-1, tag='') = new ComputationNode [ operation = 'ForwardBackward' ; inputs = _AsNodes (graph : features) /*plus the function args*/ ]
|
||||
LabelsToGraph(labels, tag='') = new ComputationNode [ operation = 'LabelsToGraph' ; inputs = _AsNodes (labels) /*plus the function args*/ ]
|
||||
Slice(beginIndex, endIndex, input, axis=1, tag='') =
|
||||
if axis < 0 then [ # time axis: specify -1
|
||||
beginFlags = if beginIndex > 0 then BS.Boolean.Not (BS.Loop.IsFirstN (beginIndex, input)) else BS.Loop.IsLastN (-beginIndex, input)
|
||||
|
|
|
@ -36,6 +36,7 @@
|
|||
#include "BrainScriptEvaluator.h"
|
||||
#include "BrainScriptParser.h"
|
||||
#include "PerformanceProfiler.h"
|
||||
#include "CNTKLibrary.h"
|
||||
|
||||
#include <string>
|
||||
#include <chrono>
|
||||
|
@ -252,9 +253,6 @@ void DoCommands(const ConfigParameters& config, const shared_ptr<MPIWrapper>& mp
|
|||
ProgressTracing::SetStepOffset(fullEpochsOffset); // this is the epoch number that SGD will log relative to
|
||||
}
|
||||
|
||||
if (Globals::ShouldEnableHyperCompressMemory())
|
||||
Matrix<ElemType>::UseCachedResizeOrNot(true);
|
||||
|
||||
// determine the action to perform, and do it
|
||||
for (int j = 0; j < action.size(); j++)
|
||||
{
|
||||
|
@ -372,55 +370,6 @@ std::string TimeDateStamp()
|
|||
return buf;
|
||||
}
|
||||
|
||||
void PrintBuiltInfo()
|
||||
{
|
||||
LOGPRINTF(stderr, "-------------------------------------------------------------------\n");
|
||||
LOGPRINTF(stderr, "Build info: \n\n");
|
||||
LOGPRINTF(stderr, "\t\tBuilt time: %s %s\n", __DATE__, __TIME__);
|
||||
LOGPRINTF(stderr, "\t\tLast modified date: %s\n", __TIMESTAMP__);
|
||||
#ifdef _BUILDTYPE_
|
||||
LOGPRINTF(stderr, "\t\tBuild type: %s\n", _BUILDTYPE_);
|
||||
#endif
|
||||
#ifdef _BUILDTARGET_
|
||||
LOGPRINTF(stderr, "\t\tBuild target: %s\n", _BUILDTARGET_);
|
||||
#endif
|
||||
#ifdef _WITH_1BITSGD_
|
||||
LOGPRINTF(stderr, "\t\tWith 1bit-SGD: %s\n", _WITH_1BITSGD_);
|
||||
#endif
|
||||
#ifdef _WITH_ASGD_
|
||||
LOGPRINTF(stderr, "\t\tWith ASGD: %s\n", _WITH_ASGD_);
|
||||
#endif
|
||||
#ifdef _MATHLIB_
|
||||
LOGPRINTF(stderr, "\t\tMath lib: %s\n", _MATHLIB_);
|
||||
#endif
|
||||
#ifdef _CUDA_PATH_
|
||||
LOGPRINTF(stderr, "\t\tCUDA_PATH: %s\n", _CUDA_PATH_);
|
||||
#endif
|
||||
#ifdef _CUB_PATH_
|
||||
LOGPRINTF(stderr, "\t\tCUB_PATH: %s\n", _CUB_PATH_);
|
||||
#endif
|
||||
#ifdef _CUDNN_PATH_
|
||||
LOGPRINTF(stderr, "\t\tCUDNN_PATH: %s\n", _CUDNN_PATH_);
|
||||
#endif
|
||||
#ifdef _GIT_EXIST
|
||||
LOGPRINTF(stderr, "\t\tBuild Branch: %s\n", _BUILDBRANCH_);
|
||||
LOGPRINTF(stderr, "\t\tBuild SHA1: %s\n", _BUILDSHA1_);
|
||||
#endif
|
||||
#ifdef _BUILDER_
|
||||
LOGPRINTF(stderr, "\t\tBuilt by %s on %s\n", _BUILDER_, _BUILDMACHINE_);
|
||||
#endif
|
||||
#ifdef _BUILDPATH_
|
||||
LOGPRINTF(stderr, "\t\tBuild Path: %s\n", _BUILDPATH_);
|
||||
#endif
|
||||
#ifdef _MPI_NAME_
|
||||
LOGPRINTF(stderr, "\t\tMPI distribution: %s\n", _MPI_NAME_);
|
||||
#endif
|
||||
#ifdef _MPI_VERSION_
|
||||
LOGPRINTF(stderr, "\t\tMPI version: %s\n", _MPI_VERSION_);
|
||||
#endif
|
||||
LOGPRINTF(stderr, "-------------------------------------------------------------------\n");
|
||||
}
|
||||
|
||||
void PrintUsageInfo()
|
||||
{
|
||||
LOGPRINTF(stderr, "-------------------------------------------------------------------\n");
|
||||
|
@ -585,7 +534,6 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
|
|||
|
||||
Globals::SetShareNodeValueMatrices(config(L"shareNodeValueMatrices", true));
|
||||
Globals::SetGradientAccumulationOptimization(config(L"optimizeGradientAccumulation", true));
|
||||
Globals::SetHyperCompressMemory(config(L"hyperCompressMemory", false));
|
||||
|
||||
TracingGPUMemoryAllocator::SetTraceLevel(config(L"traceGPUMemoryAllocations", 0));
|
||||
|
||||
|
@ -598,7 +546,7 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
|
|||
|
||||
RedirectStdErr(logpath);
|
||||
LOGPRINTF(stderr, "%ls\n", startupMessage.c_str());
|
||||
PrintBuiltInfo();
|
||||
::CNTK::PrintBuiltInfo();
|
||||
}
|
||||
|
||||
// echo gpu info to log
|
||||
|
@ -666,7 +614,7 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
|
|||
|
||||
static void PrintBanner(int argc, wchar_t* argv[], const string& timestamp)
|
||||
{
|
||||
fprintf(stderr, "CNTK 2.0.beta11.0+ (");
|
||||
fprintf(stderr, "CNTK 2.0.beta11.0 (");
|
||||
#ifdef _GIT_EXIST
|
||||
fprintf(stderr, "%s %.6s, ", _BUILDBRANCH_, _BUILDSHA1_);
|
||||
#endif
|
||||
|
@ -729,7 +677,6 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[])
|
|||
|
||||
Globals::SetShareNodeValueMatrices(config(L"shareNodeValueMatrices", true));
|
||||
Globals::SetGradientAccumulationOptimization(config(L"optimizeGradientAccumulation", true));
|
||||
Globals::SetHyperCompressMemory(config(L"hyperCompressMemory", false));
|
||||
|
||||
TracingGPUMemoryAllocator::SetTraceLevel(config(L"traceGPUMemoryAllocations", 0));
|
||||
|
||||
|
@ -764,7 +711,7 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[])
|
|||
}
|
||||
|
||||
// full config info
|
||||
PrintBuiltInfo();
|
||||
::CNTK::PrintBuiltInfo();
|
||||
PrintGpuInfo();
|
||||
|
||||
#ifdef _DEBUG
|
||||
|
@ -857,7 +804,7 @@ int wmain1(int argc, wchar_t* argv[]) // called from wmain which is a wrapper th
|
|||
{
|
||||
if (argc <= 1)
|
||||
{
|
||||
PrintBuiltInfo(); // print build info directly in case that user provides zero argument (convenient for checking build type)
|
||||
::CNTK::PrintBuiltInfo(); // print build info directly in case that user provides zero argument (convenient for checking build type)
|
||||
LOGPRINTF(stderr, "No command-line argument given.\n");
|
||||
PrintUsageInfo();
|
||||
fflush(stderr);
|
||||
|
|
|
@ -85,7 +85,8 @@
|
|||
<StackReserveSize>100000000</StackReserveSize>
|
||||
</Link>
|
||||
<PreBuildEvent>
|
||||
<Command>prebuild.bat "$(Configuration)" "$(CNTK_MKL_SEQUENTIAL)" "$(CNTK_ENABLE_1BitSGD)" "$(CudaPath)" "$(CUDNN_PATH)" "$(CUB_PATH)" "$(CNTK_ENABLE_ASGD)"</Command>
|
||||
<Command>
|
||||
</Command>
|
||||
</PreBuildEvent>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="$(ReleaseBuild)">
|
||||
|
@ -113,7 +114,8 @@
|
|||
<StackReserveSize>100000000</StackReserveSize>
|
||||
</Link>
|
||||
<PreBuildEvent>
|
||||
<Command>prebuild.bat "$(Configuration)" "$(CNTK_MKL_SEQUENTIAL)" "$(CNTK_ENABLE_1BitSGD)" "$(CudaPath)" "$(CUDNN_PATH)" "$(CUB_PATH)" "$(CNTK_ENABLE_ASGD)"</Command>
|
||||
<Command>
|
||||
</Command>
|
||||
</PreBuildEvent>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="$(CpuOnlyBuild)">
|
||||
|
|
|
@ -396,8 +396,16 @@ void MELScript<ElemType>::CallFunction(const std::string& p_name, const ConfigPa
|
|||
MELProperty prop = melPropNull;
|
||||
#if 1 // legacy
|
||||
// legacy names for some properties
|
||||
if (EqualInsensitive(propName, "finalCriterion", "Criteria")) propName = "criterion";
|
||||
else if (EqualInsensitive(propName, "eval")) propName = "evaluation";
|
||||
if (EqualInsensitive(propName, "finalCriterion", "Criteria"))
|
||||
{
|
||||
propName = "criterion";
|
||||
prop = melPropFinalCriterion;
|
||||
}
|
||||
else if (EqualInsensitive(propName, "eval"))
|
||||
{
|
||||
propName = "evaluation";
|
||||
prop = melPropEvaluation;
|
||||
}
|
||||
// legacy property that now works differently
|
||||
else if (EqualInsensitive(propName, "needGradient", "needsGradient") || EqualInsensitive(propName, "computeGradient"))
|
||||
prop = melPropParameterUpdateRequired; // for backward compatibility
|
||||
|
|
|
@ -1395,6 +1395,18 @@ namespace CNTK
|
|||
|
||||
CNTK_API void Add(const Dictionary& other);
|
||||
|
||||
void Add(const std::wstring& key, const DictionaryValue& value)
|
||||
{
|
||||
operator[](key.c_str()) = value;
|
||||
}
|
||||
|
||||
template<typename... Args>
|
||||
void Add(const std::wstring& key, const DictionaryValue& value, Args... args)
|
||||
{
|
||||
Add(key, value); //insert one
|
||||
Add(args...); //recurse
|
||||
}
|
||||
|
||||
CNTK_API bool operator==(const Dictionary& other) const;
|
||||
CNTK_API bool operator!=(const Dictionary& other) const;
|
||||
|
||||
|
@ -1634,6 +1646,8 @@ private:
|
|||
|
||||
Variable CompositePreservingCopy(const std::shared_ptr<const Function>& composite) const;
|
||||
|
||||
Variable NonCompositePreservingCopy() const;
|
||||
|
||||
private:
|
||||
#ifdef SWIGCSHARP
|
||||
public:
|
||||
|
@ -2735,7 +2749,7 @@ namespace CNTK
|
|||
|
||||
///
|
||||
/// Returns the root of the Function graph underlying this block Function.
|
||||
/// Throws an exception ff this is not a block Function
|
||||
/// Throws an exception if this is not a block Function
|
||||
///
|
||||
CNTK_API FunctionPtr BlockRoot() const;
|
||||
|
||||
|
@ -4430,6 +4444,20 @@ namespace CNTK
|
|||
std::wstring m_streamAlias;
|
||||
};
|
||||
|
||||
///
/// Describes one HTK feature stream. HTKFeatureDeserializer() turns each instance into a
/// per-stream configuration entry keyed by m_streamName.
///
struct HTKFeatureConfiguration
{
    /// Members are initialized in declaration order, so the initializer list below is
    /// written in that same order (avoids -Wreorder warnings and misleading reading order).
    HTKFeatureConfiguration(const std::wstring& streamName, const std::wstring& scp, size_t dim, size_t left, size_t right, bool broadcast)
        : m_streamName(streamName), m_scp(scp), m_dim(dim), m_left(left), m_right(right), m_broadcast(broadcast)
    {}

    std::wstring m_streamName; // key under which the stream is registered in the "input" section
    std::wstring m_scp;        // emitted as "scpFile"
    size_t m_dim;              // emitted as "dim"
    size_t m_left;             // first element of "contextWindow"
    size_t m_right;            // second element of "contextWindow"
    bool m_broadcast;          // emitted as "expandToUtterance"
};
|
||||
|
||||
///
|
||||
/// Instantiate the CNTK built-in text format minibatch source
|
||||
///
|
||||
|
@ -4475,6 +4503,56 @@ namespace CNTK
|
|||
return CreateCompositeMinibatchSource(minibatchSourceConfiguration);
|
||||
}
|
||||
|
||||
typedef Dictionary ImageTransform;
|
||||
|
||||
///
|
||||
/// Create a crop transform with the specified options to be used with a reader
|
||||
///
|
||||
CNTK_API ImageTransform ReaderCrop(const wchar_t* cropType = L"center",
|
||||
int cropSize = 0, float sideRatio = 0.0f, float areaRatio = 0.0f,
|
||||
float aspectRatio = 1.0f, const wchar_t* jitterType = L"none");
|
||||
|
||||
///
|
||||
/// Create a scale transform with the specified options to be used with a reader
|
||||
///
|
||||
CNTK_API ImageTransform ReaderScale(int width,
|
||||
int height, int channels, const wchar_t* interpolations = L"linear",
|
||||
const wchar_t* scaleMode = L"fill", int padValue = -1);
|
||||
|
||||
///
|
||||
/// Create a mean subtraction transform with the specified options to be used with a reader
|
||||
///
|
||||
CNTK_API ImageTransform ReaderMean(const wchar_t* meanFile);
|
||||
|
||||
///
|
||||
/// Create a color transform with the specified options to be used with a reader
|
||||
///
|
||||
CNTK_API ImageTransform ReaderColor(float brightnessRadius = 0.0f,
|
||||
float contrastRadius = 0.0f, float saturationRadius = 0.0f);
|
||||
|
||||
|
||||
typedef Dictionary Deserializer;
|
||||
|
||||
///
|
||||
/// Create an ImageDeserializer with the specified options
|
||||
///
|
||||
CNTK_API Deserializer ImageDeserializer(const std::wstring& fileName, const std::wstring& labelStreamName, size_t numLabels, const std::wstring& imageStreamName, const std::vector<ImageTransform>& transforms = {});
|
||||
|
||||
///
|
||||
/// Create a CTFDeserializer with the specified options
|
||||
///
|
||||
CNTK_API Deserializer CTFDeserializer(const std::wstring& fileName, const std::vector<StreamConfiguration>& streams);
|
||||
|
||||
///
|
||||
/// Create an HTKFeatureDeserializer with the specified options
|
||||
///
|
||||
CNTK_API Deserializer HTKFeatureDeserializer(const std::vector<HTKFeatureConfiguration>& streams);
|
||||
|
||||
///
|
||||
/// Create an HTKMLFDeserializer with the specified options
|
||||
///
|
||||
CNTK_API Deserializer HTKMLFDeserializer(const std::wstring& streamName, const std::wstring& labelMappingFile, size_t dimension, const std::vector<std::wstring>& mlfFiles);
|
||||
|
||||
///
|
||||
/// Compute the per dimension means and variances for each of the specified streams using data from the specified minibatchSource.
|
||||
///
|
||||
|
@ -4769,6 +4847,9 @@ namespace CNTK
|
|||
bool keepExistingCheckpoints = false,
|
||||
size_t maxNumberOfTrainingSamples = std::numeric_limits<size_t>::max(),
|
||||
size_t progressFrequency = std::numeric_limits<size_t>::max());
|
||||
|
||||
|
||||
CNTK_API void PrintBuiltInfo();
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -250,9 +250,6 @@ namespace CNTK
|
|||
CNTK_API void EnableForwardValuesSharing();
|
||||
CNTK_API void DisableForwardValuesSharing();
|
||||
|
||||
CNTK_API void EnableHyperMemoryCompress();
|
||||
CNTK_API void DisableHyperMemoryCompress();
|
||||
|
||||
CNTK_API void EnableGradientAccumulationOptimization();
|
||||
CNTK_API void DisableGradientAccumulationOptimization();
|
||||
|
||||
|
|
|
@ -144,6 +144,8 @@ namespace CNTK
|
|||
opType = PrimitiveOpType::Sin;
|
||||
else if (node->OperationName() == OperationNameOf(PassNode))
|
||||
opType = PrimitiveOpType::Pass;
|
||||
else if (node->OperationName() == OperationNameOf(LabelsToGraphNode))
|
||||
opType = PrimitiveOpType::LabelsToGraph;
|
||||
else if (node->OperationName() == OperationNameOf(RectifiedLinearNode))
|
||||
opType = PrimitiveOpType::ReLU;
|
||||
else if (node->OperationName() == OperationNameOf(ExpNode))
|
||||
|
@ -450,7 +452,7 @@ namespace CNTK
|
|||
primitiveFunctionConfigParameters[PrimitiveFunction::AttributeNameDeletionPenalty] = edNode->DeletionPenalty();
|
||||
primitiveFunctionConfigParameters[PrimitiveFunction::AttributeNameSubstitutionPenalty] = edNode->SubstitutionPenalty();
|
||||
primitiveFunctionConfigParameters[PrimitiveFunction::AttributeNameSquashInputs] = edNode->SquashInputs();
|
||||
primitiveFunctionConfigParameters[PrimitiveFunction::AttributeNameSamplesToIgnore] = AsDictionaryValueVector(edNode->SamplesToIgnore());
|
||||
primitiveFunctionConfigParameters[PrimitiveFunction::AttributeNameTokensToIgnore] = AsDictionaryValueVector(edNode->TokensToIgnore());
|
||||
|
||||
opType = PrimitiveOpType::EditDistanceError;
|
||||
}
|
||||
|
|
|
@ -106,6 +106,12 @@
|
|||
<DelayLoadDLLs>Math.dll; msmpi.dll; PerformanceProfilerDll.dll </DelayLoadDLLs>
|
||||
<OptimizeReferences Condition="'$(Configuration)|$(Platform)'=='Debug_CpuOnly|x64'">false</OptimizeReferences>
|
||||
</Link>
|
||||
<PreBuildEvent>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release_CpuOnly|x64'">prebuild.bat "$(Configuration)" "$(CNTK_MKL_SEQUENTIAL)" "$(CNTK_ENABLE_1BitSGD)" "$(CudaPath)" "$(CUDNN_PATH)" "$(CUB_PATH)" "$(CNTK_ENABLE_ASGD)"</Command>
|
||||
</PreBuildEvent>
|
||||
<PreBuildEvent>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug_CpuOnly|x64'">prebuild.bat "$(Configuration)" "$(CNTK_MKL_SEQUENTIAL)" "$(CNTK_ENABLE_1BitSGD)" "$(CudaPath)" "$(CUDNN_PATH)" "$(CUB_PATH)" "$(CNTK_ENABLE_ASGD)"</Command>
|
||||
</PreBuildEvent>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="$(GpuBuild)">
|
||||
<ClCompile>
|
||||
|
@ -118,6 +124,15 @@
|
|||
<Command>if exist "%ProgramW6432%\NVIDIA Corporation\NVSMI" xcopy /I /D /Y "%ProgramW6432%\NVIDIA Corporation\NVSMI\nvml*.dll" "$(TargetDir)"</Command>
|
||||
<Message>Copying NVidia GDK extension DLL to target folder</Message>
|
||||
</PostBuildEvent>
|
||||
<PreBuildEvent>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">prebuild.bat "$(Configuration)" "$(CNTK_MKL_SEQUENTIAL)" "$(CNTK_ENABLE_1BitSGD)" "$(CudaPath)" "$(CUDNN_PATH)" "$(CUB_PATH)" "$(CNTK_ENABLE_ASGD)"</Command>
|
||||
</PreBuildEvent>
|
||||
<PreBuildEvent>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release_NoOpt|x64'">prebuild.bat "$(Configuration)" "$(CNTK_MKL_SEQUENTIAL)" "$(CNTK_ENABLE_1BitSGD)" "$(CudaPath)" "$(CUDNN_PATH)" "$(CUB_PATH)" "$(CNTK_ENABLE_ASGD)"</Command>
|
||||
</PreBuildEvent>
|
||||
<PreBuildEvent>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">prebuild.bat "$(Configuration)" "$(CNTK_MKL_SEQUENTIAL)" "$(CNTK_ENABLE_1BitSGD)" "$(CudaPath)" "$(CUDNN_PATH)" "$(CUB_PATH)" "$(CNTK_ENABLE_ASGD)"</Command>
|
||||
</PreBuildEvent>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemGroup>
|
||||
<ClInclude Include="API\CNTKLibrary.h" />
|
||||
|
|
|
@ -17,8 +17,11 @@
|
|||
#include "PerformanceProfiler.h"
|
||||
#include "MPIWrapper.h"
|
||||
#include "Basics.h"
|
||||
#include "ProgressTracing.h"
|
||||
#include "buildinfo.h"
|
||||
|
||||
extern bool g_shareNodeValueMatrices;
|
||||
using namespace Microsoft::MSR::CNTK;
|
||||
|
||||
namespace CNTK
|
||||
{
|
||||
|
@ -84,16 +87,6 @@ namespace CNTK
|
|||
Microsoft::MSR::CNTK::Globals::SetShareNodeValueMatrices(/* enable = */ false);
|
||||
}
|
||||
|
||||
void EnableHyperMemoryCompress()
|
||||
{
|
||||
Microsoft::MSR::CNTK::Globals::SetHyperCompressMemory(/* enable = */ true);
|
||||
}
|
||||
|
||||
void DisableHyperMemoryCompress()
|
||||
{
|
||||
Microsoft::MSR::CNTK::Globals::SetHyperCompressMemory(/* enable = */ false);
|
||||
}
|
||||
|
||||
void EnableGradientAccumulationOptimization()
|
||||
{
|
||||
Microsoft::MSR::CNTK::Globals::SetGradientAccumulationOptimization(/* enable = */ true);
|
||||
|
@ -617,6 +610,56 @@ namespace CNTK
|
|||
va_end(args);
|
||||
}
|
||||
|
||||
|
||||
void PrintBuiltInfo()
|
||||
{
|
||||
LOGPRINTF(stderr, "-------------------------------------------------------------------\n");
|
||||
LOGPRINTF(stderr, "Build info: \n\n");
|
||||
LOGPRINTF(stderr, "\t\tBuilt time: %s %s\n", __DATE__, __TIME__);
|
||||
LOGPRINTF(stderr, "\t\tLast modified date: %s\n", __TIMESTAMP__);
|
||||
#ifdef _BUILDTYPE_
|
||||
LOGPRINTF(stderr, "\t\tBuild type: %s\n", _BUILDTYPE_);
|
||||
#endif
|
||||
#ifdef _BUILDTARGET_
|
||||
LOGPRINTF(stderr, "\t\tBuild target: %s\n", _BUILDTARGET_);
|
||||
#endif
|
||||
#ifdef _WITH_1BITSGD_
|
||||
LOGPRINTF(stderr, "\t\tWith 1bit-SGD: %s\n", _WITH_1BITSGD_);
|
||||
#endif
|
||||
#ifdef _WITH_ASGD_
|
||||
LOGPRINTF(stderr, "\t\tWith ASGD: %s\n", _WITH_ASGD_);
|
||||
#endif
|
||||
#ifdef _MATHLIB_
|
||||
LOGPRINTF(stderr, "\t\tMath lib: %s\n", _MATHLIB_);
|
||||
#endif
|
||||
#ifdef _CUDA_PATH_
|
||||
LOGPRINTF(stderr, "\t\tCUDA_PATH: %s\n", _CUDA_PATH_);
|
||||
#endif
|
||||
#ifdef _CUB_PATH_
|
||||
LOGPRINTF(stderr, "\t\tCUB_PATH: %s\n", _CUB_PATH_);
|
||||
#endif
|
||||
#ifdef _CUDNN_PATH_
|
||||
LOGPRINTF(stderr, "\t\tCUDNN_PATH: %s\n", _CUDNN_PATH_);
|
||||
#endif
|
||||
#ifdef _GIT_EXIST
|
||||
LOGPRINTF(stderr, "\t\tBuild Branch: %s\n", _BUILDBRANCH_);
|
||||
LOGPRINTF(stderr, "\t\tBuild SHA1: %s\n", _BUILDSHA1_);
|
||||
#endif
|
||||
#ifdef _BUILDER_
|
||||
LOGPRINTF(stderr, "\t\tBuilt by %s on %s\n", _BUILDER_, _BUILDMACHINE_);
|
||||
#endif
|
||||
#ifdef _BUILDPATH_
|
||||
LOGPRINTF(stderr, "\t\tBuild Path: %s\n", _BUILDPATH_);
|
||||
#endif
|
||||
#ifdef _MPI_NAME_
|
||||
LOGPRINTF(stderr, "\t\tMPI distribution: %s\n", _MPI_NAME_);
|
||||
#endif
|
||||
#ifdef _MPI_VERSION_
|
||||
LOGPRINTF(stderr, "\t\tMPI version: %s\n", _MPI_VERSION_);
|
||||
#endif
|
||||
LOGPRINTF(stderr, "-------------------------------------------------------------------\n");
|
||||
}
|
||||
|
||||
template CNTK_API __declspec_noreturn void ThrowFormatted<std::runtime_error>(const char* format, ...);
|
||||
template CNTK_API __declspec_noreturn void ThrowFormatted<std::logic_error>(const char* format, ...);
|
||||
template CNTK_API __declspec_noreturn void ThrowFormatted<std::invalid_argument>(const char* format, ...);
|
||||
|
|
|
@ -721,8 +721,8 @@ namespace CNTK
|
|||
auto delPen = functionConfig[PrimitiveFunction::AttributeNameDeletionPenalty].Value<float>();
|
||||
auto insPen = functionConfig[PrimitiveFunction::AttributeNameInsertionPenalty].Value<float>();
|
||||
auto squashInputs = functionConfig[PrimitiveFunction::AttributeNameSquashInputs].Value<bool>();
|
||||
auto samplesToIgnore = AsVector<size_t>(functionConfig[PrimitiveFunction::AttributeNameSamplesToIgnore].Value<std::vector<DictionaryValue>>());
|
||||
computationNodePtr = New<EditDistanceErrorNode<ElementType>>(network->GetDeviceId(), subPen, delPen, insPen, squashInputs, samplesToIgnore, internalNodeName);
|
||||
auto tokensToIgnore = AsVector<size_t>(functionConfig[PrimitiveFunction::AttributeNameTokensToIgnore].Value<std::vector<DictionaryValue>>());
|
||||
computationNodePtr = New<EditDistanceErrorNode<ElementType>>(network->GetDeviceId(), internalNodeName, subPen, delPen, insPen, squashInputs, tokensToIgnore);
|
||||
break;
|
||||
}
|
||||
case PrimitiveOpType::LambdaRank:
|
||||
|
@ -813,6 +813,9 @@ namespace CNTK
|
|||
case PrimitiveOpType::Pass:
|
||||
computationNodePtr = New<PassNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
case PrimitiveOpType::LabelsToGraph:
|
||||
computationNodePtr = New<LabelsToGraphNode<ElementType>>(network->GetDeviceId(), internalNodeName);
|
||||
break;
|
||||
default:
|
||||
CNTK::LogicError("Specified op %S not yet supported", PrimitiveOpTypeName(op).c_str());
|
||||
break;
|
||||
|
@ -932,6 +935,18 @@ namespace CNTK
|
|||
return computationNodePtr;
|
||||
}
|
||||
|
||||
std::unordered_set<Variable> CompositeFunction::NonOwnerPreservingCopy(const std::unordered_set<Variable>& outputs)
|
||||
{
|
||||
std::unordered_set<Variable> result;
|
||||
for (auto& o : outputs)
|
||||
{
|
||||
Variable sanitized = o.NonCompositePreservingCopy();
|
||||
result.insert(sanitized);
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
template <typename ElementType>
|
||||
ComputationNetworkPtr CompositeFunction::GetComputationNetwork(const DeviceDescriptor& device,
|
||||
const std::unordered_set<Variable>& backpropRoots,
|
||||
|
@ -941,7 +956,7 @@ namespace CNTK
|
|||
{
|
||||
if (m_computationNetwork != nullptr)
|
||||
{
|
||||
// TODO: We should either invalidate and readapt the network if he backpropRoots change compared to what was specified when the network
|
||||
// TODO: We should either invalidate and readapt the network if the backpropRoots change compared to what was specified when the network
|
||||
// was last constructed, or just recreate a new network.
|
||||
// For now just disallow changing the backpropRoots after the network is created
|
||||
if (!backpropRoots.empty() && (m_currentBackpropRoots != backpropRoots))
|
||||
|
@ -966,7 +981,7 @@ namespace CNTK
|
|||
InvalidArgument("Function::Forward: Only inputs of a Function can be excluded from gradient computation");
|
||||
}
|
||||
|
||||
m_inputsExcludedFromGradientComputation = inputsToExcludeGradientsFor;
|
||||
m_inputsExcludedFromGradientComputation = NonOwnerPreservingCopy(inputsToExcludeGradientsFor);
|
||||
|
||||
ComputationNetworkBuilder<ElementType> builder(*m_computationNetwork);
|
||||
|
||||
|
@ -1023,7 +1038,7 @@ namespace CNTK
|
|||
}
|
||||
}
|
||||
|
||||
m_currentBackpropRoots = backpropRoots;
|
||||
m_currentBackpropRoots = NonOwnerPreservingCopy(backpropRoots);
|
||||
|
||||
// In case of recurrence, the inputs of some of the ComputationNodes are not attached due to cycles.
|
||||
// Now attach those after we have created all ComputationNodes in the network
|
||||
|
@ -1317,10 +1332,12 @@ namespace CNTK
|
|||
{
|
||||
if (m_perOutputVarArgumentDependencies.find(output) == m_perOutputVarArgumentDependencies.end())
|
||||
{
|
||||
if (output.IsOutput())
|
||||
m_perOutputVarArgumentDependencies[output] = AsComposite(output.Owner())->Arguments();
|
||||
auto sanitizedOutput = output.NonCompositePreservingCopy();
|
||||
|
||||
if (sanitizedOutput.IsOutput())
|
||||
m_perOutputVarArgumentDependencies[sanitizedOutput] = AsComposite(sanitizedOutput.Owner())->Arguments();
|
||||
else
|
||||
m_perOutputVarArgumentDependencies[output] = { output };
|
||||
m_perOutputVarArgumentDependencies[sanitizedOutput] = { sanitizedOutput };
|
||||
}
|
||||
|
||||
return m_perOutputVarArgumentDependencies[output];
|
||||
|
@ -1381,12 +1398,13 @@ namespace CNTK
|
|||
std::unordered_set<Variable> functionOutputs(m_outputs.begin(), m_outputs.end());
|
||||
std::vector<ComputationNodeBasePtr> outputsToEvaluate;
|
||||
std::unordered_set<Variable> requiredArguments;
|
||||
for (auto outputVarValuePair : outputs)
|
||||
|
||||
for (auto outputVariable : requestedOutputVariables)
|
||||
{
|
||||
auto& requiredArgumentsForCurrentOutput = GetArgumentDependencies(outputVarValuePair.first);
|
||||
auto& requiredArgumentsForCurrentOutput = GetArgumentDependencies(outputVariable);
|
||||
requiredArguments.insert(requiredArgumentsForCurrentOutput.begin(), requiredArgumentsForCurrentOutput.end());
|
||||
|
||||
auto outputComputationNode = m_variableToNodeMap.at(outputVarValuePair.first);
|
||||
auto outputComputationNode = m_variableToNodeMap.at(outputVariable);
|
||||
outputsToEvaluate.push_back(outputComputationNode);
|
||||
}
|
||||
|
||||
|
|
|
@ -33,6 +33,13 @@ namespace CNTK
|
|||
class CompositeFunction;
|
||||
typedef std::shared_ptr<CompositeFunction> CompositeFunctionPtr;
|
||||
|
||||
///
|
||||
/// Represents a symbolic computation with zero or more input arguments and one or more outputs.
|
||||
/// Opposed to primitive functions, a composite function is composed of other Function instances whose inputs and outputs are wired together.
|
||||
/// CompositeFunction is also responsible for breaking the loop in case of cyclic graphs - it stores the pointers to the child primitive
|
||||
/// functions and controls their lifetime.
|
||||
/// CompositeFunction class inherits thus from Function.
|
||||
///
|
||||
class CompositeFunction final : public Function
|
||||
{
|
||||
friend class Function;
|
||||
|
@ -258,6 +265,9 @@ namespace CNTK
|
|||
void GetNetworkOutputs(std::unordered_map<Variable, ValuePtr>& outputs);
|
||||
void GetNetworkGradients(std::unordered_map<Variable, ValuePtr>& gradients);
|
||||
|
||||
// Remove cyclic references for composite nodes
|
||||
static std::unordered_set<Variable> NonOwnerPreservingCopy(const std::unordered_set<Variable>& outputs);
|
||||
|
||||
const std::vector<Variable>& GetArgumentDependencies(const Variable& output);
|
||||
|
||||
std::unordered_map<Variable, uint64_t> GetCurrentBackpropRootsTimeStamps() const;
|
||||
|
|
|
@ -340,16 +340,16 @@ namespace CNTK
|
|||
if (dataType == DataType::Float)
|
||||
{
|
||||
if (inputData == outputData)
|
||||
m_mpi->AllReduceAsync<float>(static_cast<float*>(outputData), numElements, &allReduceRequests[i]);
|
||||
m_mpi->AllReduceAsync(static_cast<float*>(outputData), numElements, &allReduceRequests[i]);
|
||||
else
|
||||
m_mpi->AllReduceAsync<float>(static_cast<float*>(inputData), static_cast<float*>(outputData), numElements, &allReduceRequests[i]);
|
||||
m_mpi->AllReduceAsync(static_cast<float*>(inputData), static_cast<float*>(outputData), numElements, &allReduceRequests[i]);
|
||||
}
|
||||
else if (dataType == DataType::Double)
|
||||
{
|
||||
if (inputData == outputData)
|
||||
m_mpi->AllReduceAsync<double>(static_cast<double*>(outputData), numElements, &allReduceRequests[i]);
|
||||
m_mpi->AllReduceAsync(static_cast<double*>(outputData), numElements, &allReduceRequests[i]);
|
||||
else
|
||||
m_mpi->AllReduceAsync<double>(static_cast<double*>(inputData), static_cast<double*>(outputData), numElements, &allReduceRequests[i]);
|
||||
m_mpi->AllReduceAsync(static_cast<double*>(inputData), static_cast<double*>(outputData), numElements, &allReduceRequests[i]);
|
||||
}
|
||||
else
|
||||
LogicError("Unknown DataType");
|
||||
|
|
|
@ -1078,14 +1078,14 @@ namespace CNTK
|
|||
}
|
||||
}
|
||||
|
||||
FunctionPtr EditDistanceError(const Variable& prediction, const Variable& labels, float subPen, float delPen, float insPen, bool squashInputs, const vector<size_t>& samplesToIgnore, const std::wstring& name)
|
||||
FunctionPtr EditDistanceError(const Variable& prediction, const Variable& labels, float subPen, float delPen, float insPen, bool squashInputs, const vector<size_t>& tokensToIgnore, const std::wstring& name)
|
||||
{
|
||||
auto additionalProperties = Dictionary();
|
||||
additionalProperties[PrimitiveFunction::AttributeNameSubstitutionPenalty] = subPen;
|
||||
additionalProperties[PrimitiveFunction::AttributeNameDeletionPenalty] = delPen;
|
||||
additionalProperties[PrimitiveFunction::AttributeNameInsertionPenalty] = insPen;
|
||||
additionalProperties[PrimitiveFunction::AttributeNameSquashInputs] = squashInputs;
|
||||
additionalProperties[PrimitiveFunction::AttributeNameSamplesToIgnore] = AsDictionaryValueVector(samplesToIgnore);
|
||||
additionalProperties[PrimitiveFunction::AttributeNameTokensToIgnore] = AsDictionaryValueVector(tokensToIgnore);
|
||||
|
||||
return BinaryOp(PrimitiveOpType::EditDistanceError, prediction, labels, std::move(additionalProperties), name);
|
||||
}
|
||||
|
|
|
@ -349,4 +349,117 @@ namespace CNTK
|
|||
m_epochEndReached = false;
|
||||
m_prevMinibatchSize = 0;
|
||||
}
|
||||
|
||||
// Builds the configuration dictionary of a "Crop" image transform.
// Each argument is stored under the key of the same name.
ImageTransform ReaderCrop(const wchar_t* cropType,
                          int cropSize, float sideRatio, float areaRatio,
                          float aspectRatio, const wchar_t* jitterType)
{
    ImageTransform transform;
    transform.Add(
        L"type", L"Crop",
        L"cropType", cropType,
        L"cropSize", cropSize,
        L"sideRatio", sideRatio,
        L"areaRatio", areaRatio,
        L"aspectRatio", aspectRatio,
        L"jitterType", jitterType);
    return transform;
}
|
||||
|
||||
// Builds the configuration dictionary of a "Scale" image transform.
// Each argument is stored under the key of the same name.
ImageTransform ReaderScale(int width,
                           int height, int channels, const wchar_t* interpolations,
                           const wchar_t* scaleMode, int padValue)
{
    ImageTransform transform;
    transform.Add(
        L"type", L"Scale",
        L"width", width,
        L"height", height,
        L"channels", channels,
        L"interpolations", interpolations,
        L"scaleMode", scaleMode,
        L"padValue", padValue);
    return transform;
}
|
||||
|
||||
// Builds the configuration dictionary of a "Mean" image transform that refers to 'meanFile'.
ImageTransform ReaderMean(const wchar_t* meanFile)
{
    ImageTransform transform;
    transform.Add(
        L"type", L"Mean",
        L"meanFile", meanFile);
    return transform;
}
|
||||
|
||||
// Builds the configuration dictionary of a "Color" image transform.
// Each radius is stored under the key of the same name.
ImageTransform ReaderColor(float brightnessRadius,
                           float contrastRadius, float saturationRadius)
{
    ImageTransform transform;
    transform.Add(
        L"type", L"Color",
        L"brightnessRadius", brightnessRadius,
        L"contrastRadius", contrastRadius,
        L"saturationRadius", saturationRadius);
    return transform;
}
|
||||
|
||||
// Builds the configuration dictionary of an "ImageDeserializer".
//   fileName        - stored under "file"
//   labelStreamName - name of the label stream; its entry carries "labelDim" = numLabels
//   imageStreamName - name of the image stream; its entry carries "transforms"
//   transforms      - transform dictionaries (e.g. from ReaderCrop/ReaderScale), kept in the given order
Deserializer ImageDeserializer(const std::wstring& fileName, const std::wstring& labelStreamName, size_t numLabels, const std::wstring& imageStreamName, const std::vector<ImageTransform>& transforms)
{
    Deserializer img;

    // Wrap every transform in a DictionaryValue. The lambda takes its argument by const
    // reference so that each dictionary is copied only once (into the DictionaryValue),
    // and the destination is reserved up front to avoid reallocations while appending.
    std::vector<DictionaryValue> actualTransforms;
    actualTransforms.reserve(transforms.size());
    std::transform(transforms.begin(), transforms.end(), std::back_inserter(actualTransforms),
                   [](const ImageTransform& t) { return static_cast<DictionaryValue>(t); });

    Dictionary labeldim;
    labeldim[L"labelDim"] = numLabels;

    Dictionary xforms;
    xforms[L"transforms"] = actualTransforms;

    // One entry per stream, keyed by the stream name.
    Dictionary input;
    input.Add(imageStreamName.c_str(), xforms, labelStreamName.c_str(), labeldim);

    img.Add(L"type", L"ImageDeserializer", L"file", fileName, L"input", input);
    return img;
}
|
||||
|
||||
// Builds the configuration dictionary of a "CNTKTextFormatDeserializer" reading 'fileName'.
// Every element of 'streams' becomes an entry of the "input" section keyed by its
// m_streamName and carrying "alias", "dim" and "format" ("sparse" or "dense").
Deserializer CTFDeserializer(const std::wstring& fileName, const std::vector<StreamConfiguration>& streams)
{
    Dictionary inputStreams;
    for (const auto& streamConfig : streams)
    {
        const wchar_t* format = streamConfig.m_isSparse ? L"sparse" : L"dense";

        Dictionary streamEntry;
        streamEntry.Add(L"alias", streamConfig.m_streamAlias, L"dim", streamConfig.m_dim, L"format", format);
        inputStreams[streamConfig.m_streamName] = streamEntry;
    }

    Deserializer deserializer;
    deserializer.Add(L"type", L"CNTKTextFormatDeserializer", L"file", fileName, L"input", inputStreams);
    return deserializer;
}
|
||||
|
||||
// Builds the configuration dictionary of an "HTKFeatureDeserializer".
// Every element of 'streams' becomes an entry of the "input" section keyed by its
// m_streamName and carrying "scpFile", "dim", "contextWindow" ({left, right}) and
// "expandToUtterance".
Deserializer HTKFeatureDeserializer(const std::vector<HTKFeatureConfiguration>& streams)
{
    Dictionary inputStreams;
    for (const auto& featureConfig : streams)
    {
        std::vector<DictionaryValue> contextWindow = { DictionaryValue(featureConfig.m_left), DictionaryValue(featureConfig.m_right) };

        Dictionary streamEntry;
        streamEntry.Add(L"scpFile", featureConfig.m_scp, L"dim", featureConfig.m_dim, L"contextWindow", contextWindow, L"expandToUtterance", featureConfig.m_broadcast);
        inputStreams[featureConfig.m_streamName] = streamEntry;
    }

    Deserializer deserializer;
    deserializer.Add(L"type", L"HTKFeatureDeserializer", L"input", inputStreams);
    return deserializer;
}
|
||||
|
||||
// Builds the configuration dictionary of an "HTKMLFDeserializer" for a single label stream.
// The stream entry (keyed by 'streamName') carries "labelMappingFile", "dim" and either
// "mlfFile" (exactly one file given) or "mlfFileList" (several files given).
// Calls LogicError when 'mlfFiles' is empty.
Deserializer HTKMLFDeserializer(const std::wstring& streamName, const std::wstring& labelMappingFile, size_t dimension, const std::vector<std::wstring>& mlfFiles)
{
    Dictionary labelConfig;
    labelConfig.Add(L"labelMappingFile", labelMappingFile, L"dim", dimension);

    std::vector<DictionaryValue> mlfValues;
    std::transform(mlfFiles.begin(), mlfFiles.end(), std::back_inserter(mlfValues),
                   [](const std::wstring& file) { return static_cast<DictionaryValue>(file); });

    if (mlfValues.empty())
        LogicError("HTKMLFDeserializer: No mlf files were specified");
    else if (mlfValues.size() == 1)
        labelConfig[L"mlfFile"] = mlfValues[0];
    else
        labelConfig[L"mlfFileList"] = mlfValues;

    Dictionary inputStreams;
    inputStreams[streamName] = labelConfig;

    Deserializer deserializer;
    deserializer.Add(L"type", L"HTKMLFDeserializer", L"input", inputStreams);
    return deserializer;
}
|
||||
}
|
||||
|
|
|
@ -79,7 +79,7 @@ namespace CNTK
|
|||
/*static*/ const std::wstring PrimitiveFunction::AttributeNameDeletionPenalty = L"DeletionPenalty";
|
||||
/*static*/ const std::wstring PrimitiveFunction::AttributeNameInsertionPenalty = L"InsertionPenalty";
|
||||
/*static*/ const std::wstring PrimitiveFunction::AttributeNameSquashInputs = L"SquashInputs";
|
||||
/*static*/ const std::wstring PrimitiveFunction::AttributeNameSamplesToIgnore = L"SamplesToIgnore";
|
||||
/*static*/ const std::wstring PrimitiveFunction::AttributeNameTokensToIgnore = L"TokensToIgnore";
|
||||
|
||||
/*static*/ DataType PrimitiveFunction::GetOutputDataType(PrimitiveOpType op, std::vector<Variable>& inputs, bool inferDimensions)
|
||||
{
|
||||
|
|
|
@ -235,7 +235,7 @@ namespace CNTK
|
|||
static const std::wstring AttributeNameDeletionPenalty;
|
||||
static const std::wstring AttributeNameInsertionPenalty;
|
||||
static const std::wstring AttributeNameSquashInputs;
|
||||
static const std::wstring AttributeNameSamplesToIgnore;
|
||||
static const std::wstring AttributeNameTokensToIgnore;
|
||||
|
||||
protected:
|
||||
PrimitiveFunction(PrimitiveOpType op, const std::vector<Variable>& inputs, Dictionary&& functionConfig, const std::wstring& functionName, const std::wstring& uid)
|
||||
|
|
|
@ -72,6 +72,7 @@ namespace CNTK
|
|||
NDCG = 60,
|
||||
EditDistanceError = 61,
|
||||
NoOp = 62,
|
||||
LabelsToGraph = 63
|
||||
// New op types should only be appended to the end of this list.
|
||||
// If you append here, also add checks in SerializationTests (CheckEnumValuesNotModified)
|
||||
// and bump up PrimitiveFunction::s_serializationVersion and update PrimitiveFunction::Deserialize
|
||||
|
|
|
@ -87,6 +87,13 @@ namespace CNTK
|
|||
return result;
|
||||
}
|
||||
|
||||
// Returns a copy of this Variable whose m_outputComposite has been reset to nullptr;
// everything else is copied unchanged.
Variable Variable::NonCompositePreservingCopy() const
{
    Variable stripped(*this);
    stripped.m_outputComposite = nullptr;
    return stripped;
}
|
||||
|
||||
void Variable::SetOwner(Function* ownerFunction)
|
||||
{
|
||||
if (Kind() != VariableKind::Output)
|
||||
|
|
|
@ -14,7 +14,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
std::atomic<bool> Globals::m_forceConstantRandomSeed(false);
|
||||
|
||||
std::atomic<bool> Globals::m_enableShareNodeValueMatrices(true);
|
||||
std::atomic<bool> Globals::m_enableHyperCompressMemory(false);
|
||||
std::atomic<bool> Globals::m_optimizeGradientAccumulation(true);
|
||||
|
||||
}}}
|
||||
|
|
|
@ -151,6 +151,8 @@ public:
|
|||
}
|
||||
};
|
||||
|
||||
std::function<std::string(size_t)> m_getKeyById;
|
||||
|
||||
private:
|
||||
typedef map<std::wstring, Input> MapType;
|
||||
MapType inputs;
|
||||
|
|
|
@ -28,15 +28,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
static void SetShareNodeValueMatrices(bool enable) { m_enableShareNodeValueMatrices = enable; }
|
||||
static bool ShouldEnableShareNodeValueMatrices() { return m_enableShareNodeValueMatrices; }
|
||||
|
||||
static void SetHyperCompressMemory(bool enable) { m_enableHyperCompressMemory = enable; }
|
||||
static bool ShouldEnableHyperCompressMemory() { return m_enableHyperCompressMemory; }
|
||||
|
||||
private:
|
||||
static std::atomic<bool> m_forceDeterministicAlgorithms;
|
||||
// The global flag to enable sharing of node value matrices in forward and backward prop
|
||||
static std::atomic<bool> m_enableShareNodeValueMatrices;
|
||||
// The global flag to enable hyper memory compression
|
||||
static std::atomic<bool> m_enableHyperCompressMemory;
|
||||
static std::atomic<bool> m_forceConstantRandomSeed;
|
||||
static std::atomic<bool> m_optimizeGradientAccumulation;
|
||||
};
|
||||
|
|
|
@ -1,14 +1,14 @@
|
|||
//
|
||||
// Copyright (c) Microsoft. All rights reserved.
|
||||
// Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
|
||||
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
|
||||
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#if HAS_MPI
|
||||
// Please see https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Windows#ms-mpi or
|
||||
// https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Linux#open-mpi for setup instructions
|
||||
// of an MPI implementation on your platform.
|
||||
|
||||
#ifdef _MSC_VER
|
||||
// Suppress warning for non-ASCII characters in MS-MPI headers
|
||||
#pragma warning(push)
|
||||
|
@ -18,7 +18,25 @@
|
|||
#else
|
||||
#include "mpi.h"
|
||||
#endif
|
||||
#pragma comment(lib, "msmpi.lib")
|
||||
#else
|
||||
// Note: the following macros/typedefs define some of the MPI related functions and constants such that code
|
||||
// using this functionality will compile cleanly - but will not actually perform the MPI operation.
|
||||
// The clean way to go is to move any code related to mpi into the mpiwrapper class implementation and decide
|
||||
// in this class whether to use mpi.h or not.
|
||||
typedef void *MPI_Comm;
|
||||
typedef enum _MPI_Datatype { MPI_CHAR, MPI_INT, MPI_FLOAT, MPI_DOUBLE, MPI_UNSIGNED, MPI_LONG_LONG_INT } MPI_Datatype;
|
||||
|
||||
#define MPI_IN_PLACE ((void*)(int)-1)
|
||||
#define MPI_SUM ((MPI_Op)0x58000003)
|
||||
|
||||
#define MPI_STATUSES_IGNORE (MPI_Status*)1
|
||||
#define MPI_STATUS_IGNORE (MPI_Status*)1
|
||||
#define MPI_UNDEFINED (-32766)
|
||||
|
||||
typedef int MPI_Op;
|
||||
typedef int MPI_Request;
|
||||
typedef void *MPI_Status;
|
||||
#endif
|
||||
|
||||
#include <errno.h>
|
||||
#include <string>
|
||||
|
@ -28,8 +46,6 @@
|
|||
|
||||
#include "CommonMatrix.h"
|
||||
|
||||
#define FFLUSH_SUCCESS 0
|
||||
|
||||
namespace Microsoft { namespace MSR { namespace CNTK {
|
||||
|
||||
struct MpiFail : public std::string
|
||||
|
@ -40,481 +56,128 @@ struct MpiFail : public std::string
|
|||
}
|
||||
};
|
||||
|
||||
static int operator||(int rc, const MpiFail &what)
|
||||
{
|
||||
if (rc == MPI_SUCCESS)
|
||||
{
|
||||
return rc;
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s, MPI error %d\n", what.c_str(), rc);
|
||||
fflush(stderr);
|
||||
|
||||
// (special case: we use that code to indicate a missing msmpi.dll...)
|
||||
if (rc != MPI_ERR_INTERN)
|
||||
{
|
||||
char errbuf[MPI_MAX_ERROR_STRING + 1] = {0};
|
||||
int len;
|
||||
MPI_Error_string(rc, &errbuf[0], &len);
|
||||
fprintf(stderr, "%s, MPI error %d: %s\n", what.c_str(), rc, errbuf);
|
||||
fflush(stderr);
|
||||
|
||||
// we abort through this, so that the MPI system gets the memo
|
||||
MPI_Abort(MPI_COMM_WORLD, rc);
|
||||
|
||||
// TODO: or does that only signal an issue, and we should still terminate ourselves?
|
||||
// BUGBUG: We'd also need to Abort through the other sub-set communicator
|
||||
}
|
||||
RuntimeError("%s", what.c_str());
|
||||
}
|
||||
extern int operator||(int rc, const MpiFail &what);
|
||||
|
||||
class MPIWrapper;
|
||||
typedef std::shared_ptr<MPIWrapper> MPIWrapperPtr;
|
||||
|
||||
extern "C" void GetMpiWrapper(MPIWrapper **mpi);
|
||||
|
||||
// Note: This is now a pure interface, so please don't add
|
||||
// any functionality to this class.
|
||||
// Instead, make your own implementation class, add/change
|
||||
// functions there as needed and use a private interface to
|
||||
// these functions.
|
||||
// In case you need to add functions that affect all
|
||||
// implementations, add a pure virtual function here and
|
||||
// update any affected implementation.
|
||||
class MPIWrapper : public std::enable_shared_from_this<MPIWrapper>
|
||||
{
|
||||
int m_myRank;
|
||||
std::wstring m_myName;
|
||||
int m_numMPINodes;
|
||||
size_t m_numNodesInUse;
|
||||
bool m_multiHost;
|
||||
|
||||
// MPI communicator that reflects the current subset selection
|
||||
MPI_Comm m_currentComm;
|
||||
|
||||
static MPIWrapperPtr s_mpi;
|
||||
|
||||
// MPI_Init() with delay-loading the msmpi.dll (possibly causing a failure if missing; we want to catch that)
|
||||
int MPI_Init_DL()
|
||||
{
|
||||
#ifdef WIN32
|
||||
__try
|
||||
#endif
|
||||
{
|
||||
// don't initialize if that has been done already
|
||||
int flag = 0;
|
||||
MPI_Initialized(&flag);
|
||||
if (flag)
|
||||
return MPI_SUCCESS;
|
||||
|
||||
int argc = 0;
|
||||
char **argv = NULL;
|
||||
// TODO(qiwye) Multiverso(parameter server) will benefit from MPI_THREAD_MULTIPLE .
|
||||
int requiredThreadLevelSupport = MPI_THREAD_SERIALIZED;
|
||||
int provided;
|
||||
int ret = MPI_Init_thread(&argc, &argv, requiredThreadLevelSupport, &provided);
|
||||
if (provided != requiredThreadLevelSupport)
|
||||
LogicError("Failed to initialize MPI with the desired level of thread support");
|
||||
|
||||
return ret;
|
||||
}
|
||||
#ifdef WIN32
|
||||
__except (EXCEPTION_EXECUTE_HANDLER)
|
||||
{
|
||||
fprintf(stderr, "mpihelper: msmpi.dll missing\n");
|
||||
return MPI_ERR_INTERN;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
// Workaround for the issue with MPI hanging when we have non-0 exit codes from CNTK processes
|
||||
// OpenMPI has a confirmed race condition on killing child process vs. handling their non-zero exit statuses, resulting
|
||||
// in a deadlock, where all processes are killed but MPI is still waiting.
|
||||
// This happens when several perfectly synchronized processes (for example on MPI barrier)
|
||||
// simultaneously exit with non-0 exit code.
|
||||
// As a workaround, we simply sleep 50*rank milliseconds, effectively "de-synchronizing processes" at exit,
|
||||
// allowing MPI to sequentially handle terminations
|
||||
static int s_myRank;
|
||||
static void MPIWorkaroundAtExit()
|
||||
{
|
||||
Sleep(s_myRank * 50);
|
||||
}
|
||||
|
||||
public:
|
||||
MPIWrapper()
|
||||
: m_currentComm(MPI_COMM_WORLD)
|
||||
{
|
||||
static bool initialized = false;
|
||||
if (initialized)
|
||||
{
|
||||
LogicError("MPIWrapper: this is a singleton class that can only be instantiated once per process");
|
||||
}
|
||||
MPIWrapper() {}
|
||||
virtual ~MPIWrapper() {}
|
||||
|
||||
initialized = true;
|
||||
|
||||
if (GetMathLibTraceLevel() > 0)
|
||||
{
|
||||
fprintf(stderr, "MPIWrapper: initializing MPI\n");
|
||||
fflush(stderr);
|
||||
}
|
||||
|
||||
MPI_Init_DL() || MpiFail("mpiaggregator: MPI_Init");
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &m_myRank);
|
||||
MPI_Comm_size(MPI_COMM_WORLD, &m_numMPINodes);
|
||||
m_numNodesInUse = m_numMPINodes;
|
||||
m_multiHost = true;
|
||||
|
||||
// Verify that the environment variable used by GetTotalNumberOfMPINodes()
|
||||
// matches what the MPI API says. There're actually two possible cases:
|
||||
// 1) when we're running with mpiexec both values have to match;
|
||||
// 2) when we're running without mpiexec, the former will return 0, and
|
||||
// the latter will be set to 1.
|
||||
assert((GetTotalNumberOfMPINodes() == 0 && m_numNodesInUse == 1) ||
|
||||
(GetTotalNumberOfMPINodes() == m_numNodesInUse));
|
||||
|
||||
char name[BUFSIZ];
|
||||
int length;
|
||||
MPI_Get_processor_name(name, &length);
|
||||
m_myName = std::wstring(name, name+length);
|
||||
|
||||
// Applying MPI workaround
|
||||
s_myRank = m_myRank;
|
||||
atexit(&MPIWrapper::MPIWorkaroundAtExit);
|
||||
|
||||
// by default we use all of them
|
||||
RequestNodes("MPIWrapper");
|
||||
|
||||
if (GetMathLibTraceLevel() > 0)
|
||||
{
|
||||
if (m_numMPINodes > 1)
|
||||
fprintf(stderr, "mpihelper: we are cog %d in a gearbox of %d\n", (int) m_myRank, (int) m_numMPINodes);
|
||||
else
|
||||
fprintf(stderr, "mpihelper: only one MPI process: MPI operation will be boring\n");
|
||||
|
||||
fflush(stderr);
|
||||
}
|
||||
|
||||
// do an initial handshake
|
||||
Ping("mpihelper");
|
||||
|
||||
// stagger the jobs just a little to get a sort-of deterministic order e.g. in GPU allocation when running on one machine
|
||||
// continue 0.5 seconds apart
|
||||
::Sleep((DWORD)(500 * CurrentNodeRank()));
|
||||
}
|
||||
static MPIWrapperPtr GetInstance(bool create = false);
|
||||
static void DeleteInstance();
|
||||
static MPIWrapperPtr s_mpi;
|
||||
|
||||
// Note that specifically, this function is such that it does not require
|
||||
// MPI initialization. Moreover, it can be used without actually loading any
|
||||
// MPI libs.
|
||||
// TODO: Once we move to dynamic loading for MPI libs on Linux, move it to utilities.
|
||||
static int GetTotalNumberOfMPINodes()
|
||||
{
|
||||
#ifdef WIN32
|
||||
const char* p = std::getenv("PMI_SIZE");
|
||||
#else
|
||||
const char* p = std::getenv("OMPI_COMM_WORLD_SIZE");
|
||||
#endif
|
||||
if (!p)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
return std::stoi(string(p));
|
||||
}
|
||||
}
|
||||
static int GetTotalNumberOfMPINodes();
|
||||
|
||||
// Note: we don't clear the sub-communication here although we should, because in case of a crash, this prevents the EXE from terminating.
|
||||
// It's OK since this class is a singleton anyway that gets instantiated exactly once at program startup.
|
||||
~MPIWrapper()
|
||||
{
|
||||
if (GetMathLibTraceLevel() > 0)
|
||||
{
|
||||
fprintf(stderr, "~MPIWrapper\n");
|
||||
}
|
||||
|
||||
// Do not finalize in event of an exception since calling MPI_Finalize without
|
||||
// all pending communications being finished results in a hang
|
||||
int rc = fflush(stderr);
|
||||
if (!std::uncaught_exception())
|
||||
{
|
||||
if (rc != FFLUSH_SUCCESS)
|
||||
{
|
||||
#ifdef _WIN32
|
||||
RuntimeError("MPIWrapper: Failed to flush stderr, %d", ::GetLastError());
|
||||
#else
|
||||
RuntimeError("MPIWrapper: Failed to flush stderr, %d", errno);
|
||||
#endif
|
||||
}
|
||||
|
||||
MPI_Finalize();
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
void Ping(const char *msg) const
|
||||
{
|
||||
#undef USE2NDCOMM
|
||||
#ifndef USE2NDCOMM
|
||||
if (NumNodesInUse() != m_numMPINodes)
|
||||
{
|
||||
fprintf(stderr, "ping [%s]: cannot be applied to subset (%d) of nodes, skipping\n", msg, (int) NumNodesInUse());
|
||||
fflush(stderr);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
std::array<int, 1> handshake;
|
||||
handshake[0] = 1;
|
||||
|
||||
if (GetMathLibTraceLevel() > 0)
|
||||
{
|
||||
fprintf(stderr, "ping [%s]: %d nodes pinging each other\n", msg, (int) NumNodesInUse());
|
||||
fflush(stderr);
|
||||
}
|
||||
|
||||
AllReduce(handshake);
|
||||
|
||||
if (GetMathLibTraceLevel() > 0)
|
||||
{
|
||||
fprintf(stderr, "ping [%s]: all %d nodes responded\n", msg, handshake[0]);
|
||||
fflush(stderr);
|
||||
}
|
||||
}
|
||||
|
||||
void RequestNodes(const char *msg, size_t requestednodes = SIZE_MAX /*default: all*/)
|
||||
{
|
||||
Ping("requestnodes (before change)");
|
||||
|
||||
// undo current split
|
||||
#ifdef USE2NDCOMM
|
||||
if (m_currentComm != MPI_COMM_WORLD /*no subset*/ && m_currentComm != MPI_COMM_NULL /*idle nodes*/)
|
||||
{
|
||||
fprintf(stderr, "requestnodes: MPI_Comm_free %x\n", (int) m_currentComm);
|
||||
fflush(stderr);
|
||||
MPI_Comm_free(&m_currentComm) || MpiFail("requestnodes: MPI_Comm_free"); // will leave MPI_COMM_NULL here
|
||||
}
|
||||
#endif
|
||||
// reset to MPI_COMM_WORLD
|
||||
m_currentComm = MPI_COMM_WORLD;
|
||||
// create a new split (unless all nodes were requested)
|
||||
if (requestednodes < (size_t) m_numMPINodes)
|
||||
{
|
||||
#ifdef USE2NDCOMM
|
||||
fprintf(stderr, "requestnodes: MPI_Comm_split %d\n", (node() < requestednodes) ? 1 : MPI_UNDEFINED);
|
||||
fflush(stderr);
|
||||
MPI_Comm_split(communicator(), (node() < requestednodes) ? 1 : MPI_UNDEFINED, 0, &m_currentComm) || MpiFail("requestnodes: MPI_Comm_split");
|
||||
fprintf(stderr, "requestnodes: MPI_Comm_split -> %x\n", (int) m_currentComm);
|
||||
fflush(stderr);
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
// leave m_currentComm as MPI_COMM_WORLD
|
||||
// and clip to #nodes
|
||||
requestednodes = m_numMPINodes;
|
||||
}
|
||||
|
||||
m_numNodesInUse = requestednodes;
|
||||
|
||||
if (GetMathLibTraceLevel() > 0)
|
||||
{
|
||||
fprintf(stderr, "requestnodes [%s]: using %d out of %d MPI nodes (%d requested); we (%d) are %s\n",
|
||||
msg, (int) m_numNodesInUse, (int) m_numMPINodes, (int) requestednodes,
|
||||
(int) CurrentNodeRank(), IsIdle() ? "out (idle)" : "in (participating)");
|
||||
fflush(stderr);
|
||||
}
|
||||
Ping("requestnodes (after change)");
|
||||
|
||||
// If all ranks run on a single host, we can enable optimized communication
|
||||
// paths (e.g. NCCL). To determine if a single machine is being used, we
|
||||
// check that MPI_Get_processor_name matches for all ranks.
|
||||
const int nameMax = MPI_MAX_PROCESSOR_NAME + 1;
|
||||
char myName[nameMax] = {0};
|
||||
int myNameLen = 0;
|
||||
MPI_Get_processor_name(myName, &myNameLen) || MpiFail("requestnodes: MPI_Get_processor_name");
|
||||
myName[myNameLen] = '\0';
|
||||
|
||||
std::vector<char> nameBuffer(m_numNodesInUse * nameMax);
|
||||
char* allNames = nameBuffer.data();
|
||||
MPI_Allgather(myName, nameMax, MPI_CHAR, allNames, nameMax, MPI_CHAR, m_currentComm)
|
||||
|| MpiFail("requestnodes: MPI_Allgather");
|
||||
|
||||
m_multiHost = false;
|
||||
for(size_t i=1; i<m_numNodesInUse; i++)
|
||||
{
|
||||
if (strcmp(allNames, allNames+i*nameMax) != 0)
|
||||
{
|
||||
m_multiHost = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
fprintf(stderr, "requestnodes [%s]: using %d out of %d MPI nodes on %s (%d requested); we (%d) are %s\n",
|
||||
msg, (int) m_numNodesInUse, (int) m_numMPINodes, m_multiHost ? "multiple hosts" : "a single host",
|
||||
(int) requestednodes, (int) CurrentNodeRank(), IsIdle() ? "out (idle)" : "in (participating)");
|
||||
fflush(stderr);
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
static MPIWrapperPtr GetInstance(bool create = false)
|
||||
{
|
||||
if (create)
|
||||
{
|
||||
if (s_mpi != nullptr)
|
||||
LogicError("Creating MPIWrapper instance after a GetInstance call has been already made!");
|
||||
else
|
||||
s_mpi = std::make_shared<MPIWrapper>();
|
||||
}
|
||||
|
||||
return s_mpi;
|
||||
}
|
||||
|
||||
static void DeleteInstance()
|
||||
{
|
||||
s_mpi = nullptr;
|
||||
}
|
||||
|
||||
MPI_Comm Communicator() const
|
||||
{
|
||||
return m_currentComm;
|
||||
}
|
||||
size_t NumNodesInUse() const
|
||||
{
|
||||
return m_numNodesInUse;
|
||||
}
|
||||
size_t CurrentNodeRank() const
|
||||
{
|
||||
return m_myRank;
|
||||
}
|
||||
std::wstring CurrentNodeName() const
|
||||
{
|
||||
return m_myName;
|
||||
}
|
||||
bool IsMainNode() const
|
||||
{
|
||||
return m_myRank == 0;
|
||||
} // we are the chosen one--do extra stuff like saving the model to disk
|
||||
bool IsIdle() const
|
||||
{
|
||||
return CurrentNodeRank() >= NumNodesInUse();
|
||||
} // user had requested to not use this many nodes
|
||||
bool UsingAllNodes() const
|
||||
{
|
||||
return NumNodesInUse() == m_numMPINodes;
|
||||
} // all nodes participate (used to check whether we can use MPI_Allreduce directly)
|
||||
size_t MainNodeRank() const
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool IsMultiHost()
|
||||
{
|
||||
return m_multiHost;
|
||||
}
|
||||
virtual size_t NumNodesInUse() const = 0;
|
||||
virtual size_t CurrentNodeRank() const = 0;
|
||||
virtual bool IsMainNode() const = 0;
|
||||
virtual std::wstring CurrentNodeName() const = 0;
|
||||
virtual bool IsIdle() const = 0;
|
||||
virtual bool UsingAllNodes() const = 0;
|
||||
virtual size_t MainNodeRank() const = 0;
|
||||
virtual bool IsMultiHost() const = 0;
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// data-exchange functions (wrappers around MPI functions)
|
||||
// -----------------------------------------------------------------------
|
||||
|
||||
virtual int Finalize(void) = 0;
|
||||
virtual int Wait(MPI_Request* request, MPI_Status* status) = 0;
|
||||
virtual int Waitany(int count, MPI_Request array_of_requests[], int* index, MPI_Status* status) = 0;
|
||||
virtual int Waitall(int count, MPI_Request array_of_requests[], MPI_Status array_of_statuses[]) = 0;
|
||||
virtual int Isend(const void* buf, int count, MPI_Datatype datatype, int dest, int tag, /*MPI_Comm comm,*/ MPI_Request* request) = 0;
|
||||
virtual int Recv(void* buf, int count, MPI_Datatype datatype, int source, int tag, /*MPI_Comm comm,*/ MPI_Status* status) = 0;
|
||||
virtual int Irecv(void* buf, int count, MPI_Datatype datatype, int source, int tag, /*MPI_Comm comm,*/ MPI_Request* request) = 0;
|
||||
virtual int Iallreduce(const void* sendbuf, void* recvbuf, int count, MPI_Datatype datatype, MPI_Op op, /*MPI_Comm comm,*/ MPI_Request* request) = 0;
|
||||
virtual int Abort(int errorcode) = 0;
|
||||
virtual int Error_string(int errorcode, char* string, int* resultlen) = 0;
|
||||
|
||||
|
||||
// helpers to determine the MPI_Datatype of a pointer
|
||||
static MPI_Datatype GetDataType(char *)
|
||||
{
|
||||
return MPI_CHAR;
|
||||
}
|
||||
static MPI_Datatype GetDataType(int *)
|
||||
{
|
||||
return MPI_INT;
|
||||
}
|
||||
static MPI_Datatype GetDataType(float *)
|
||||
{
|
||||
return MPI_FLOAT;
|
||||
}
|
||||
static MPI_Datatype GetDataType(double *)
|
||||
{
|
||||
return MPI_DOUBLE;
|
||||
}
|
||||
static MPI_Datatype GetDataType(size_t *)
|
||||
{
|
||||
return sizeof(size_t) == 4 ? MPI_UNSIGNED : MPI_LONG_LONG_INT;
|
||||
}
|
||||
static MPI_Datatype GetDataType(char *);
|
||||
static MPI_Datatype GetDataType(int *);
|
||||
static MPI_Datatype GetDataType(float *);
|
||||
static MPI_Datatype GetDataType(double *);
|
||||
static MPI_Datatype GetDataType(size_t *);
|
||||
|
||||
// allreduce of a vector
|
||||
template <typename VECTORLIKEOBJECT>
|
||||
void AllReduce(VECTORLIKEOBJECT &accumulator) const
|
||||
{
|
||||
auto *dataptr = accumulator.data();
|
||||
size_t totalnumelements = accumulator.size();
|
||||
|
||||
// use MPI to compute the sum over all elements in (dataptr, totalnumelements) and redistribute to all nodes
|
||||
AllReduce<typename VECTORLIKEOBJECT::value_type>(dataptr, totalnumelements);
|
||||
}
|
||||
virtual void AllReduce(std::vector<size_t>& accumulator) const = 0;
|
||||
virtual void AllReduce(std::vector<int>& accumulator) const = 0;
|
||||
virtual void AllReduce(std::vector<double>& accumulator) const = 0;
|
||||
virtual void AllReduce(std::vector<float>& accumulator) const = 0;
|
||||
|
||||
// for raw pointer
|
||||
template <class ElemType>
|
||||
void AllReduce(ElemType* sendData, size_t numElements, MPI_Op op = MPI_SUM) const
|
||||
{
|
||||
AllReduce<ElemType>(static_cast<ElemType*>(MPI_IN_PLACE), sendData, numElements, op);
|
||||
}
|
||||
virtual void AllReduce(size_t* sendData, size_t numElements, MPI_Op op = MPI_SUM) const = 0;
|
||||
virtual void AllReduce(int* sendData, size_t numElements, MPI_Op op = MPI_SUM) const = 0;
|
||||
virtual void AllReduce(double* sendData, size_t numElements, MPI_Op op = MPI_SUM) const = 0;
|
||||
virtual void AllReduce(float* sendData, size_t numElements, MPI_Op op = MPI_SUM) const = 0;
|
||||
|
||||
template <class ElemType>
|
||||
void AllReduceAsync(ElemType* sendData, size_t numElements, MPI_Request* request, MPI_Op op = MPI_SUM) const
|
||||
{
|
||||
AllReduceAsync<ElemType>(static_cast<ElemType*>(MPI_IN_PLACE), sendData, numElements, request, op);
|
||||
}
|
||||
virtual void AllReduce(size_t* sendData, size_t* receiveData, size_t numElements, MPI_Op op = MPI_SUM) const = 0;
|
||||
virtual void AllReduce(int* sendData, int* receiveData, size_t numElements, MPI_Op op = MPI_SUM) const = 0;
|
||||
virtual void AllReduce(double* sendData, double* receiveData, size_t numElements, MPI_Op op = MPI_SUM) const = 0;
|
||||
virtual void AllReduce(float* sendData, float* receiveData, size_t numElements, MPI_Op op = MPI_SUM) const = 0;
|
||||
|
||||
template <class ElemType>
|
||||
void AllGatherAsync(const ElemType *sendData, size_t numSendElements, ElemType *receiveData, size_t numRecvElements, MPI_Request* request) const
|
||||
{
|
||||
MPI_Iallgather(sendData, (int)numSendElements, GetDataType(receiveData), receiveData, (int)numRecvElements, GetDataType(receiveData), Communicator(), request) || MpiFail("AllReduceAsync: MPI_Iallgather");
|
||||
}
|
||||
virtual void AllReduceAsync(size_t* sendData, size_t numElements, MPI_Request* request, MPI_Op op = MPI_SUM) const = 0;
|
||||
virtual void AllReduceAsync(int* sendData, size_t numElements, MPI_Request* request, MPI_Op op = MPI_SUM) const = 0;
|
||||
virtual void AllReduceAsync(double* sendData, size_t numElements, MPI_Request* request, MPI_Op op = MPI_SUM) const = 0;
|
||||
virtual void AllReduceAsync(float* sendData, size_t numElements, MPI_Request* request, MPI_Op op = MPI_SUM) const = 0;
|
||||
|
||||
template <class ElemType>
|
||||
void AllGather(const ElemType *sendData, size_t numSendElements, ElemType *receiveData, size_t numRecvElements) const
|
||||
{
|
||||
MPI_Allgather(sendData, (int)numSendElements, GetDataType(receiveData), receiveData, (int)numRecvElements, GetDataType(receiveData), Communicator()) || MpiFail("AllReduceAsync: MPI_Allgather");
|
||||
}
|
||||
virtual void AllReduceAsync(size_t* sendData, size_t* receiveData, size_t numElements, MPI_Request* request, MPI_Op op = MPI_SUM) const = 0;
|
||||
virtual void AllReduceAsync(int* sendData, int* receiveData, size_t numElements, MPI_Request* request, MPI_Op op = MPI_SUM) const = 0;
|
||||
virtual void AllReduceAsync(double* sendData, double* receiveData, size_t numElements, MPI_Request* request, MPI_Op op = MPI_SUM) const = 0;
|
||||
virtual void AllReduceAsync(float* sendData, float* receiveData, size_t numElements, MPI_Request* request, MPI_Op op = MPI_SUM) const = 0;
|
||||
|
||||
template <class ElemType>
|
||||
void AllReduceAsync(ElemType *sendData, ElemType *receiveData, size_t numElements, MPI_Request* request, MPI_Op op = MPI_SUM) const
|
||||
{
|
||||
MPI_Iallreduce(sendData, receiveData, (int)numElements, GetDataType(sendData), op, Communicator(), request) || MpiFail("AllReduceAsync: MPI_Iallreduce");
|
||||
}
|
||||
virtual void Bcast(size_t* sendData, size_t numElements, size_t srcRank) = 0;
|
||||
virtual void Bcast(double* sendData, size_t numElements, size_t srcRank) = 0;
|
||||
virtual void Bcast(float* sendData, size_t numElements, size_t srcRank) = 0;
|
||||
virtual void Bcast(void* buffer, int count, MPI_Datatype datatype, int root) = 0;
|
||||
|
||||
template <class ElemType>
|
||||
void AllReduce(ElemType *sendData, ElemType *receiveData, size_t numElements, MPI_Op op = MPI_SUM) const
|
||||
{
|
||||
MPI_Allreduce(sendData, receiveData, (int)numElements, GetDataType(sendData), op, Communicator()) || MpiFail("AllReduce: MPI_Allreduce");
|
||||
}
|
||||
virtual void AllGatherAsync(const size_t *sendData, size_t numSendElements, size_t *receiveData, size_t numRecvElements, MPI_Request* request) const = 0;
|
||||
virtual void AllGatherAsync(const int *sendData, size_t numSendElements, int *receiveData, size_t numRecvElements, MPI_Request* request) const = 0;
|
||||
virtual void AllGatherAsync(const float *sendData, size_t numSendElements, float *receiveData, size_t numRecvElements, MPI_Request* request) const = 0;
|
||||
virtual void AllGatherAsync(const double *sendData, size_t numSendElements, double *receiveData, size_t numRecvElements, MPI_Request* request) const = 0;
|
||||
|
||||
template <class ElemType>
|
||||
void Gather(const ElemType *sendData, size_t numSendElements, ElemType *receiveData, size_t numRecvElements, size_t rootRank) const
|
||||
{
|
||||
MPI_Gather(sendData, (int)numSendElements, GetDataType(receiveData), receiveData, (int)numRecvElements, GetDataType(receiveData), (int)rootRank, Communicator()) || MpiFail("AllReduceAsync: MPI_Gather");
|
||||
}
|
||||
virtual void AllGather(const size_t *sendData, size_t numSendElements, size_t *receiveData, size_t numRecvElements) const = 0;
|
||||
virtual void AllGather(const int *sendData, size_t numSendElements, int *receiveData, size_t numRecvElements) const = 0;
|
||||
virtual void AllGather(const float *sendData, size_t numSendElements, float *receiveData, size_t numRecvElements) const = 0;
|
||||
virtual void AllGather(const double *sendData, size_t numSendElements, double *receiveData, size_t numRecvElements) const = 0;
|
||||
virtual void Allgather(const void* sendbuf, int sendcount, MPI_Datatype sendtype, void* recvbuf, int recvcount, MPI_Datatype recvtype) const = 0;
|
||||
|
||||
template <class ElemType>
|
||||
void Gatherv(const ElemType *sendData, size_t numSendElements, ElemType *receiveData, int recvCounts[], int offsets[], size_t rootRank) const
|
||||
{
|
||||
MPI_Gatherv(sendData, (int)numSendElements, GetDataType(receiveData), receiveData, recvCounts, offsets, GetDataType(receiveData), (int)rootRank, Communicator()) || MpiFail("AllReduceAsync: MPI_Gatherv");
|
||||
}
|
||||
virtual void Gather(const size_t *sendData, size_t numSendElements, size_t *receiveData, size_t numRecvElements, size_t rootRank) const = 0;
|
||||
virtual void Gather(const int *sendData, size_t numSendElements, int *receiveData, size_t numRecvElements, size_t rootRank) const = 0;
|
||||
virtual void Gather(const float *sendData, size_t numSendElements, float *receiveData, size_t numRecvElements, size_t rootRank) const = 0;
|
||||
virtual void Gather(const double *sendData, size_t numSendElements, double *receiveData, size_t numRecvElements, size_t rootRank) const = 0;
|
||||
|
||||
template <class ElemType>
|
||||
void Bcast(ElemType *pData, size_t nData, size_t srcRank)
|
||||
{
|
||||
MPI_Bcast(pData, (int) nData, GetDataType(pData), (int) srcRank, Communicator()) || MpiFail("Bcast: MPI_Bcast");
|
||||
}
|
||||
|
||||
// wait for an async request to finish
|
||||
void Wait(MPI_Request* request)
|
||||
{
|
||||
MPI_Wait(request, MPI_STATUSES_IGNORE) || MpiFail("Wait: MPI_Wait");
|
||||
}
|
||||
|
||||
void WaitAny(MPI_Request* requests, int numRequests, int* index)
|
||||
{
|
||||
MPI_Waitany(numRequests, requests, index, MPI_STATUSES_IGNORE) || MpiFail("WaitAny: MPI_Waitany");
|
||||
}
|
||||
virtual void Gatherv(const size_t *sendData, size_t numSendElements, size_t *receiveData, int recvCounts[], int offsets[], size_t rootRank) const = 0;
|
||||
virtual void Gatherv(const char *sendData, size_t numSendElements, char *receiveData, int recvCounts[], int offsets[], size_t rootRank) const = 0;
|
||||
virtual void Gatherv(const int *sendData, size_t numSendElements, int *receiveData, int recvCounts[], int offsets[], size_t rootRank) const = 0;
|
||||
virtual void Gatherv(const float *sendData, size_t numSendElements, float *receiveData, int recvCounts[], int offsets[], size_t rootRank) const = 0;
|
||||
virtual void Gatherv(const double *sendData, size_t numSendElements, double *receiveData, int recvCounts[], int offsets[], size_t rootRank) const = 0;
|
||||
|
||||
// wait for all ranks to reach here
|
||||
void WaitAll()
|
||||
{
|
||||
MPI_Barrier(m_currentComm) || MpiFail("waitall: MPI_Barrier");
|
||||
}
|
||||
|
||||
void WaitAll(std::vector<MPI_Request>& requests)
|
||||
{
|
||||
MPI_Waitall((int)requests.size(), &requests[0], MPI_STATUSES_IGNORE) || MpiFail("waitall: MPI_Waitall");
|
||||
}
|
||||
virtual int WaitAll() = 0;
|
||||
virtual void WaitAny(MPI_Request* requests, int numRequests, int* index) = 0;
|
||||
virtual void Wait(MPI_Request* request) = 0;
|
||||
virtual int WaitAll(std::vector<MPI_Request>& requests) = 0;
|
||||
};
|
||||
|
||||
}}}
|
||||
|
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -36,15 +36,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
// -----------------------------------------------------------------------
|
||||
|
||||
template <>
|
||||
vector<shared_ptr<Matrix<float>>>& MatrixPool::GetReleasedMatrices<float>()
|
||||
vector<MemRequestInfo<float>>& MatrixPool::GetMemRequestInfoVec<float>()
|
||||
{
|
||||
return m_releasedFloatMatrices;
|
||||
return m_memRequestInfoFloatVec;
|
||||
}
|
||||
|
||||
template <>
|
||||
vector<shared_ptr<Matrix<double>>>& MatrixPool::GetReleasedMatrices<double>()
|
||||
vector<MemRequestInfo<double>>& MatrixPool::GetMemRequestInfoVec<double>()
|
||||
{
|
||||
return m_releasedDoubleMatrices;
|
||||
return m_memRequestInfoDoubleVec;
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
|
@ -463,7 +463,7 @@ bool ComputationNetwork::IsTypicalCriterionNode(ComputationNodeBasePtr nodePtr)
|
|||
nodePtr->OperationName() == OperationNameOf(CrossEntropyNode) ||
|
||||
nodePtr->OperationName() == OperationNameOf(ClassBasedCrossEntropyWithSoftmaxNode) ||
|
||||
nodePtr->OperationName() == OperationNameOf(ClassificationErrorNode) ||
|
||||
nodePtr->OperationName() == OperationNameOf(EditDistanceErrorNode) ||
|
||||
nodePtr->OperationName() == OperationNameOf(ForwardBackwardNode) ||
|
||||
#ifdef COMING_SOON
|
||||
nodePtr->OperationName() == OperationNameOf(CRFNode) ||
|
||||
#endif
|
||||
|
|
|
@ -49,6 +49,7 @@ static shared_ptr<ComputationNode<ElemType>> CreateStandardNode(const std::wstri
|
|||
else if (nodeType == OperationNameOf(CropNode)) return New<CropNode<ElemType>>(forward<_Types>(_Args)...);
|
||||
else if (nodeType == OperationNameOf(CrossEntropyNode)) return New<CrossEntropyNode<ElemType>>(forward<_Types>(_Args)...);
|
||||
else if (nodeType == OperationNameOf(CrossEntropyWithSoftmaxNode)) return New<CrossEntropyWithSoftmaxNode<ElemType>>(forward<_Types>(_Args)...);
|
||||
else if (nodeType == OperationNameOf(ForwardBackwardNode)) return New<ForwardBackwardNode<ElemType>>(forward<_Types>(_Args)...);
|
||||
else if (nodeType == OperationNameOf(DiagonalNode)) return New<DiagonalNode<ElemType>>(forward<_Types>(_Args)...);
|
||||
else if (nodeType == OperationNameOf(DiagTimesNode)) return New<DiagTimesNode<ElemType>>(forward<_Types>(_Args)...);
|
||||
else if (nodeType == OperationNameOf(DropoutNode)) return New<DropoutNode<ElemType>>(forward<_Types>(_Args)...);
|
||||
|
@ -93,6 +94,7 @@ static shared_ptr<ComputationNode<ElemType>> CreateStandardNode(const std::wstri
|
|||
else if (nodeType == OperationNameOf(PerDimMeanVarNormalizationNode)) return New<PerDimMeanVarNormalizationNode<ElemType>>(forward<_Types>(_Args)...);
|
||||
else if (nodeType == OperationNameOf(PerDimMeanVarDeNormalizationNode)) return New<PerDimMeanVarDeNormalizationNode<ElemType>>(forward<_Types>(_Args)...);
|
||||
else if (nodeType == OperationNameOf(PassNode)) return New<PassNode<ElemType>>(forward<_Types>(_Args)...);
|
||||
else if (nodeType == OperationNameOf(LabelsToGraphNode)) return New<LabelsToGraphNode<ElemType>>(forward<_Types>(_Args)...);
|
||||
else if (nodeType == OperationNameOf(PlusNode)) return New<PlusNode<ElemType>>(forward<_Types>(_Args)...);
|
||||
else if (nodeType == OperationNameOf(RandomSampleNode)) return New<RandomSampleNode<ElemType>>(forward<_Types>(_Args)...);
|
||||
else if (nodeType == OperationNameOf(RandomSampleInclusionFrequencyNode)) return New<RandomSampleInclusionFrequencyNode<ElemType>>(forward<_Types>(_Args)...);
|
||||
|
@ -430,9 +432,9 @@ shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Class
|
|||
}
|
||||
|
||||
template <class ElemType>
|
||||
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::EditDistanceError(const ComputationNodePtr a, const ComputationNodePtr b, float subPen, float delPen, float insPen, bool squashInputs, vector<size_t> samplesToIgnore, const std::wstring nodeName)
|
||||
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::EditDistanceError(const ComputationNodePtr a, const ComputationNodePtr b, float subPen, float delPen, float insPen, bool squashInputs, vector<size_t> tokensToIgnore, const std::wstring nodeName)
|
||||
{
|
||||
return net.AddNodeToNetAndAttachInputs(New<EditDistanceErrorNode<ElemType>>(net.GetDeviceId(), subPen, delPen, insPen, squashInputs, samplesToIgnore, nodeName), { a, b });
|
||||
return net.AddNodeToNetAndAttachInputs(New<EditDistanceErrorNode<ElemType>>(net.GetDeviceId(), nodeName, subPen, delPen, insPen, squashInputs, tokensToIgnore), { a, b });
|
||||
}
|
||||
|
||||
template <class ElemType>
|
||||
|
@ -499,6 +501,12 @@ shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Seque
|
|||
return net.AddNodeToNetAndAttachInputs(New<SequenceWithSoftmaxNode<ElemType>>(net.GetDeviceId(), nodeName), { label, prediction, loglikelihood });
|
||||
}
|
||||
|
||||
template <class ElemType>
|
||||
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::ForwardBackward(const ComputationNodePtr label, const ComputationNodePtr prediction, int blankTokenId, int delayConstraint, const std::wstring nodeName)
|
||||
{
|
||||
return net.AddNodeToNetAndAttachInputs(New<ForwardBackwardNode<ElemType>>(net.GetDeviceId(), nodeName, blankTokenId, delayConstraint), { label, prediction });
|
||||
}
|
||||
|
||||
template <class ElemType>
|
||||
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::NoiseContrastiveEstimation(const ComputationNodePtr label, const ComputationNodePtr prediction,
|
||||
const ComputationNodePtr input_weight,
|
||||
|
@ -570,6 +578,12 @@ shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Pass(
|
|||
return net.AddNodeToNetAndAttachInputs(New<PassNode<ElemType>>(net.GetDeviceId(), nodeName), { a });
|
||||
}
|
||||
|
||||
template <class ElemType>
|
||||
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::LabelsToGraph(const ComputationNodePtr a, const std::wstring& nodeName)
|
||||
{
|
||||
return net.AddNodeToNetAndAttachInputs(New<LabelsToGraphNode<ElemType>>(net.GetDeviceId(), nodeName), { a });
|
||||
}
|
||||
|
||||
template <class ElemType>
|
||||
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::DynamicAxis(const ComputationNodePtr a, const std::wstring& nodeName)
|
||||
{
|
||||
|
|
|
@ -126,11 +126,12 @@ public:
|
|||
ComputationNodePtr CosDistance(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");
|
||||
ComputationNodePtr CrossEntropy(const ComputationNodePtr label, const ComputationNodePtr prediction, const std::wstring nodeName = L"");
|
||||
ComputationNodePtr CrossEntropyWithSoftmax(const ComputationNodePtr label, const ComputationNodePtr prediction, const std::wstring nodeName = L"");
|
||||
ComputationNodePtr ForwardBackward(const ComputationNodePtr label, const ComputationNodePtr prediction, int blankTokenId, int delayConstraint, const std::wstring nodeName = L"");
|
||||
ComputationNodePtr DiagTimes(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");
|
||||
ComputationNodePtr Diagonal(const ComputationNodePtr a, const std::wstring nodeName = L"");
|
||||
ComputationNodePtr Dropout(const ComputationNodePtr a, const std::wstring nodeName = L"");
|
||||
ComputationNodePtr DummyCriterion(const ComputationNodePtr objectives, const ComputationNodePtr derivatives, const ComputationNodePtr prediction, const std::wstring nodeName = L"");
|
||||
ComputationNodePtr EditDistanceError(const ComputationNodePtr a, const ComputationNodePtr b, float subPen, float delPen, float insPen, bool squashInputs, vector<size_t> samplesToIgnore, const std::wstring nodeName = L"");
|
||||
ComputationNodePtr EditDistanceError(const ComputationNodePtr a, const ComputationNodePtr b, float subPen, float delPen, float insPen, bool squashInputs, vector<size_t> tokensToIgnore, const std::wstring nodeName = L"");
|
||||
ComputationNodePtr ElementTimes(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");
|
||||
ComputationNodePtr DynamicAxis(const ComputationNodePtr a, const std::wstring& nodeName = L"");
|
||||
ComputationNodePtr ClassificationError(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");
|
||||
|
@ -159,6 +160,7 @@ public:
|
|||
ComputationNodePtr Negate(const ComputationNodePtr a, const std::wstring nodeName = L"");
|
||||
ComputationNodePtr NoiseContrastiveEstimation(const ComputationNodePtr label, const ComputationNodePtr prediction, const ComputationNodePtr input_weight, const ComputationNodePtr input_bias, const std::wstring nodeName = L"", NCEEvalMode mode = NCEEvalMode::None);
|
||||
ComputationNodePtr Pass(const ComputationNodePtr a, const std::wstring& nodeName = L"");
|
||||
ComputationNodePtr LabelsToGraph(const ComputationNodePtr a, const std::wstring& nodeName = L"");
|
||||
ComputationNodePtr PastValue(const ComputationNodePtr a, const float initHiddenActivity, const size_t row_size, size_t timeStep, const std::wstring nodeName = L"");
|
||||
ComputationNodePtr PerDimMeanVarDeNormalization(const ComputationNodePtr feature, const ComputationNodePtr mean, const ComputationNodePtr InvStdDev, const std::wstring nodeName = L"");
|
||||
ComputationNodePtr PerDimMeanVarNormalization(const ComputationNodePtr feature, const ComputationNodePtr mean, const ComputationNodePtr InvStdDev, const std::wstring nodeName = L"");
|
||||
|
|
|
@ -943,31 +943,41 @@ void ComputationNetwork::PrintMemorySharingStructure(const vector<ComputationNod
|
|||
size_t numUnshared = 0;
|
||||
for (const auto& item : memSharingStructure)
|
||||
{
|
||||
if (item.second.size() < 2) // only print actually shared matrices
|
||||
if (item.second.size() < 2) // unshared matrices
|
||||
numUnshared++;
|
||||
else
|
||||
else // shared matrices
|
||||
numShared++;
|
||||
}
|
||||
|
||||
fprintf(stderr, "\nMemory Sharing: Out of %d matrices, %d are shared as %d, and %d are not shared.\n\n", (int)numMatrices, (int)(numMatrices - numUnshared), (int)numShared, (int)numUnshared);
|
||||
fprintf(stderr, "\nMemory Sharing: Out of %d matrices, %d are shared as %d, and %d are not shared.\n", (int)numMatrices, (int)(numMatrices - numUnshared), (int)numShared, (int)numUnshared);
|
||||
|
||||
fprintf(stderr, "\nHere are the ones that share memory:\n");
|
||||
for (const auto& item : memSharingStructure)
|
||||
{
|
||||
if (item.second.size() < 2) // only print actually shared matrices
|
||||
continue;
|
||||
// Format:
|
||||
// { node1
|
||||
// node2 }
|
||||
// { node3
|
||||
// node4
|
||||
// node5 }
|
||||
// where unshared nodes are not printed.
|
||||
const char* delim = "\t{ ";
|
||||
for (const auto& memShareInfo : item.second)
|
||||
if (item.second.size() >= 2)
|
||||
{
|
||||
fprintf(stderr, "%s%ls", delim, memShareInfo.c_str());
|
||||
delim = "\n\t ";
|
||||
// Format:
|
||||
// { node1
|
||||
// node2 }
|
||||
// { node3
|
||||
// node4
|
||||
// node5 }
|
||||
const char* delim = "\t{ ";
|
||||
for (const auto& memShareInfo : item.second)
|
||||
{
|
||||
fprintf(stderr, "%s%ls", delim, memShareInfo.c_str());
|
||||
delim = "\n\t ";
|
||||
}
|
||||
fprintf(stderr, " }\n");
|
||||
}
|
||||
}
|
||||
fprintf(stderr, "\nHere are the ones that don't share memory:\n");
|
||||
for (const auto& item : memSharingStructure)
|
||||
{
|
||||
if (item.second.size() < 2)
|
||||
{
|
||||
fprintf(stderr, "\t{%ls}\n", item.second.begin()->c_str());
|
||||
}
|
||||
fprintf(stderr, " }\n");
|
||||
}
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
@ -1003,7 +1013,7 @@ void ComputationNetwork::AllocateAllMatrices(const std::vector<ComputationNodeBa
|
|||
// Due to special topology, if a node is solely induced by parameters, its function value should not be shared
|
||||
MarkValueNonSharableNodes();
|
||||
|
||||
bool performingBackPropagation = (trainRootNode != nullptr) || (Globals::ShouldEnableHyperCompressMemory());
|
||||
bool performingBackPropagation = (trainRootNode != nullptr);
|
||||
|
||||
// Construct the composite forward prop eval order by enumerating the
|
||||
// nodes corresponding to each of our roots in global eval order
|
||||
|
@ -1062,6 +1072,7 @@ void ComputationNetwork::AllocateAllMatrices(const std::vector<ComputationNodeBa
|
|||
}
|
||||
}
|
||||
|
||||
m_matrixPool.ResetStepCounter();
|
||||
set<ComputationNodeBasePtr> completedEvaluate;
|
||||
for (auto& nodeIter : compositeForwardPropEvalOrder)
|
||||
{
|
||||
|
@ -1127,8 +1138,16 @@ void ComputationNetwork::AllocateAllMatrices(const std::vector<ComputationNodeBa
|
|||
}
|
||||
}
|
||||
|
||||
m_matrixPool.OptimizedMemoryAllocation();
|
||||
m_areMatricesAllocated = true;
|
||||
|
||||
// TO DO: At the time of AllocateAllMatrices we don't know the minibatch size. In theory one may allocate memory again once we start to receive
|
||||
// data from the reader (and the minibatch size is known). For some problems, minibatch size can change constantly, and there needs to be a
|
||||
// tradeoff in deciding how frequent to run optimized memory allocation. For now, we do it only once at the very beginning for speed concerns.
|
||||
|
||||
// TO DO: when some matrices are sparse, the memory size request may be wrong. One may need to call OptimizedMemoryAllocation later again
|
||||
// if the requests of sparse allocation and release are re-processed correctly. Future work.
|
||||
|
||||
// print the memory sharing structure
|
||||
if (TraceLevel() > 0)
|
||||
PrintMemorySharingStructure(GetAllNodes());
|
||||
|
|
|
@ -626,14 +626,16 @@ template <class ElemType>
|
|||
// 'transpose' means print one row per sample (non-transposed is one column per sample).
|
||||
// 'isSparse' will print all non-zero values as one row (non-transposed, which makes sense for one-hot) or column (transposed).
|
||||
template <class ElemType>
|
||||
void ComputationNode<ElemType>::WriteMinibatchWithFormatting(FILE* f, const FrameRange& fr,
|
||||
void ComputationNode<ElemType>::WriteMinibatchWithFormatting(FILE* f,
|
||||
const FrameRange& fr,
|
||||
size_t onlyUpToRow, size_t onlyUpToT, bool transpose, bool isCategoryLabel, bool isSparse,
|
||||
const vector<string>& labelMapping, const string& sequenceSeparator,
|
||||
const string& sequencePrologue, const string& sequenceEpilogue,
|
||||
const string& elementSeparator, const string& sampleSeparator,
|
||||
string valueFormatString,
|
||||
bool outputGradient,
|
||||
bool onlyShowAbsSumForDense) const
|
||||
bool onlyShowAbsSumForDense,
|
||||
std::function<std::string(size_t)> getKeyById) const
|
||||
{
|
||||
// get minibatch matrix -> matData, matRows, matStride
|
||||
const Matrix<ElemType>& outputValues = outputGradient ? Gradient() : Value();
|
||||
|
@ -716,6 +718,8 @@ void ComputationNode<ElemType>::WriteMinibatchWithFormatting(FILE* f, const Fram
|
|||
|
||||
if (s > 0)
|
||||
fprintfOrDie(f, "%s", sequenceSeparator.c_str());
|
||||
if (getKeyById)
|
||||
fprintfOrDie(f, "%s ", getKeyById(seqInfo.seqId).c_str());
|
||||
fprintfOrDie(f, "%s", seqProl.c_str());
|
||||
|
||||
// output it according to our format specification
|
||||
|
|
|
@ -791,8 +791,7 @@ public:
|
|||
void SetOutputNeededDuringBackprop(bool f) { m_outputNeededDuringBackprop = f; }
|
||||
bool IsOutputNeededDuringBackprop() const
|
||||
{
|
||||
return (!Globals::ShouldEnableShareNodeValueMatrices() && !Globals::ShouldEnableHyperCompressMemory())
|
||||
|| m_outputNeededDuringBackprop;
|
||||
return !Globals::ShouldEnableShareNodeValueMatrices() || m_outputNeededDuringBackprop;
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
|
@ -1680,20 +1679,6 @@ public:
|
|||
#endif
|
||||
// tracing
|
||||
Trace();
|
||||
|
||||
// Any memory not needed could resize to zero immediately when HyperCompressMemory active. Since the memory won't really release,
|
||||
// all these memory blocks are gathered into a memory pool. When the next request coming, the best fitting block will be chosen.
|
||||
if (Globals::ShouldEnableHyperCompressMemory())
|
||||
{
|
||||
for (auto& input : GetInputs())
|
||||
{
|
||||
if (!input->IsOutputNeededDuringBackprop() && input->IsValueSharable())
|
||||
{
|
||||
auto inputNodePtr = DownCast(input);
|
||||
inputNodePtr->Value().Resize(0, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
virtual void /*IComputationNode::*/BeginBackprop() override
|
||||
|
@ -1728,9 +1713,9 @@ public:
|
|||
}
|
||||
}
|
||||
|
||||
#ifdef _DEBUG
|
||||
virtual void /*IComputationNode::*/ EndBackprop() override
|
||||
{
|
||||
#ifdef _DEBUG
|
||||
Base::EndBackprop();
|
||||
#ifdef TRACK_GAP_NANS
|
||||
for (size_t i = 0; i < m_inputs.size(); i++)
|
||||
|
@ -1744,18 +1729,8 @@ public:
|
|||
}
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
// We could release the gradient of value sharable nodes and all no-longer used memory generated in forward.
|
||||
if (IsValueSharable() && Globals::ShouldEnableHyperCompressMemory())
|
||||
{
|
||||
if (GradientPtr())
|
||||
Gradient().Resize(0, 0);
|
||||
|
||||
// canceling the graph dependency
|
||||
if (IsOutputNeededDuringBackprop())
|
||||
Value().Resize(0, 0);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
// this is the entry point from Network; while it will call virtual BackpropTo() into the actual node implementation
|
||||
// TODO: move to -Base (or -Network?)
|
||||
|
@ -1816,10 +1791,12 @@ public:
|
|||
}
|
||||
|
||||
// request matrices needed to do node function value evaluation
|
||||
// for memory pool utilization optimization, the requested pointer is not immediately usable until the entire network has gone through all requests
|
||||
virtual void RequestMatricesBeforeForwardProp(MatrixPool& matrixPool) override
|
||||
{
|
||||
size_t matrixSize = m_sampleLayout.GetNumElements();
|
||||
if (IsValueSharable())
|
||||
RequestMatrixFromPool(m_value, matrixPool);
|
||||
RequestMatrixFromPool(m_value, matrixPool, matrixSize, HasMBLayout());
|
||||
else
|
||||
CreateMatrixIfNull(m_value);
|
||||
}
|
||||
|
@ -1844,7 +1821,8 @@ public:
|
|||
// request matrices that are needed for gradient computation
|
||||
virtual void RequestMatricesBeforeBackprop(MatrixPool& matrixPool) override
|
||||
{
|
||||
RequestMatrixFromPool(m_gradient, matrixPool);
|
||||
size_t matrixSize = m_sampleLayout.GetNumElements();
|
||||
RequestMatrixFromPool(m_gradient, matrixPool, matrixSize, HasMBLayout());
|
||||
}
|
||||
|
||||
// release gradient and temp matrices that no longer needed after all the children's gradients are computed.
|
||||
|
@ -1889,18 +1867,20 @@ protected:
|
|||
matrixPtr = make_shared<Matrix<ElemType>>(m_deviceId);
|
||||
}
|
||||
|
||||
void RequestMatrixFromPool(shared_ptr<Matrix<ElemType>>& matrixPtr, MatrixPool& matrixPool)
|
||||
// matrixSize is per sample size, if unknown or hard to estimate, set matrixSize = 0
|
||||
// if the matrix's size will scale with minibatch size, set mbScale = true
|
||||
void RequestMatrixFromPool(shared_ptr<Matrix<ElemType>>& matrixPtr, MatrixPool& matrixPool, size_t matrixSize=0, bool mbScale=false)
|
||||
{
|
||||
if (matrixPtr == nullptr)
|
||||
{
|
||||
matrixPtr = matrixPool.Request<ElemType>(m_deviceId);
|
||||
matrixPool.RequestAllocate<ElemType>(m_deviceId, &matrixPtr, matrixSize, mbScale);
|
||||
}
|
||||
}
|
||||
|
||||
void ReleaseMatrixToPool(shared_ptr<Matrix<ElemType>>& matrixPtr, MatrixPool& matrixPool)
|
||||
{
|
||||
assert(matrixPtr != nullptr);
|
||||
matrixPool.Release<ElemType>(matrixPtr);
|
||||
matrixPool.RequestRelease<ElemType>(&matrixPtr);
|
||||
}
|
||||
|
||||
public:
|
||||
|
@ -1915,7 +1895,8 @@ public:
|
|||
const std::vector<std::string>& labelMapping, const std::string& sequenceSeparator,
|
||||
const std::string& sequencePrologue, const std::string& sequenceEpilogue, const std::string& elementSeparator,
|
||||
const std::string& sampleSeparator, std::string valueFormatString,
|
||||
bool outputGradient = false, bool onlyShowAbsSumForDense = false) const;
|
||||
bool outputGradient = false, bool onlyShowAbsSumForDense = false,
|
||||
std::function<std::string(size_t)> getKeyById = std::function<std::string(size_t)>()) const;
|
||||
|
||||
// simple helper to log the content of a minibatch
|
||||
void DebugLogMinibatch(bool outputGradient = false) const
|
||||
|
|
|
@ -220,7 +220,8 @@ protected:
|
|||
ImageLayoutKind m_imageLayout;
|
||||
|
||||
size_t m_maxTempMemSizeInSamples;
|
||||
shared_ptr<Matrix<ElemType>> m_tempMatrix;
|
||||
shared_ptr<Matrix<ElemType>> m_tempMatrixForward;
|
||||
shared_ptr<Matrix<ElemType>> m_tempMatrixBackward;
|
||||
|
||||
std::unique_ptr<ConvolutionEngine<ElemType>> m_convEng;
|
||||
};
|
||||
|
@ -239,7 +240,8 @@ protected: \
|
|||
using Base::m_transpose; \
|
||||
using Base::m_imageLayout; \
|
||||
using Base::m_maxTempMemSizeInSamples; \
|
||||
using Base::m_tempMatrix; \
|
||||
using Base::m_tempMatrixForward; \
|
||||
using Base::m_tempMatrixBackward; \
|
||||
using Base::m_convEng; \
|
||||
using Base::InferReductionDims; \
|
||||
public:
|
||||
|
@ -351,13 +353,13 @@ public:
|
|||
const Matrix<ElemType>& input0 = InputRef(0).ValueAsMatrix();
|
||||
Matrix<ElemType> sliceInput1Value = InputRef(1).ValueFor(fr);
|
||||
if (!m_transpose)
|
||||
m_convEng->Forward(sliceInput1Value, input0, sliceOutputValue, *m_tempMatrix);
|
||||
m_convEng->Forward(sliceInput1Value, input0, sliceOutputValue, *m_tempMatrixForward);
|
||||
else
|
||||
{
|
||||
// BackwardData adds results to the output so need to zero them out first.
|
||||
// REVIEW alexeyk: should be rolled into BackwardData itself.
|
||||
sliceOutputValue.SetValue(0);
|
||||
m_convEng->BackwardData(sliceInput1Value, input0, sliceOutputValue, /*accumulateGradient =*/ true, *m_tempMatrix);
|
||||
m_convEng->BackwardData(sliceInput1Value, input0, sliceOutputValue, /*accumulateGradient =*/ true, *m_tempMatrixForward);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -369,20 +371,20 @@ public:
|
|||
auto& grad = InputRef(0).GradientAsMatrix();
|
||||
auto sliceInput1Value = InputRef(1).ValueFor(fr);
|
||||
if (!m_transpose)
|
||||
m_convEng->BackwardKernel(sliceOutputGrad, sliceInput1Value, grad, !Input(inputIndex)->ParentOverwritesGradient(), fr.IsAllFrames(), *m_tempMatrix);
|
||||
m_convEng->BackwardKernel(sliceOutputGrad, sliceInput1Value, grad, !Input(inputIndex)->ParentOverwritesGradient(), fr.IsAllFrames(), *m_tempMatrixBackward);
|
||||
else
|
||||
m_convEng->BackwardKernel(sliceInput1Value, sliceOutputGrad, grad, !Input(inputIndex)->ParentOverwritesGradient(), fr.IsAllFrames(), *m_tempMatrix);
|
||||
m_convEng->BackwardKernel(sliceInput1Value, sliceOutputGrad, grad, !Input(inputIndex)->ParentOverwritesGradient(), fr.IsAllFrames(), *m_tempMatrixBackward);
|
||||
}
|
||||
else if (inputIndex == 1) // derivative with respect to the input feature
|
||||
{
|
||||
auto& input0 = InputRef(0).ValueAsMatrix();
|
||||
auto sliceInput1Grad = InputRef(1).GradientFor(fr);
|
||||
if (!m_transpose)
|
||||
m_convEng->BackwardData(sliceOutputGrad, input0, sliceInput1Grad, !Input(inputIndex)->ParentOverwritesGradient(), *m_tempMatrix);
|
||||
m_convEng->BackwardData(sliceOutputGrad, input0, sliceInput1Grad, !Input(inputIndex)->ParentOverwritesGradient(), *m_tempMatrixBackward);
|
||||
else
|
||||
{
|
||||
// REVIEW alexeyk: Forward overwrites values in sliceInput1Grad. Should handle correctly instead.
|
||||
m_convEng->Forward(sliceOutputGrad, input0, sliceInput1Grad, *m_tempMatrix);
|
||||
m_convEng->Forward(sliceOutputGrad, input0, sliceInput1Grad, *m_tempMatrixBackward);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -500,25 +502,26 @@ public:
|
|||
void RequestMatricesBeforeForwardProp(MatrixPool& matrixPool) override
|
||||
{
|
||||
Base::RequestMatricesBeforeForwardProp(matrixPool);
|
||||
RequestMatrixFromPool(m_tempMatrix, matrixPool);
|
||||
RequestMatrixFromPool(m_tempMatrixForward, matrixPool);
|
||||
}
|
||||
|
||||
//void ReleaseMatricesAfterForwardProp(MatrixPool& matrixPool) override
|
||||
//{
|
||||
// Base::ReleaseMatricesAfterForwardProp(matrixPool);
|
||||
// ReleaseMatrixToPool(m_tempMatrix, matrixPool);
|
||||
//}
|
||||
// m_tempMatrixForward is only used as workspace for convolution, we can release it immediately afterwards
|
||||
void ReleaseMatricesAfterForwardProp(MatrixPool& matrixPool) override
|
||||
{
|
||||
Base::ReleaseMatricesAfterForwardProp(matrixPool);
|
||||
ReleaseMatrixToPool(m_tempMatrixForward, matrixPool);
|
||||
}
|
||||
|
||||
//void RequestMatricesBeforeBackprop(MatrixPool& matrixPool) override
|
||||
//{
|
||||
// Base::RequestMatricesBeforeBackprop(matrixPool);
|
||||
// RequestMatrixFromPool(m_tempMatrix, matrixPool);
|
||||
//}
|
||||
void RequestMatricesBeforeBackprop(MatrixPool& matrixPool) override
|
||||
{
|
||||
Base::RequestMatricesBeforeBackprop(matrixPool);
|
||||
RequestMatrixFromPool(m_tempMatrixBackward, matrixPool);
|
||||
}
|
||||
|
||||
void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool) override
|
||||
{
|
||||
Base::ReleaseMatricesAfterBackprop(matrixPool);
|
||||
ReleaseMatrixToPool(m_tempMatrix, matrixPool);
|
||||
ReleaseMatrixToPool(m_tempMatrixBackward, matrixPool);
|
||||
}
|
||||
|
||||
void SetmMaxTempMemSizeInSamples(const size_t maxTempMemSizeInSamples)
|
||||
|
@ -530,6 +533,8 @@ public:
|
|||
|
||||
bool IsConvolution2D() const { return m_convolution2D; }
|
||||
|
||||
bool OutputUsedInComputingInputNodesGradients() const override { return false; }
|
||||
|
||||
private:
|
||||
using TransformerNode::m_transforms;
|
||||
using ConvolutionNodeBase<ElemType>::ComputeFilterTransform;
|
||||
|
@ -600,9 +605,12 @@ public:
|
|||
void RequestMatricesBeforeForwardProp(MatrixPool& matrixPool) override
|
||||
{
|
||||
Base::RequestMatricesBeforeForwardProp(matrixPool);
|
||||
RequestMatrixFromPool(m_tempMatrix, matrixPool);
|
||||
size_t matrixSize = m_sampleLayout.GetNumElements();
|
||||
RequestMatrixFromPool(m_tempMatrix, matrixPool, matrixSize, true);
|
||||
}
|
||||
|
||||
// m_tempMatrix cannot be released after Forward Prop because its content (argmax) is used for back prop.
|
||||
|
||||
void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool) override
|
||||
{
|
||||
Base::ReleaseMatricesAfterBackprop(matrixPool);
|
||||
|
|
|
@ -461,7 +461,7 @@ template class NDCG1EvalNode<double>;
|
|||
// Edit distance error evaluation node with the option of specifying penalty of substitution, deletion and insertion, as well as squashing the input sequences and ignoring certain samples.
|
||||
// Using the classic DP algorithm as described in https://en.wikipedia.org/wiki/Edit_distance, adjusted to take into account the penalties.
|
||||
//
|
||||
// The node allows to squash sequences of repeating labels and ignore certain labels. For example, if squashInputs is true and samplesToIgnore contains label '-' then
|
||||
// The node allows to squash sequences of repeating labels and ignore certain labels. For example, if squashInputs is true and tokensToIgnore contains label '-' then
|
||||
// given first input sequence as s1="a-ab-" and second as s2="-aa--abb" the edit distance will be computed against s1' = "aab" and s2' = "aab".
|
||||
//
|
||||
// The returned error is computed as: EditDistance(s1,s2) * length(s1') / length(s1)
|
||||
|
@ -480,21 +480,17 @@ public:
|
|||
// delPen - deletion penalty
|
||||
// insPen - insertion penalty
|
||||
// squashInputs - whether to merge sequences of identical samples.
|
||||
// samplesToIgnore - list of samples to ignore during edit distance evaluation
|
||||
EditDistanceErrorNode(DEVICEID_TYPE deviceId, float subPen, float delPen, float insPen, bool squashInputs, std::vector<size_t> samplesToIgnore, const wstring & name)
|
||||
: Base(deviceId, name), m_subPen(subPen), m_delPen(delPen), m_insPen(insPen), m_squashInputs(squashInputs), m_SamplesToIgnore(samplesToIgnore)
|
||||
// tokensToIgnore - list of tokens to ignore during edit distance evaluation
|
||||
EditDistanceErrorNode(DEVICEID_TYPE deviceId, const wstring & name, float subPen = 0.0f, float delPen = 0.0f, float insPen = 0.0f, bool squashInputs = false, vector<size_t> tokensToIgnore = {})
|
||||
: Base(deviceId, name), m_SubPen(subPen), m_DelPen(delPen), m_InsPen(insPen), m_SquashInputs(squashInputs), m_tokensToIgnore(tokensToIgnore)
|
||||
{
|
||||
}
|
||||
|
||||
EditDistanceErrorNode(const ScriptableObjects::IConfigRecordPtr configp)
|
||||
: EditDistanceErrorNode(configp->Get(L"deviceId"), configp->Get(L"subPen"), configp->Get(L"delPen"), configp->Get(L"insPen"), configp->Get(L"squashInputs"), configp->Get(L"samplesToIgnore"), L"<placeholder>")
|
||||
: EditDistanceErrorNode(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"subPen"), configp->Get(L"delPen"), configp->Get(L"insPen"), configp->Get(L"squashInputs"), {})
|
||||
{
|
||||
AttachInputsFromConfig(configp, this->GetExpectedNumInputs());
|
||||
}
|
||||
|
||||
EditDistanceErrorNode(DEVICEID_TYPE deviceId, const wstring& name)
|
||||
: Base(deviceId, name)
|
||||
{
|
||||
m_tokensToIgnore = ScriptableObjects::ConfigArray::FlattenedVectorFrom<size_t>(configp->Get(L"tokensToIgnore"));
|
||||
}
|
||||
|
||||
virtual void BackpropToNonLooping(size_t /*inputIndex*/) override
|
||||
|
@ -515,7 +511,7 @@ public:
|
|||
|
||||
MaskMissingColumnsToZero(*m_maxIndexes0, Input(0)->GetMBLayout(), frameRange);
|
||||
MaskMissingColumnsToZero(*m_maxIndexes1, Input(1)->GetMBLayout(), frameRange);
|
||||
Value()(0, 0) = ComputeEditDistanceError(*m_maxIndexes0, *m_maxIndexes1, Input(0)->GetMBLayout(), m_subPen, m_delPen, m_insPen, m_squashInputs, m_SamplesToIgnore);
|
||||
Value()(0, 0) = ComputeEditDistanceError(*m_maxIndexes0, *m_maxIndexes1, Input(0)->GetMBLayout(), m_SubPen, m_DelPen, m_InsPen, m_SquashInputs, m_tokensToIgnore);
|
||||
}
|
||||
|
||||
virtual void Validate(bool isFinalValidationPass) override
|
||||
|
@ -544,11 +540,11 @@ public:
|
|||
node->m_maxIndexes0 = m_maxIndexes0;
|
||||
node->m_maxIndexes1 = m_maxIndexes1;
|
||||
node->m_maxValues = m_maxValues;
|
||||
node->m_squashInputs = m_squashInputs;
|
||||
node->m_subPen = m_subPen;
|
||||
node->m_delPen = m_delPen;
|
||||
node->m_insPen = m_insPen;
|
||||
node->m_SamplesToIgnore = m_SamplesToIgnore;
|
||||
node->m_SquashInputs = m_SquashInputs;
|
||||
node->m_SubPen = m_SubPen;
|
||||
node->m_DelPen = m_DelPen;
|
||||
node->m_InsPen = m_InsPen;
|
||||
node->m_tokensToIgnore = m_tokensToIgnore;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -578,9 +574,9 @@ public:
|
|||
// delPen - deletion penalty
|
||||
// insPen - insertion penalty
|
||||
// squashInputs - whether to merge sequences of identical samples.
|
||||
// samplesToIgnore - list of samples to ignore during edit distance evaluation
|
||||
// tokensToIgnore - list of tokens to ignore during edit distance evaluation
|
||||
static ElemType ComputeEditDistanceError(Matrix<ElemType>& firstSeq, const Matrix<ElemType> & secondSeq, MBLayoutPtr pMBLayout,
|
||||
float subPen, float delPen, float insPen, bool squashInputs, const vector<size_t>& samplesToIgnore)
|
||||
float subPen, float delPen, float insPen, bool squashInputs, const vector<size_t>& tokensToIgnore)
|
||||
{
|
||||
std::vector<int> firstSeqVec, secondSeqVec;
|
||||
|
||||
|
@ -614,8 +610,8 @@ public:
|
|||
|
||||
auto columnIndices = pMBLayout->GetColumnIndices(sequence);
|
||||
|
||||
ExtractSampleSequence(firstSeq, columnIndices, squashInputs, samplesToIgnore, firstSeqVec);
|
||||
ExtractSampleSequence(secondSeq, columnIndices, squashInputs, samplesToIgnore, secondSeqVec);
|
||||
ExtractSampleSequence(firstSeq, columnIndices, squashInputs, tokensToIgnore, firstSeqVec);
|
||||
ExtractSampleSequence(secondSeq, columnIndices, squashInputs, tokensToIgnore, secondSeqVec);
|
||||
|
||||
//calculate edit distance
|
||||
size_t firstSize = firstSeqVec.size();
|
||||
|
@ -690,29 +686,29 @@ public:
|
|||
return (ElemType)(wrongSampleNum * totalframeNum / totalSampleNum);
|
||||
}
|
||||
|
||||
float SubstitutionPenalty() const { return m_subPen; }
|
||||
float DeletionPenalty() const { return m_delPen; }
|
||||
float InsertionPenalty() const { return m_insPen; }
|
||||
bool SquashInputs() const { return m_squashInputs; }
|
||||
std::vector<size_t> SamplesToIgnore() const { return m_SamplesToIgnore; }
|
||||
float SubstitutionPenalty() const { return m_SubPen; }
|
||||
float DeletionPenalty() const { return m_DelPen; }
|
||||
float InsertionPenalty() const { return m_InsPen; }
|
||||
bool SquashInputs() const { return m_SquashInputs; }
|
||||
std::vector<size_t> TokensToIgnore() const { return m_tokensToIgnore; }
|
||||
|
||||
private:
|
||||
shared_ptr<Matrix<ElemType>> m_maxIndexes0, m_maxIndexes1;
|
||||
shared_ptr<Matrix<ElemType>> m_maxValues;
|
||||
bool m_squashInputs;
|
||||
float m_subPen;
|
||||
float m_delPen;
|
||||
float m_insPen;
|
||||
std::vector<size_t> m_SamplesToIgnore;
|
||||
bool m_SquashInputs;
|
||||
float m_SubPen;
|
||||
float m_DelPen;
|
||||
float m_InsPen;
|
||||
std::vector<size_t> m_tokensToIgnore;
|
||||
|
||||
// Clear out_SampleSeqVec and extract a vector of samples from the matrix into out_SampleSeqVec.
|
||||
static void ExtractSampleSequence(const Matrix<ElemType>& firstSeq, vector<size_t>& columnIndices, bool squashInputs, const vector<size_t>& samplesToIgnore, std::vector<int>& out_SampleSeqVec)
|
||||
static void ExtractSampleSequence(const Matrix<ElemType>& firstSeq, vector<size_t>& columnIndices, bool squashInputs, const vector<size_t>& tokensToIgnore, std::vector<int>& out_SampleSeqVec)
|
||||
{
|
||||
out_SampleSeqVec.clear();
|
||||
|
||||
// Get the first element in the sequence
|
||||
size_t lastId = (int)firstSeq(0, columnIndices[0]);
|
||||
if (std::find(samplesToIgnore.begin(), samplesToIgnore.end(), lastId) == samplesToIgnore.end())
|
||||
if (std::find(tokensToIgnore.begin(), tokensToIgnore.end(), lastId) == tokensToIgnore.end())
|
||||
out_SampleSeqVec.push_back(lastId);
|
||||
|
||||
// Remaining elements
|
||||
|
@ -725,7 +721,7 @@ private:
|
|||
if (lastId != refId)
|
||||
{
|
||||
lastId = refId;
|
||||
if (std::find(samplesToIgnore.begin(), samplesToIgnore.end(), refId) == samplesToIgnore.end())
|
||||
if (std::find(tokensToIgnore.begin(), tokensToIgnore.end(), refId) == tokensToIgnore.end())
|
||||
out_SampleSeqVec.push_back(refId);
|
||||
}
|
||||
}
|
||||
|
@ -735,7 +731,7 @@ private:
|
|||
for (size_t i = 1; i < columnIndices.size(); i++)
|
||||
{
|
||||
auto refId = (int)firstSeq(0, columnIndices[i]);
|
||||
if (std::find(samplesToIgnore.begin(), samplesToIgnore.end(), refId) == samplesToIgnore.end())
|
||||
if (std::find(tokensToIgnore.begin(), tokensToIgnore.end(), refId) == tokensToIgnore.end())
|
||||
out_SampleSeqVec.push_back(refId);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -8,6 +8,8 @@
|
|||
#include <string>
|
||||
#include <stdexcept>
|
||||
#include <vector>
|
||||
#include <set>
|
||||
#include <utility>
|
||||
#include <algorithm>
|
||||
#include <stdlib.h>
|
||||
|
||||
|
@ -17,59 +19,238 @@
|
|||
|
||||
namespace Microsoft { namespace MSR { namespace CNTK {
|
||||
|
||||
template <class ElemType>
|
||||
struct MemRequestInfo
|
||||
{
|
||||
DEVICEID_TYPE deviceId; // which device to allocate data
|
||||
shared_ptr<Matrix<ElemType>>*pMatrixPtr; // memory pointer
|
||||
size_t matrixSize; // memory size
|
||||
bool mbScale; // whether the memory shall be scaled by minibatch size
|
||||
int allocStep; // at what step counter memory allocation is requested
|
||||
int releaseStep; // at what step counter memory release is requested
|
||||
int memoryId; // integer indexing the memory buffer ID
|
||||
MemRequestInfo(DEVICEID_TYPE deviceId, shared_ptr<Matrix<ElemType>>*pMatrixPtr, size_t matrixSize, bool mbScale, int allocStep)
|
||||
:deviceId(deviceId), pMatrixPtr(pMatrixPtr), matrixSize(matrixSize), mbScale(mbScale), allocStep(allocStep), releaseStep(INT_MAX), memoryId(-1)
|
||||
{
|
||||
}
|
||||
void SetReleaseStep(int step) { releaseStep = step; }
|
||||
void SetMemoryId(int id) { memoryId = id; }
|
||||
};
|
||||
|
||||
template <class ElemType>
|
||||
struct greater_than_mem_req_size
|
||||
{
|
||||
inline bool operator() (const MemRequestInfo<ElemType>& info1, const MemRequestInfo<ElemType>& info2)
|
||||
{
|
||||
return (info1.matrixSize > info2.matrixSize);
|
||||
}
|
||||
};
|
||||
|
||||
// MemAllocInfo -- bookkeeping for one memory buffer while requests are being assigned to buffers.
struct MemAllocInfo
{
    int memoryId;                     // ID of the buffer
    size_t memorySize;                // size recorded for the buffer
    vector<pair<int, int>> occupancy; // (first step, last step) intervals during which the buffer is taken

    // occ is taken by value and moved into place, so callers passing a temporary pay for no copy at all
    // and callers passing an lvalue pay for exactly one.
    MemAllocInfo(int memoryId, size_t memorySize, vector<pair<int, int>> occ)
        : memoryId(memoryId), memorySize(memorySize), occupancy(std::move(occ))
    {
    }
};
|
||||
|
||||
// MatrixPool -- class to support memory sharing
|
||||
// Despite the rather general name of this class, it is specifically designed to support the memory sharing of ComputationNodes.
|
||||
// Note: see #define SUPRESS_MEMSHARING below as for how to temporarily disable memory sharing altogether, for debugging
|
||||
class MatrixPool
|
||||
{
|
||||
vector<shared_ptr<Matrix<float>>> m_releasedFloatMatrices;
|
||||
vector<shared_ptr<Matrix<double>>> m_releasedDoubleMatrices;
|
||||
vector<MemRequestInfo<float>> m_memRequestInfoFloatVec;
|
||||
vector<MemRequestInfo<double>> m_memRequestInfoDoubleVec;
|
||||
set<DEVICEID_TYPE> m_deviceIDSet;
|
||||
int m_stepCounter;
|
||||
|
||||
template <class ElemType>
|
||||
vector<shared_ptr<Matrix<ElemType>>>& GetReleasedMatrices();
|
||||
vector<MemRequestInfo<ElemType>>& GetMemRequestInfoVec();
|
||||
|
||||
public:
|
||||
// release here means the matrix can be put back and shared by others
|
||||
template <class ElemType>
|
||||
void Release(shared_ptr<Matrix<ElemType>> freeMatrix)
|
||||
{
|
||||
if (freeMatrix == nullptr || freeMatrix->GetMatrixType() == SPARSE)
|
||||
LogicError("MatrixPool::Release: freeMatrix should not be null or sparse.");
|
||||
//#define SUPRESS_MEMSHARING // #define this to disable memory sharing through this structure
|
||||
// TODO: Make this a runtime option.
|
||||
#ifndef SUPRESS_MEMSHARING
|
||||
vector<shared_ptr<Matrix<ElemType>>>& releasedMatrices = GetReleasedMatrices<ElemType>();
|
||||
#ifdef _DEBUG
|
||||
for (int i = 0; i < releasedMatrices.size(); i++)
|
||||
{
|
||||
if (releasedMatrices[i] == freeMatrix)
|
||||
RuntimeError("MatrixPool::Release: freeMatrix is already in the released pool.");
|
||||
}
|
||||
void ResetStepCounter() { m_stepCounter = 0; };
|
||||
|
||||
#endif
|
||||
releasedMatrices.push_back(freeMatrix);
|
||||
#endif
|
||||
template <class ElemType>
|
||||
void RequestRelease(shared_ptr<Matrix<ElemType>> *pMatrixPtr)
|
||||
{
|
||||
vector<MemRequestInfo<ElemType>>& memInfoVec = GetMemRequestInfoVec<ElemType>();
|
||||
// iterate through the vector and find the pointer memInfo
|
||||
for (auto& memInfo : memInfoVec)
|
||||
{
|
||||
if (memInfo.pMatrixPtr == pMatrixPtr)
|
||||
{
|
||||
memInfo.SetReleaseStep(m_stepCounter);
|
||||
break;
|
||||
}
|
||||
}
|
||||
m_stepCounter++;
|
||||
}
|
||||
|
||||
template <class ElemType>
|
||||
shared_ptr<Matrix<ElemType>> Request(DEVICEID_TYPE deviceId)
|
||||
void RequestAllocate(DEVICEID_TYPE deviceId, shared_ptr<Matrix<ElemType>>*pMatrixPtr, size_t matrixSize, bool mbScale)
|
||||
{
|
||||
vector<shared_ptr<Matrix<ElemType>>>& releasedMatrices = GetReleasedMatrices<ElemType>();
|
||||
shared_ptr<Matrix<ElemType>> matrixPtr;
|
||||
if (releasedMatrices.empty())
|
||||
vector<MemRequestInfo<ElemType>>& memInfoVec = GetMemRequestInfoVec<ElemType>();
|
||||
MemRequestInfo<ElemType> memInfo(deviceId, pMatrixPtr, matrixSize, mbScale, m_stepCounter);
|
||||
memInfoVec.push_back(memInfo);
|
||||
m_deviceIDSet.insert(deviceId);
|
||||
m_stepCounter++;
|
||||
|
||||
// assign some temporary pointer, they will be replaced later unless the matrix is sparse
|
||||
*pMatrixPtr = make_shared<Matrix<ElemType>>(deviceId);
|
||||
}
|
||||
|
||||
void OptimizedMemoryAllocation()
|
||||
{
|
||||
// MatrixPool is not templated, so we call both float and double versions here
|
||||
OptimizedMemoryAllocationFunc<float>();
|
||||
OptimizedMemoryAllocationFunc<double>();
|
||||
return;
|
||||
}
|
||||
|
||||
private:
|
||||
// Returns true if the closed interval occ = (first, second) intersects any interval stored in occVec.
// Intervals that merely touch at an end point count as overlapping.
// (A stray assignment and a dangling 'else' left over from the removed Request() method have been dropped.)
bool CheckOverlap(pair<int, int>occ, vector<pair<int, int>>&occVec)
{
    bool bRet = false;
    for (const auto& o : occVec)
    {
        if (occ.first <= o.second && occ.second >= o.first)
        {
            bRet = true;
            break;
        }
    }
//#define SUPRESS_MEMSHARING // #define this to disable memory sharing by always return true
// TODO: Make this a runtime option.
#ifdef SUPRESS_MEMSHARING
    bRet = true;
#endif
    return bRet;
}
|
||||
|
||||
template <class ElemType>
|
||||
void OptimizedMemoryAllocationFunc()
|
||||
{
|
||||
vector<MemRequestInfo<ElemType>>& memInfoVec = GetMemRequestInfoVec<ElemType>();
|
||||
if (memInfoVec.empty())
|
||||
return;
|
||||
|
||||
// remove all requests that has been marked as sparse matrices, those will not participate in memory sharing
|
||||
for (auto iter = memInfoVec.begin(); iter != memInfoVec.end(); )
|
||||
{
|
||||
matrixPtr = releasedMatrices.back();
|
||||
releasedMatrices.pop_back();
|
||||
if ((*(iter->pMatrixPtr))->GetMatrixType() == SPARSE)
|
||||
memInfoVec.erase(iter);
|
||||
else
|
||||
iter++;
|
||||
}
|
||||
|
||||
if (!matrixPtr) // this can't really happen
|
||||
LogicError("MatrixPool::Request: failed to get a valid matrix.");
|
||||
// sort the memory request from largest size to smallest
|
||||
std::sort(memInfoVec.begin(), memInfoVec.end(), greater_than_mem_req_size<ElemType>());
|
||||
|
||||
return matrixPtr;
|
||||
for (auto& devId : m_deviceIDSet)
|
||||
{
|
||||
// memAllocInfoVec is a sorted list of memory allocations from smallest to largest in memory size
|
||||
vector<MemAllocInfo> memAllocInfoVec;
|
||||
int memoryCounter = 0;
|
||||
// we start with memory request that is scalable with minibatch size(usually those require larger memory size)
|
||||
for (auto& memInfo : memInfoVec)
|
||||
{
|
||||
// check if it's the proper device
|
||||
if (memInfo.deviceId != devId || !memInfo.mbScale)
|
||||
continue;
|
||||
|
||||
if (!memAllocInfoVec.empty())
|
||||
{
|
||||
// since we assign from highest memory to lowest, every memory that has been allocated can accommodate the
|
||||
// current memory request, unless there is a conflict (overlap)
|
||||
auto iter = memAllocInfoVec.begin();
|
||||
while (iter != memAllocInfoVec.end() && CheckOverlap(make_pair(memInfo.allocStep, memInfo.releaseStep), iter->occupancy))
|
||||
iter++;
|
||||
if (iter == memAllocInfoVec.end())
|
||||
{
|
||||
// no current memory can be assigned, need to create a new one
|
||||
vector<pair<int, int>> occ;
|
||||
occ.push_back(make_pair(memInfo.allocStep, memInfo.releaseStep));
|
||||
MemAllocInfo ma(memoryCounter, memInfo.matrixSize, occ);
|
||||
// insert in the front of the vector to maintain sorted order
|
||||
memAllocInfoVec.insert(memAllocInfoVec.begin(), ma);
|
||||
memInfo.SetMemoryId(memoryCounter);
|
||||
memoryCounter++;
|
||||
}
|
||||
else
|
||||
{
|
||||
iter->occupancy.push_back(make_pair(memInfo.allocStep, memInfo.releaseStep));
|
||||
memInfo.SetMemoryId(iter->memoryId);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
vector<pair<int, int>> occ;
|
||||
occ.push_back(make_pair(memInfo.allocStep, memInfo.releaseStep));
|
||||
MemAllocInfo ma(memoryCounter, memInfo.matrixSize, occ);
|
||||
memAllocInfoVec.push_back(ma);
|
||||
memInfo.SetMemoryId(memoryCounter);
|
||||
memoryCounter++;
|
||||
}
|
||||
}
|
||||
|
||||
// rescan the request list and this time allocate for those that doesn't depend on minibatch size
|
||||
for (auto& memInfo : memInfoVec)
|
||||
{
|
||||
// check if it's the proper device
|
||||
if (memInfo.deviceId != devId || memInfo.mbScale)
|
||||
continue;
|
||||
|
||||
if (!memAllocInfoVec.empty())
|
||||
{
|
||||
// the memory allocation vector is sorted by size. We find the largest available buffer that doesn't have time overlap
|
||||
auto workingAlloc = memAllocInfoVec.end();
|
||||
for (auto iter = memAllocInfoVec.begin(); iter != memAllocInfoVec.end(); iter++)
|
||||
{
|
||||
if (!CheckOverlap(make_pair(memInfo.allocStep, memInfo.releaseStep), iter->occupancy))
|
||||
workingAlloc = iter;
|
||||
}
|
||||
if (workingAlloc == memAllocInfoVec.end()) // nothing works
|
||||
{
|
||||
vector<pair<int, int>> occ;
|
||||
occ.push_back(make_pair(memInfo.allocStep, memInfo.releaseStep));
|
||||
MemAllocInfo ma(memoryCounter, memInfo.matrixSize, occ);
|
||||
memAllocInfoVec.push_back(ma); // add as the last one
|
||||
memInfo.SetMemoryId(memoryCounter);
|
||||
memoryCounter++;
|
||||
}
|
||||
else
|
||||
{
|
||||
workingAlloc->occupancy.push_back(make_pair(memInfo.allocStep, memInfo.releaseStep));
|
||||
memInfo.SetMemoryId(workingAlloc->memoryId);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
vector<pair<int, int>> occ;
|
||||
occ.push_back(make_pair(memInfo.allocStep, memInfo.releaseStep));
|
||||
MemAllocInfo ma(memoryCounter, memInfo.matrixSize, occ);
|
||||
memAllocInfoVec.push_back(ma);
|
||||
memInfo.SetMemoryId(memoryCounter);
|
||||
memoryCounter++;
|
||||
}
|
||||
}
|
||||
|
||||
// now assign the actual pointers
|
||||
for (int i = 0; i < memoryCounter; i++)
|
||||
{
|
||||
auto matrixPtr = make_shared<Matrix<ElemType>>(devId);
|
||||
if (!matrixPtr) // this can't really happen, because we haven't started allocating memory yet
|
||||
LogicError("MatrixPool: failed to get a valid matrix.");
|
||||
for (auto& memInfo : memInfoVec)
|
||||
{
|
||||
if (memInfo.deviceId == devId && memInfo.memoryId == i)
|
||||
*memInfo.pMatrixPtr = matrixPtr;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
|
|
@ -149,6 +149,7 @@ DeclareUnaryElementWiseWithOpCodeNode(Floor, Floor, None,
|
|||
DeclareUnaryElementWiseWithOpCodeNode(Log, Log, ElementwiseProductWithLogDerivativeFromOutput, binaryWithOutputGradient);
|
||||
DeclareUnaryElementWiseWithOpCodeNode(Negate, Negate, Negate, unaryGradient);
|
||||
DeclareUnaryElementWiseWithOpCodeNode(Pass, Copy, Copy, unaryGradient);
|
||||
DeclareUnaryElementWiseWithOpCodeNode(LabelsToGraph, Copy, Copy, unaryGradient);
|
||||
DeclareUnaryElementWiseWithOpCodeNode(Reciprocal, Reciprocal, ElementwiseProductWithReciprocalDerivative, binaryWithOutputGradient);
|
||||
DeclareUnaryElementWiseWithOpCodeNode(RectifiedLinear, LinearRectifier, ElementwiseProductWithLinearRectifierDerivativeFromOutput, binaryWithOutputGradient);
|
||||
DeclareUnaryElementWiseWithOpCodeNode(Sigmoid, Sigmoid, ElementwiseProductWithSigmoidDerivativeFromOutput, binaryWithOutputGradient);
|
||||
|
|
|
@ -75,11 +75,9 @@ public:
|
|||
ReleaseMatrixToPool(m_transposedOutput, matrixPool);
|
||||
ReleaseMatrixToPool(m_transposedDInput, matrixPool);
|
||||
ReleaseMatrixToPool(m_transposedDOutput, matrixPool);
|
||||
#if 0
|
||||
ReleaseMatrixToPool(m_reserve, matrixPool);
|
||||
ReleaseMatrixToPool(m_workspace, matrixPool);
|
||||
ReleaseMatrixToPool(m_packingIndex, matrixPool);
|
||||
#endif
|
||||
}
|
||||
|
||||
virtual bool OutputUsedInComputingInputNodesGradients() const { return false; }
|
||||
|
|
|
@ -7,6 +7,7 @@
|
|||
#include "Basics.h"
|
||||
#include "ComputationNode.h"
|
||||
#include "gammacalculation.h"
|
||||
#include "NonlinearityNodes.h"
|
||||
|
||||
#include <map>
|
||||
#include <string>
|
||||
|
@ -611,7 +612,7 @@ public:
|
|||
RequestMatrixFromPool(m_gammaFromLattice, matrixPool);
|
||||
}
|
||||
|
||||
// request matrices needed to do node function value evaluation
|
||||
// release gradient and temp matrices that no longer needed after all the children's gradients are computed.
|
||||
virtual void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool)
|
||||
{
|
||||
Base::ReleaseMatricesAfterBackprop(matrixPool);
|
||||
|
@ -722,10 +723,7 @@ public:
|
|||
}
|
||||
}
|
||||
|
||||
virtual bool OutputUsedInComputingInputNodesGradients() const override
|
||||
{
|
||||
return false;
|
||||
}
|
||||
virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; }
|
||||
|
||||
virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override
|
||||
{
|
||||
|
@ -765,4 +763,192 @@ public:
|
|||
template class DummyCriterionNode<float>;
|
||||
template class DummyCriterionNode<double>;
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// ForwardBackwardNode (graph, prediction, delayConstraint)
|
||||
// CTC training criterion, primarily based on the paper "Connectionist Temporal Classification: Labelling Unsegmented
|
||||
// Sequence Data with Recurrent Neural Networks", ftp://ftp.idsia.ch/pub/juergen/icml2006.pdf
|
||||
//
|
||||
// delayConstraint -- label output delay constraint introduced during training that allows a shorter delay during inference. It uses the original time information to enforce that CTC tokens only get aligned within a time margin.
|
||||
// Setting this parameter smaller will result in a shorter delay between label outputs during decoding, yet may hurt accuracy.
|
||||
// delayConstraint=-1 means no constraint
|
||||
// -----------------------------------------------------------------------
|
||||
|
||||
template<class ElemType>
|
||||
class ForwardBackwardNode : public ComputationNodeNonLooping<ElemType>, public NumInputs<2>
|
||||
{
|
||||
typedef ComputationNodeNonLooping<ElemType> Base;
|
||||
UsingComputationNodeMembersBoilerplate;
|
||||
static const std::wstring TypeName()
|
||||
{
|
||||
return L"ForwardBackward";
|
||||
}
|
||||
public:
|
||||
DeclareConstructorFromConfigWithNumInputs(ForwardBackwardNode);
|
||||
ForwardBackwardNode(DEVICEID_TYPE deviceId, const wstring & name, int blankTokenId=INT_MIN, int delayConstraint=-1) :
|
||||
Base(deviceId, name), m_blankTokenId(blankTokenId), m_delayConstraint(delayConstraint)
|
||||
{
|
||||
}
|
||||
|
||||
// Compute gradients to input observations, the weights to the observations, and the class log posterior probabilites
|
||||
virtual void BackpropToNonLooping(size_t inputIndex) override
|
||||
{
|
||||
// Left node must be a scalar
|
||||
if (inputIndex == 0) //left derivative
|
||||
{
|
||||
BackpropToLeft(*m_logSoftmaxOfRight, InputRef(inputIndex).Gradient(), Gradient());
|
||||
}
|
||||
else if (inputIndex == 1)
|
||||
{
|
||||
FrameRange frameRange(InputRef(0).GetMBLayout());
|
||||
BackpropToRight(*m_softmaxOfRight, InputRef(inputIndex).Gradient(), Gradient(), *m_CTCposterior);
|
||||
InputRef(inputIndex).MaskMissingGradientColumnsToZero(frameRange);
|
||||
}
|
||||
else
|
||||
RuntimeError("ForwardBackwardNode criterion expects only two inputs: labels and network output.");
|
||||
}
|
||||
|
||||
void BackpropToLeft(const Matrix<ElemType>& logSoftmaxOfRight, Matrix<ElemType>& inputGradientValues,
|
||||
const Matrix<ElemType>& gradientValues)
|
||||
{
|
||||
#if DUMPOUTPUT
|
||||
logSoftmaxOfRight.Print("ForwardBackwardNode Partial-logSoftmaxOfRight");
|
||||
gradientValues.Print("ForwardBackwardNode Partial-gradientValues");
|
||||
inputGradientValues.Print("ForwardBackwardNode Partial-Left-in");
|
||||
#endif
|
||||
|
||||
Matrix<ElemType>::ScaleAndAdd(-gradientValues.Get00Element(), logSoftmaxOfRight, inputGradientValues);
|
||||
|
||||
#if DUMPOUTPUT
|
||||
inputGradientValues.Print("ForwardBackwardNode Partial-Left-out");
|
||||
#endif
|
||||
}
|
||||
|
||||
void BackpropToRight(const Matrix<ElemType>& softmaxOfRight, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues,
|
||||
const Matrix<ElemType> &CTCposterior)
|
||||
{
|
||||
#if DUMPOUTPUT
|
||||
softmaxOfRight.Print("ForwardBackwardNode Partial-softmaxOfRight");
|
||||
inputFunctionValues.Print("ForwardBackwardNode Partial-inputFunctionValues");
|
||||
gradientValues.Print("ForwardBackwardNode Partial-gradientValues");
|
||||
inputGradientValues.Print("ForwardBackwardNode Partial-Right-in");
|
||||
#endif
|
||||
// inputGradientValues+= gradientValues*(softmaxOfRight - CTCposterior)
|
||||
Matrix<ElemType>::AddScaledDifference(gradientValues, softmaxOfRight, CTCposterior, inputGradientValues);
|
||||
|
||||
#if DUMPOUTPUT
|
||||
inputGradientValues.Print("ForwardBackwardNode Partial-Right");
|
||||
#endif
|
||||
}
|
||||
|
||||
virtual bool OutputUsedInComputingInputNodesGradients() const override
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
virtual void ForwardPropNonLooping() override
|
||||
{
|
||||
m_logSoftmaxOfRight->AssignLogSoftmaxOf(InputRef(1).Value(), true);
|
||||
m_softmaxOfRight->SetValue(*m_logSoftmaxOfRight);
|
||||
m_softmaxOfRight->InplaceExp();
|
||||
|
||||
m_CTCposterior->SwitchToMatrixType(m_softmaxOfRight->GetMatrixType(), m_softmaxOfRight->GetFormat(), false);
|
||||
m_CTCposterior->Resize(m_softmaxOfRight->GetNumRows(), m_softmaxOfRight->GetNumCols());
|
||||
|
||||
FrameRange fr(InputRef(0).GetMBLayout());
|
||||
InputRef(0).ValueFor(fr).VectorMax(*m_maxIndexes, *m_maxValues, true);
|
||||
// compute CTC score
|
||||
m_GammaCal.doCTC(Value(), *m_logSoftmaxOfRight, *m_maxIndexes, *m_maxValues, *m_CTCposterior, InputRef(0).GetMBLayout(), m_blankTokenId, m_delayConstraint);
|
||||
|
||||
#if NANCHECK
|
||||
functionValues.HasNan("ForwardBackwardNode");
|
||||
#endif
|
||||
#if DUMPOUTPUT
|
||||
functionValues.Print("ForwardBackwardNode");
|
||||
#endif
|
||||
}
|
||||
|
||||
virtual void /*ComputationNodeBase::*/Validate(bool isFinalValidationPass) override
|
||||
{
|
||||
Base::Validate(isFinalValidationPass);
|
||||
m_pMBLayout = nullptr; // no layout
|
||||
|
||||
if (isFinalValidationPass)
|
||||
{
|
||||
if (!(Input(0)->GetSampleMatrixNumRows() == Input(1)->GetSampleMatrixNumRows() && // match vector dimension
|
||||
Input(0)->HasMBLayout() &&
|
||||
Input(0)->GetMBLayout() == Input(1)->GetMBLayout()))
|
||||
{
|
||||
LogicError("The Matrix dimension in the ForwardBackwardNode operation does not match.");
|
||||
}
|
||||
|
||||
auto leftNode = dynamic_pointer_cast<LabelsToGraphNode<ElemType>>(Input(0));
|
||||
if (!leftNode)
|
||||
LogicError("ForwardBackwardNode: Please pass LabelsToGraph(labels) for second argument");
|
||||
}
|
||||
|
||||
SetDims(TensorShape(1), false);
|
||||
}
|
||||
|
||||
virtual void CopyTo(const ComputationNodePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const
|
||||
{
|
||||
Base::CopyTo(nodeP, newName, flags);
|
||||
if (flags & CopyNodeFlags::copyNodeValue)
|
||||
{
|
||||
auto node = dynamic_pointer_cast<ForwardBackwardNode<ElemType>>(nodeP);
|
||||
|
||||
node->m_logSoftmaxOfRight->SetValue(*m_logSoftmaxOfRight);
|
||||
node->m_softmaxOfRight->SetValue(*m_softmaxOfRight);
|
||||
node->m_CTCposterior->SetValue(*m_CTCposterior);
|
||||
node->m_maxIndexes->SetValue(*m_maxIndexes);
|
||||
node->m_maxValues->SetValue(*m_maxValues);
|
||||
node->m_delayConstraint = m_delayConstraint;
|
||||
}
|
||||
}
|
||||
|
||||
// request matrices needed to do node function value evaluation
|
||||
virtual void RequestMatricesBeforeForwardProp(MatrixPool& matrixPool)
|
||||
{
|
||||
Base::RequestMatricesBeforeForwardProp(matrixPool);
|
||||
RequestMatrixFromPool(m_logSoftmaxOfRight, matrixPool);
|
||||
RequestMatrixFromPool(m_softmaxOfRight, matrixPool);
|
||||
RequestMatrixFromPool(m_CTCposterior, matrixPool);
|
||||
RequestMatrixFromPool(m_maxIndexes, matrixPool);
|
||||
RequestMatrixFromPool(m_maxValues, matrixPool);
|
||||
}
|
||||
|
||||
virtual void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool)
|
||||
{
|
||||
Base::ReleaseMatricesAfterBackprop(matrixPool);
|
||||
ReleaseMatrixToPool(m_logSoftmaxOfRight, matrixPool);
|
||||
ReleaseMatrixToPool(m_softmaxOfRight, matrixPool);
|
||||
ReleaseMatrixToPool(m_CTCposterior, matrixPool);
|
||||
ReleaseMatrixToPool(m_maxIndexes, matrixPool);
|
||||
ReleaseMatrixToPool(m_maxValues, matrixPool);
|
||||
}
|
||||
|
||||
virtual void UpdateFunctionMBSize() override
|
||||
{
|
||||
Base::UpdateFunctionMBSize();
|
||||
|
||||
size_t cols = Input(0)->Value().GetNumCols();
|
||||
m_maxIndexes->Resize(1, cols);
|
||||
m_maxValues->Resize(1, cols);
|
||||
}
|
||||
|
||||
protected:
|
||||
virtual bool NodeDoesItsOwnCustomizedMissingColumnsMasking() { return true; }
|
||||
shared_ptr<Matrix<ElemType>> m_logSoftmaxOfRight;
|
||||
shared_ptr<Matrix<ElemType>> m_softmaxOfRight;
|
||||
shared_ptr<Matrix<ElemType>> m_CTCposterior;
|
||||
shared_ptr<Matrix<ElemType>> m_maxIndexes;
|
||||
shared_ptr<Matrix<ElemType>> m_maxValues;
|
||||
|
||||
msra::lattices::GammaCalculation<ElemType> m_GammaCal;
|
||||
int m_blankTokenId;
|
||||
int m_delayConstraint;
|
||||
};
|
||||
|
||||
template class ForwardBackwardNode<float>;
|
||||
template class ForwardBackwardNode<double>;
|
||||
|
||||
} } }
|
||||
|
|
|
@ -219,6 +219,14 @@ public:
|
|||
RequestMatrixFromPool(m_softmaxOfRight, matrixPool);
|
||||
}
|
||||
|
||||
// release gradient and temp matrices that no longer needed after all the children's gradients are computed.
|
||||
// Runs the base-class release first, then hands the two temporaries of this node
// (m_logSoftmaxOfRight and m_softmaxOfRight) back to the matrix pool.
virtual void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool)
|
||||
{
|
||||
Base::ReleaseMatricesAfterBackprop(matrixPool);
|
||||
ReleaseMatrixToPool(m_logSoftmaxOfRight, matrixPool);
|
||||
ReleaseMatrixToPool(m_softmaxOfRight, matrixPool);
|
||||
}
|
||||
|
||||
protected:
|
||||
shared_ptr<Matrix<ElemType>> m_logSoftmaxOfRight;
|
||||
shared_ptr<Matrix<ElemType>> m_softmaxOfRight;
|
||||
|
|
|
@ -41,7 +41,6 @@ void CNTKEvalBase<ElemType>::Init(const std::string& config)
|
|||
CPUMatrix<ElemType>::SetNumThreads(nThreads);
|
||||
|
||||
Globals::SetShareNodeValueMatrices(m_config(L"shareNodeValueMatrices", true));
|
||||
Globals::SetHyperCompressMemory(m_config(L"hyperCompressMemory", false));
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -60,8 +60,8 @@
|
|||
</ClCompile>
|
||||
<Link>
|
||||
<AdditionalLibraryDirectories>$(OutDir)</AdditionalLibraryDirectories>
|
||||
<AdditionalDependencies>EvalDLL.lib;Common.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||
<DelayLoadDLLs>EvalDll.dll</DelayLoadDLLs>
|
||||
<AdditionalDependencies>EvalDLL.lib;Math.lib;Common.lib;$(MSMPI_LIB64)msmpi.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||
<DelayLoadDLLs>EvalDll.dll;Math.dll</DelayLoadDLLs>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="$(DebugBuild)">
|
||||
|
|
|
@ -5873,6 +5873,166 @@ void CPUMatrix<ElemType>::RCRFBackwardCompute(const CPUMatrix<ElemType>& alpha,
|
|||
}
|
||||
};
|
||||
|
||||
template<class ElemType>
|
||||
CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignCTCScore(
|
||||
const CPUMatrix<ElemType>& prob, CPUMatrix<ElemType>& alpha, CPUMatrix<ElemType>& beta,
|
||||
const CPUMatrix<ElemType>& phoneSeq, const CPUMatrix<ElemType>& phoneBoundary, ElemType &totalScore, const std::vector<size_t>& uttMap, const std::vector<size_t> & uttBeginFrame, const std::vector<size_t> & uttFrameNum,
|
||||
const std::vector<size_t> & uttPhoneNum, const size_t samplesInRecurrentStep, const size_t maxFrameNum, const int delayConstraint, const bool isColWise)
|
||||
{
|
||||
// Column wise representation of sequences in input matrices (each column is one sequence/utterance)
|
||||
if (isColWise)
|
||||
{
|
||||
vector<size_t> curPhoneSeq;
|
||||
|
||||
auto &us = *this;
|
||||
size_t s, s2;
|
||||
size_t senoneid, t;
|
||||
ElemType ascore;
|
||||
double x, y;
|
||||
size_t senonenum, frameNum;
|
||||
|
||||
for (size_t uttId = 0;uttId < uttFrameNum.size(); uttId++) {
|
||||
senonenum = uttPhoneNum[uttId];
|
||||
frameNum = uttFrameNum[uttId];
|
||||
|
||||
// Populate utterance
|
||||
// Using loop instead of memcpy for clarity
|
||||
curPhoneSeq.reserve(senonenum);
|
||||
for (size_t i =0;i < senonenum;i++)
|
||||
curPhoneSeq.push_back(phoneSeq(i, uttId));
|
||||
|
||||
if (frameNum > 1)
|
||||
{
|
||||
//initialize alpha
|
||||
for (s = 1; s < 3; s++)
|
||||
{
|
||||
senoneid = curPhoneSeq[s];
|
||||
alpha(s, 0) = prob(senoneid, 0);
|
||||
}
|
||||
alpha(senonenum - 1, 0) = LZERO;
|
||||
//initialize beta
|
||||
for (s = senonenum - 3; s < senonenum - 1; s++)
|
||||
{
|
||||
senoneid = curPhoneSeq[s];
|
||||
beta(s, frameNum - 1) = prob(senoneid, frameNum - 1);
|
||||
}
|
||||
beta(senonenum - 1, frameNum - 1) = LZERO;
|
||||
|
||||
//cal alpha
|
||||
for (t = 1; t < frameNum; t++)
|
||||
{
|
||||
for (s = 1; s < senonenum - 1; s++)
|
||||
{
|
||||
senoneid = curPhoneSeq[s];
|
||||
x = LZERO;
|
||||
for (s2 = s - 1; s2 <= s; s2++)
|
||||
{
|
||||
if (s2 > 0)
|
||||
{
|
||||
y = alpha(s2, t - 1);
|
||||
x = LogAddD(x, y);
|
||||
}
|
||||
}
|
||||
|
||||
if (senoneid != prob.GetNumRows() - 1 && s - 2 > 0 && senoneid != curPhoneSeq[s - 2])
|
||||
{
|
||||
y = alpha(s - 2, t - 1);
|
||||
x = LogAddD(x, y);
|
||||
}
|
||||
if (senoneid != SIZE_MAX)
|
||||
ascore = prob(senoneid, t);
|
||||
else
|
||||
ascore = 0;
|
||||
alpha(s, t) = (float)x + ascore;
|
||||
}
|
||||
|
||||
}
|
||||
//exit senone
|
||||
x = LZERO;
|
||||
for (s2 = senonenum - 3; s2 < senonenum - 1; s2++)
|
||||
{
|
||||
y = alpha(s2, frameNum - 1);
|
||||
x = LogAddD(x, y);
|
||||
}
|
||||
alpha(senonenum - 1, frameNum - 1) = (float)x;
|
||||
|
||||
totalScore = -alpha(senonenum - 1, frameNum - 1);
|
||||
|
||||
//cal beta
|
||||
for (t = frameNum - 2; t >= 0; t--)
|
||||
{
|
||||
|
||||
for (s = 1; s < senonenum - 1; s++)
|
||||
{
|
||||
senoneid = curPhoneSeq[s];
|
||||
x = LZERO;
|
||||
for (s2 = s; s2 <= s + 1; s2++)
|
||||
{
|
||||
if (s2 < senonenum - 1)
|
||||
{
|
||||
y = beta(s2, t + 1);
|
||||
x = LogAddD(x, y);
|
||||
}
|
||||
}
|
||||
if (senoneid != prob.GetNumRows() - 1 && s + 2 < senonenum - 1 && senoneid != curPhoneSeq[s + 2])
|
||||
{
|
||||
y = beta(s + 2, t + 1);
|
||||
x = LogAddD(x, y);
|
||||
}
|
||||
|
||||
if (senoneid != SIZE_MAX)
|
||||
ascore = prob(senoneid, t);
|
||||
else
|
||||
ascore = 0;
|
||||
beta(s, t) = (float)x + ascore;
|
||||
|
||||
}
|
||||
if (t == 0)
|
||||
break;
|
||||
}
|
||||
//entry senone
|
||||
x = LZERO;
|
||||
for (s2 = 1; s2 < 3; s2++)
|
||||
{
|
||||
y = beta(s2, 0);
|
||||
x = LogAddD(x, y);
|
||||
}
|
||||
beta(0, 0) = (float)x;
|
||||
for (t = 0; t < frameNum; t++)
|
||||
{
|
||||
//cal zt
|
||||
double Zt = LZERO;
|
||||
for (s = 1; s < senonenum - 1; s++)
|
||||
{
|
||||
senoneid = curPhoneSeq[s];
|
||||
Zt = LogAddD(Zt, (alpha(s, t) + beta(s, t) - prob(senoneid, t)));
|
||||
}
|
||||
|
||||
for (s = 1; s < senonenum - 1; s++)
|
||||
{
|
||||
senoneid = curPhoneSeq[s];
|
||||
if (senoneid != SIZE_MAX)
|
||||
{
|
||||
ElemType logoccu = alpha(s, t) + beta(s, t) - prob(senoneid, t) - (float)Zt;
|
||||
if (logoccu < LOG_OF_EPS_IN_LOG)
|
||||
us(senoneid, t) += 0.0f;
|
||||
else
|
||||
us(senoneid, t) += exp(logoccu);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return *this;
|
||||
|
||||
}
|
||||
else {
|
||||
LogicError("Only ColWise minibatch layout is supported.");
|
||||
}
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
||||
/// the kernel function for RCRF backward computation
|
||||
template <class ElemType>
|
||||
void CPUMatrix<ElemType>::_rcrfBackwardCompute(size_t t, size_t k, const CPUMatrix<ElemType>& alpha,
|
||||
|
|
|
@ -231,6 +231,7 @@ public:
|
|||
// sequence training
|
||||
CPUMatrix<ElemType>& DropFrame(const CPUMatrix<ElemType>& label, const CPUMatrix<ElemType>& gamma, const ElemType& threshhold);
|
||||
CPUMatrix<ElemType>& AssignSequenceError(const ElemType hsmoothingWeight, const CPUMatrix<ElemType>& label, const CPUMatrix<ElemType>& dnnoutput, const CPUMatrix<ElemType>& gamma, ElemType alpha);
|
||||
CPUMatrix<ElemType>& AssignCTCScore(const CPUMatrix<ElemType>& prob, CPUMatrix<ElemType>& alpha, CPUMatrix<ElemType>& beta, const CPUMatrix<ElemType>& phoneSeq, const CPUMatrix<ElemType>& phoneBoundary, ElemType &totalScore, const vector<size_t>& uttMap, const vector<size_t> & uttBeginFrame, const vector<size_t> & uttFrameNum, const vector<size_t> & uttPhoneNum, const size_t samplesInRecurrentStep, const size_t maxFrameNum, const int delayConstraint, const bool isColWise);
|
||||
CPUMatrix<ElemType>& InplaceSqrt();
|
||||
CPUMatrix<ElemType>& AssignSqrtOf(const CPUMatrix<ElemType>& a);
|
||||
|
||||
|
|
|
@ -41,8 +41,6 @@ typedef unsigned char byte;
|
|||
#define GPUSPARSE_INDEX_TYPE int // cuSparse only supports int array indexes
|
||||
#define CPUSPARSE_INDEX_TYPE int // to be consistent with cuSparse but limited the possible size of the matrix.
|
||||
|
||||
#define MEM_MAX_LIMIT_TIMES 2 // The maximum times allowed a cached memory block allocated to a request
|
||||
|
||||
namespace Microsoft { namespace MSR { namespace CNTK {
|
||||
|
||||
MATH_API void SetMathLibTraceLevel(int traceLevel);
|
||||
|
@ -214,158 +212,6 @@ enum MatrixFlags
|
|||
matrixFlagSetValueOnDevice = 1 << bitPosSetValueOnDevice, // SetValue() call has a buffer that is already on the device
|
||||
};
|
||||
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// BufferManagement -- to control the allocation and release of memory
|
||||
//
|
||||
// 1. The goal of buffer management
|
||||
// The best way to save memory is releasing memory right after no longer used in the rest of the mini-batch, which makes
|
||||
// the extra cost on memory operation and slows down the speed. An option to solve that is building the static link between
|
||||
// all nodes in pre-computing process and making memory re-use in the runtime, known as shared node value matrices in CNTK.
|
||||
// The other option is using a buffer pool to take over the allocation and release request. Whereas the physical operation on
|
||||
// memory, logical operation will make nearly no cost on allocation or release. Since the second option, achieved as
|
||||
// BufferManagement below, could control all the memory operation, including some trivial ones, like the workspace in convolutions,
|
||||
// and more flexible, allocating based on size and being easy to implement new algorithm, it is usually more powerful than the
|
||||
// first method.
|
||||
// 2. How it works?
|
||||
// First, it should be called in Resize function. In Resize function, using Request and LogicalReleaseFunction to replace the original
|
||||
// request and release functions. Since BufferManagement is singleton for deviceId, just call the GetManagementInstance. And in Resize,
|
||||
// there is a flag named growthOnly, which will request only the size increases to save the allocation cost. In the case, since the
|
||||
// buffer pool, nearly no cost on allocation, the growth only will be disable in BufferManagement mode.
|
||||
// -----------------------------------------------------------------------
|
||||
class BufferManagement
|
||||
{
|
||||
private:
|
||||
BufferManagement() = default;
|
||||
|
||||
// Disable all the copy & move functions to keep the instance safely
|
||||
DISABLE_COPY_AND_MOVE(BufferManagement);
|
||||
|
||||
public:
|
||||
// Returns the BufferManagement instance registered for deviceId, creating and initializing it on first use.
// The lock is taken before m_instances is touched at all: the previous version searched the map without
// the lock and, once it held the lock, re-tested the stale iterator instead of searching again, so two
// threads could race on the map and a late-comer could reset the counters of an instance already in use.
static BufferManagement& GetManagerInstance(DEVICEID_TYPE deviceId)
{
    static std::mutex instancLock;
    std::lock_guard<std::mutex> lock(instancLock);

    auto instance = m_instances.find(deviceId);
    if (instance == m_instances.end())
    {
        instance = m_instances.insert(std::make_pair(deviceId, std::unique_ptr<BufferManagement>(
            new BufferManagement()))).first;
        instance->second->m_deviceId = deviceId;
        instance->second->m_totalManageSize = 0;
        instance->second->m_totalAllocSize = 0;
    }
    return *(instance->second);
}
|
||||
|
||||
// for requesting, find in buffer container first, if failed, allocate a new one
|
||||
// if allocating from buffer, the size will be modified to the real buffer size
|
||||
template<class ElemType>
|
||||
ElemType* RequestBuffer(size_t& size)
|
||||
{
|
||||
ElemType* bufferPtr = nullptr;
|
||||
auto& bufferContainer = BufferContainer<ElemType>();
|
||||
|
||||
// simply allocating based on size, more efficient and complex algorithm could be implemented here
|
||||
auto bufferHint = bufferContainer.lower_bound(size);
|
||||
if (bufferHint != bufferContainer.end() && bufferHint->first < size * MEM_MAX_LIMIT_TIMES)
|
||||
{
|
||||
bufferPtr = bufferHint->second;
|
||||
size = bufferHint->first;
|
||||
m_totalManageSize -= size;
|
||||
bufferContainer.erase(bufferHint);
|
||||
return bufferPtr;
|
||||
}
|
||||
|
||||
if (m_deviceId >= 0) {
|
||||
#ifndef CPUONLY
|
||||
auto deviceSize = TracingGPUMemoryAllocator::GetFreeAndTotalMemoryInMBs(m_deviceId);
|
||||
float freeMemoryRatio = (float)deviceSize.first / deviceSize.second;
|
||||
if (freeMemoryRatio < 0.05f || (deviceSize.first << 20) / sizeof(ElemType) < size)
|
||||
{
|
||||
PhysicalReleaseAllBuffer<ElemType>();
|
||||
}
|
||||
bufferPtr = TracingGPUMemoryAllocator::Allocate<ElemType>(m_deviceId, size);
|
||||
m_totalAllocSize += size;
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
// first, try no-throw allocation.
|
||||
// if failed, empty the buffer and re-try a throwing allocation
|
||||
// if failed again, let system throw the bad_alloc exception
|
||||
bufferPtr = new (std::nothrow) ElemType[size];
|
||||
if (!bufferPtr)
|
||||
{
|
||||
PhysicalReleaseAllBuffer<ElemType>();
|
||||
bufferPtr = new ElemType[size];
|
||||
}
|
||||
m_totalAllocSize += size;
|
||||
}
|
||||
|
||||
return bufferPtr;
|
||||
}
|
||||
|
||||
// insert the header of buffer into the buffer container
|
||||
template<class ElemType>
|
||||
void LogicalReleaseBuffer(ElemType* buffer, size_t size)
|
||||
{
|
||||
auto& bufferContainer = BufferContainer<ElemType>();
|
||||
bufferContainer.insert(std::make_pair(size, buffer));
|
||||
m_totalManageSize += size;
|
||||
}
|
||||
|
||||
// physical release the buffer
|
||||
template<class ElemType>
|
||||
void PhysicalReleaseBuffer(ElemType* buffer)
|
||||
{
|
||||
if (m_deviceId >= 0)
|
||||
{
|
||||
#ifndef CPUONLY
|
||||
TracingGPUMemoryAllocator::Free<ElemType>(m_deviceId, buffer, false);
|
||||
#endif
|
||||
}
|
||||
else {
|
||||
delete[] buffer;
|
||||
}
|
||||
}
|
||||
|
||||
// empty all the cached buffer
|
||||
template<class ElemType>
|
||||
void PhysicalReleaseAllBuffer()
|
||||
{
|
||||
auto& bufferContainer = BufferContainer<ElemType>();
|
||||
|
||||
for (auto& iter : bufferContainer)
|
||||
{
|
||||
PhysicalReleaseBuffer<ElemType>(iter.second);
|
||||
}
|
||||
|
||||
bufferContainer.clear();
|
||||
m_totalManageSize = 0;
|
||||
}
|
||||
|
||||
private:
|
||||
static std::unordered_map<DEVICEID_TYPE, std::unique_ptr<BufferManagement>> m_instances;
|
||||
|
||||
template <class ElemType>
|
||||
std::multimap<size_t, ElemType*>& BufferContainer();
|
||||
DEVICEID_TYPE m_deviceId;
|
||||
size_t m_totalManageSize;
|
||||
size_t m_totalAllocSize;
|
||||
|
||||
// map to store all the temp buffer handle
|
||||
std::multimap<size_t, float*> m_bufferFloatContainer;
|
||||
std::multimap<size_t, double*> m_bufferDoubleContainer;
|
||||
std::multimap<size_t, char*> m_bufferCharContainer;
|
||||
std::multimap<size_t, short*> m_bufferShortContainer;
|
||||
std::multimap<size_t, int*> m_bufferIntContainer;
|
||||
};
|
||||
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// BaseMatrixStorage -- base class for all matrix types (CPU, GPU) x (dense, sparse)
|
||||
// -----------------------------------------------------------------------
|
||||
|
|
|
@ -260,8 +260,6 @@ protected:
|
|||
}
|
||||
|
||||
// Only supported in MatrixPool enable
|
||||
// NOTE: it's unnecessary to keep the workspace.
|
||||
workspace.Resize(0, 0);
|
||||
CUDNN_CALL(err);
|
||||
}
|
||||
|
||||
|
@ -304,7 +302,6 @@ protected:
|
|||
if (CUDNN_STATUS_SUCCESS == err2)
|
||||
err = CUDNN_STATUS_SUCCESS;
|
||||
}
|
||||
workspace.Resize(0, 0);
|
||||
CUDNN_CALL(err);
|
||||
}
|
||||
|
||||
|
@ -347,7 +344,6 @@ protected:
|
|||
if (CUDNN_STATUS_SUCCESS == err2)
|
||||
err = CUDNN_STATUS_SUCCESS;
|
||||
}
|
||||
workspace.Resize(0, 0);
|
||||
CUDNN_CALL(err);
|
||||
}
|
||||
|
||||
|
|
|
@ -1531,42 +1531,35 @@ void GPUMatrix<ElemType>::Reshape(const size_t numRows, const size_t numCols)
|
|||
}
|
||||
|
||||
template <class ElemType>
|
||||
void GPUMatrix<ElemType>::RequireSize(const size_t numRows, const size_t numCols, bool growOnly, bool cachedResize)
|
||||
void GPUMatrix<ElemType>::RequireSize(const size_t numRows, const size_t numCols, bool growOnly)
|
||||
{
|
||||
if (GetNumRows() != numRows || GetNumCols() != numCols)
|
||||
Resize(numRows, numCols, growOnly, cachedResize);
|
||||
Resize(numRows, numCols, growOnly);
|
||||
}
|
||||
|
||||
template <class ElemType>
|
||||
void GPUMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, bool growOnly, bool cachedResize)
|
||||
void GPUMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, bool growOnly)
|
||||
{
|
||||
if (GetNumRows() == numRows && GetNumCols() == numCols)
|
||||
return;
|
||||
|
||||
VerifyResizable(__FUNCTION__);
|
||||
bool isForceResize = (!growOnly) || cachedResize;
|
||||
|
||||
size_t numElements = numRows * numCols;
|
||||
if (numElements > GetSizeAllocated() || // grow allocation
|
||||
(isForceResize && numElements != GetSizeAllocated())) // shrink allocation if not growOnly
|
||||
(!growOnly && numElements != GetSizeAllocated())) // shrink allocation if not growOnly
|
||||
{
|
||||
// If the buffer exists, free it before allocate
|
||||
if (Buffer())
|
||||
{
|
||||
if (cachedResize)
|
||||
BufferManagement::GetManagerInstance(GetComputeDeviceId()).LogicalReleaseBuffer<ElemType>(Buffer(), GetSizeAllocated());
|
||||
else
|
||||
TracingGPUMemoryAllocator::Free<ElemType>(GetComputeDeviceId(), Buffer());
|
||||
TracingGPUMemoryAllocator::Free<ElemType>(GetComputeDeviceId(), Buffer());
|
||||
}
|
||||
|
||||
// reallocate buffer if numElements > 0
|
||||
ElemType* pArray = nullptr;
|
||||
if (numElements > 0)
|
||||
{
|
||||
if (cachedResize)
|
||||
pArray = BufferManagement::GetManagerInstance(GetComputeDeviceId()).RequestBuffer<ElemType>(numElements);
|
||||
else
|
||||
pArray = TracingGPUMemoryAllocator::Allocate<ElemType>(GetComputeDeviceId(), numRows, numCols);
|
||||
pArray = TracingGPUMemoryAllocator::Allocate<ElemType>(GetComputeDeviceId(), numRows, numCols);
|
||||
}
|
||||
|
||||
SetBuffer(pArray, numElements * sizeof(ElemType));
|
||||
|
@ -2374,7 +2367,9 @@ ElemType GPUMatrix<ElemType>::AbsoluteMax() const
|
|||
int resInd = 0;
|
||||
cublasIdamax(cuHandle, (CUDA_LONG)GetNumElements(), reinterpret_cast<double*>(Data()), 1, &resInd);
|
||||
resInd--;
|
||||
|
||||
CUDA_CALL(cudaMemcpy(reinterpret_cast<double*>(&res), Data() + resInd, sizeof(double), cudaMemcpyDeviceToHost));
|
||||
|
||||
return res;
|
||||
}
|
||||
}
|
||||
|
@ -2951,7 +2946,30 @@ void GPUMatrix<ElemType>::Print(const char* /*matrixName*/, size_t /*rowStart*/,
|
|||
template <class ElemType>
|
||||
void GPUMatrix<ElemType>::Print(const char* matrixName /*=nullptr*/) const
|
||||
{
|
||||
Print(matrixName, 0, GetNumRows() - 1, 0, GetNumCols() - 1);
|
||||
size_t elemCount = GetNumRows() * GetNumCols();
|
||||
vector<ElemType> localCopy(elemCount);
|
||||
cudaMemcpy(localCopy.data(), Data(), elemCount * sizeof(ElemType), cudaMemcpyDeviceToHost);
|
||||
|
||||
fprintf(stderr, "\n###### ");
|
||||
if (matrixName != nullptr)
|
||||
fprintf(stderr, "%s ", matrixName);
|
||||
fprintf(stderr, "(%lu, %lu) ######\n\n", (unsigned long)GetNumRows(), (unsigned long)GetNumCols());
|
||||
|
||||
if (IsEmpty())
|
||||
{
|
||||
fprintf(stderr, "(empty)\n");
|
||||
return;
|
||||
}
|
||||
|
||||
// CNTK is using column-major storage
|
||||
for (size_t i = 0; i < GetNumRows(); i++)
|
||||
{
|
||||
for (size_t j = 0; j < GetNumCols(); j++)
|
||||
{
|
||||
fprintf(stderr, "%.10f\t", localCopy[i + j * GetNumRows()]);
|
||||
}
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
}
|
||||
|
||||
//helpfer function used for convolution neural network
|
||||
|
@ -4253,6 +4271,117 @@ GPUMatrix<ElemType>& GPUMatrix<ElemType>::GetARowByIndex(const GPUMatrix<ElemTyp
|
|||
return *this;
|
||||
}
|
||||
|
||||
// Calculate CTC score
|
||||
// prob (input): the posterior output from the network
|
||||
// alpha, beta (output): alpha and beta for forward-backward calculation.
|
||||
// phoneSeq (input): phone ID sequence for each utterance in this minibatch, each col is one utterance
|
||||
// phoneBoundary (input): phone boundary (frame index) of each phone for each utterance in this minibatch, each col is one utterance
|
||||
// totalScore (output): total CTC score
|
||||
// uttToChanInd (input): map from utterance ID to minibatch channel ID. We need this because each channel may contain more than one utterance.
|
||||
// uttBeginFrame(input): the positon of the first frame of each utterance in the minibatch channel. We need this because each channel may contain more than one utterance.
|
||||
// uttFrameNum (input): the frame number of each utterance. The size of this vector = the number of all utterances in this minibatch
|
||||
// uttPhoneNum (input): the phone number of each utterance. The size of this vector = the number of all utterances in this minibatch
|
||||
// numParallelSequences (input): channel number in this minibatch
|
||||
// maxFrameNum (input): the maximum channel frame number
|
||||
// delayConstraint -- label output delay constraint introduced during training that allows to have shorter delay during inference.
|
||||
// Alpha and Beta scores outside of the delay boundary are set to zero.
|
||||
// Setting this parameter smaller will result in shorted delay between label output during decoding, yet may hurt accuracy
|
||||
// delayConstraint=-1 means no constraint
|
||||
template<class ElemType>
|
||||
GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignCTCScore(const GPUMatrix<ElemType>& prob,
|
||||
GPUMatrix<ElemType>& alpha,
|
||||
GPUMatrix<ElemType>& beta,
|
||||
const GPUMatrix<ElemType> phoneSeq,
|
||||
const GPUMatrix<ElemType> phoneBoundary,
|
||||
ElemType &totalScore,
|
||||
const std::vector<size_t>& uttToChanInd,
|
||||
const std::vector<size_t> & uttBeginFrame,
|
||||
const std::vector<size_t> & uttFrameNum,
|
||||
const std::vector<size_t> & uttPhoneNum,
|
||||
const size_t numParallelSequences,
|
||||
const size_t maxFrameNum, const int delayConstraint, const bool isColWise)
|
||||
{
|
||||
if (isColWise)
|
||||
{
|
||||
PrepareDevice();
|
||||
// Total number of phones
|
||||
long totalPhoneNum = prob.GetNumRows();
|
||||
size_t uttNum = uttFrameNum.size();
|
||||
|
||||
// Max number of phones in utterances in this minibatch
|
||||
size_t maxPhoneNum = phoneSeq.GetNumRows();
|
||||
|
||||
size_t *gpuFrameNum;
|
||||
CUDA_CALL(cudaMalloc((void **)&gpuFrameNum, uttNum * sizeof(size_t)));
|
||||
CUDA_CALL(cudaMemcpy(gpuFrameNum, uttFrameNum.data(), uttNum * sizeof(size_t), cudaMemcpyHostToDevice));
|
||||
|
||||
size_t *gpuPhoneNum;
|
||||
CUDA_CALL(cudaMalloc((void **)&gpuPhoneNum, uttNum * sizeof(size_t)));
|
||||
CUDA_CALL(cudaMemcpy(gpuPhoneNum, uttPhoneNum.data(), uttNum * sizeof(size_t), cudaMemcpyHostToDevice));
|
||||
|
||||
size_t *gpuBeginFrame;
|
||||
CUDA_CALL(cudaMalloc((void **)&gpuBeginFrame, uttNum * sizeof(size_t)));
|
||||
CUDA_CALL(cudaMemcpy(gpuBeginFrame, uttBeginFrame.data(), uttNum * sizeof(size_t), cudaMemcpyHostToDevice));
|
||||
|
||||
size_t *gpuUttToChanInd;
|
||||
CUDA_CALL(cudaMalloc((void **)&gpuUttToChanInd, uttNum * sizeof(size_t)));
|
||||
CUDA_CALL(cudaMemcpy(gpuUttToChanInd, uttToChanInd.data(), uttNum * sizeof(size_t), cudaMemcpyHostToDevice));
|
||||
|
||||
ElemType *gpuScores;
|
||||
CUDA_CALL(cudaMalloc((void **)&gpuScores, uttNum * sizeof(ElemType)));
|
||||
|
||||
cudaEvent_t done = nullptr;
|
||||
CUDA_CALL(cudaEventCreate(&done));
|
||||
dim3 thread_tail(DEFAULT_THREAD_PER_DIM, DEFAULT_THREAD_PER_DIM);
|
||||
// x dimension is for utterances
|
||||
// y dimention is for phone sequence in each utterance
|
||||
// Ensure that we allocate correct number of blocks for given number of utterances and max number of phones in those utterances
|
||||
dim3 block_tail((uttNum + DEFAULT_THREAD_PER_DIM - 1) / DEFAULT_THREAD_PER_DIM, (maxPhoneNum + DEFAULT_THREAD_PER_DIM - 1) / DEFAULT_THREAD_PER_DIM);
|
||||
for (long t = 0; t < maxFrameNum; t++)
|
||||
{
|
||||
_assignAlphaScore << <block_tail, thread_tail, 0, t_stream >> >(prob.Data(), alpha.Data(), phoneSeq.Data(), phoneBoundary.Data(), gpuUttToChanInd,
|
||||
gpuFrameNum, gpuBeginFrame, gpuPhoneNum, numParallelSequences, uttNum, t, maxPhoneNum, totalPhoneNum, delayConstraint);
|
||||
}
|
||||
|
||||
for (long t = maxFrameNum - 1; t >= 0; t--)
|
||||
{
|
||||
_assignBetaScore << <block_tail, thread_tail, 0, t_stream >> >(prob.Data(), beta.Data(), phoneSeq.Data(), phoneBoundary.Data(), gpuUttToChanInd,
|
||||
gpuFrameNum, gpuBeginFrame, gpuPhoneNum, numParallelSequences, uttNum, t, maxPhoneNum, totalPhoneNum, delayConstraint);
|
||||
}
|
||||
|
||||
_assignTotalScore << <uttNum, 1, 0, t_stream >> > (beta.Data(), gpuScores, uttNum, gpuUttToChanInd, gpuBeginFrame, numParallelSequences, maxPhoneNum);
|
||||
|
||||
dim3 block_tail_2((uttNum + DEFAULT_THREAD_PER_DIM - 1) / DEFAULT_THREAD_PER_DIM, (maxFrameNum + DEFAULT_THREAD_PER_DIM - 1) / DEFAULT_THREAD_PER_DIM);
|
||||
|
||||
_assignCTCScore << < block_tail_2, thread_tail, 0, t_stream >> >(Data(), prob.Data(), alpha.Data(), beta.Data(), phoneSeq.Data(), uttNum, gpuUttToChanInd,
|
||||
gpuBeginFrame, gpuPhoneNum, gpuFrameNum, numParallelSequences, maxPhoneNum, totalPhoneNum);
|
||||
|
||||
vector<ElemType>scores(uttNum);
|
||||
CUDA_CALL(cudaMemcpyAsync(scores.data(), gpuScores, sizeof(ElemType) * uttNum, cudaMemcpyDeviceToHost, t_stream));
|
||||
|
||||
for (size_t utt = 0; utt < uttFrameNum.size(); utt++)
|
||||
{
|
||||
totalScore += scores[utt];
|
||||
}
|
||||
|
||||
CUDA_CALL(cudaFree(gpuFrameNum));
|
||||
CUDA_CALL(cudaFree(gpuPhoneNum));
|
||||
CUDA_CALL(cudaFree(gpuBeginFrame));
|
||||
CUDA_CALL(cudaFree(gpuUttToChanInd));
|
||||
CUDA_CALL(cudaFree(gpuScores));
|
||||
|
||||
CUDA_CALL(cudaEventRecord(done));
|
||||
CUDA_CALL(cudaEventSynchronize(done));
|
||||
CUDA_CALL(cudaEventDestroy(done));
|
||||
}
|
||||
else
|
||||
{
|
||||
NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
||||
template <class ElemType>
|
||||
void GPUMatrix<ElemType>::ConductRowElementMultiplyWithShift(const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, GPUMatrix<ElemType>& c, const size_t shift, const bool isafixed)
|
||||
{
|
||||
|
@ -4613,8 +4742,8 @@ template GPUMatrix<char>::GPUMatrix(const GPUMatrix<char>&);
|
|||
template GPUMatrix<char>::GPUMatrix(GPUMatrix<char>&&);
|
||||
template char* GPUMatrix<char>::CopyToArray() const;
|
||||
template void GPUMatrix<char>::ChangeDeviceTo(int);
|
||||
template void GPUMatrix<char>::Resize(size_t, size_t, bool, bool);
|
||||
template void GPUMatrix<char>::RequireSize(size_t, size_t, bool, bool);
|
||||
template void GPUMatrix<char>::Resize(size_t, size_t, bool);
|
||||
template void GPUMatrix<char>::RequireSize(size_t, size_t, bool);
|
||||
|
||||
template GPUMatrix<char>::~GPUMatrix();
|
||||
template GPUMatrix<char> GPUMatrix<char>::ColumnSlice(size_t startColumn, size_t numCols) const;
|
||||
|
@ -4638,8 +4767,8 @@ template GPUMatrix<short>::GPUMatrix(const GPUMatrix<short>&);
|
|||
template GPUMatrix<short>::GPUMatrix(GPUMatrix<short>&&);
|
||||
template short* GPUMatrix<short>::CopyToArray() const;
|
||||
template void GPUMatrix<short>::ChangeDeviceTo(int);
|
||||
template void GPUMatrix<short>::Resize(size_t, size_t, bool, bool);
|
||||
template void GPUMatrix<short>::RequireSize(size_t, size_t, bool, bool);
|
||||
template void GPUMatrix<short>::Resize(size_t, size_t, bool);
|
||||
template void GPUMatrix<short>::RequireSize(size_t, size_t, bool);
|
||||
|
||||
template GPUMatrix<short>::~GPUMatrix();
|
||||
template GPUMatrix<short> GPUMatrix<short>::ColumnSlice(size_t startColumn, size_t numCols) const;
|
||||
|
|
|
@ -244,12 +244,12 @@ public:
|
|||
// RequireSize is now the new preferred method of ensuring the correct size inside of the Matrix class. Since Resize will fail if the storage object has
|
||||
// multiple views, RequireSize will first check to see if Resize is required. If it is not, then it short-circuits and is a noop. Otherwise, RequireSize
|
||||
// will call Resize, which may fail if the matrix has multiple views.
|
||||
void RequireSize(const size_t numRows, const size_t numCols, bool growOnly = true, bool cachedResize = false); // by default we only reallocate if need to grow
|
||||
void RequireSize(const GPUMatrix<ElemType>& like, bool growOnly = true, bool cachedResize = false) { RequireSize(like.GetNumRows(), like.GetNumCols(), growOnly, cachedResize); }
|
||||
void RequireSize(const size_t numRows, const size_t numCols, bool growOnly = true); // by default we only reallocate if need to grow
|
||||
void RequireSize(const GPUMatrix<ElemType>& like, bool growOnly = true) { RequireSize(like.GetNumRows(), like.GetNumCols(), growOnly); }
|
||||
|
||||
// Resize first checks to ensure that the caller has the authority to call Resize (i.e., it checks to ensure the underlying data is owned by only this matrix), and then
|
||||
// actually resizes the underlying matrix, doing any allocation as required.
|
||||
void Resize(const size_t numRows, const size_t numCols, bool growOnly = true, bool cachedResize = false); // by default we only reallocate if need to grow
|
||||
void Resize(const size_t numRows, const size_t numCols, bool growOnly = true); // by default we only reallocate if need to grow
|
||||
|
||||
ElemType& operator()(const size_t /*row*/, const size_t /*col*/) { LogicError("GPUMatrix doesn't support operator(,) on the CPU."); }
|
||||
const ElemType& operator()(const size_t /*row*/, const size_t /*col*/) const { LogicError("GPUMatrix doesn't support operator(,) on the CPU."); }
|
||||
|
@ -349,6 +349,10 @@ public:
|
|||
GPUMatrix<ElemType>& DropFrame(const GPUMatrix<ElemType>& label, const GPUMatrix<ElemType>& gamma, const ElemType& threshhold);
|
||||
GPUMatrix<ElemType>& AssignSequenceError(const ElemType hsmoothingWeight, const GPUMatrix<ElemType>& label, const GPUMatrix<ElemType>& dnnoutput, const GPUMatrix<ElemType>& gamma, ElemType alpha);
|
||||
|
||||
GPUMatrix<ElemType>& AssignCTCScore(const GPUMatrix<ElemType>& prob, GPUMatrix<ElemType>& alpha, GPUMatrix<ElemType>& beta,
|
||||
const GPUMatrix<ElemType> phoneSeq, const GPUMatrix<ElemType> phoneBoundary, ElemType &totalScore, const vector<size_t>& uttMap, const vector<size_t> & uttBeginFrame, const vector<size_t> & uttFrameNum,
|
||||
const vector<size_t> & uttPhoneNum, const size_t samplesInRecurrentStep, const size_t maxFrameNum, const int delayConstraint, const bool isColWise);
|
||||
|
||||
GPUMatrix<ElemType>& InplaceSqrt();
|
||||
GPUMatrix<ElemType>& AssignSqrtOf(const GPUMatrix<ElemType>& a);
|
||||
|
||||
|
|
|
@ -5192,6 +5192,292 @@ __global__ void _adam4BlockSparseCol(CUDA_LONG size,
|
|||
val[idx] -= g;
|
||||
}
|
||||
}
|
||||
|
||||
// calculate alpha in forward-backward calculation. equation (6), (7) in http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf
|
||||
// Calculate alpha in forward-backward calculation. equation (6), (7) in http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf
|
||||
// GPU x dimension corresponds to utterances, y dimension corresponds to phone sequence in each utterance
|
||||
// prob (input): the posterior output from the network
|
||||
// alpha (output): alpha for forward-backward calculation.
|
||||
// phoneSeq (input): phone ID sequence for each utterance in this minibatch, each col is one utterance
|
||||
// phoneBound (input): phone boundary (frame index) of each phone for each utterance in this minibatch, each col is one utterance
|
||||
// uttToChanInd (input): map from utterance ID to minibatch channel ID. We need this because each channel may contain more than one utterance.
|
||||
// uttFrameNum (input): the frame number of each utterance. The size of this vector = the number of all utterances in this minibatch
|
||||
// uttBeginFrame(input): the positon of the first frame of each utterance in the minibatch channel. We need this because each channel may contain more than one utterance.
|
||||
// uttPhoneNum (input): the phone number of each utterance. The size of this vector = the number of all utterances in this minibatch
|
||||
// numChannels (input): channel number in this minibatch
|
||||
// uttNum (input): number of utterances
|
||||
// t (input): time stamp to process
|
||||
// maxPhoneNum (input): the max number of phones between utterances
|
||||
// totalPhoneNum (input): the total number of phones of all utterances
|
||||
// delayConstraint -- label output delay constraint introduced during training that allows to have shorter delay during inference.
|
||||
// Alpha and Beta scores outside of the delay boundary are set to zero.
|
||||
// Setting this parameter smaller will result in shorted delay between label output during decoding.
|
||||
// delayConstraint=-1 means no constraint
|
||||
template<class ElemType>
|
||||
__global__ void _assignAlphaScore(
|
||||
const ElemType *prob,
|
||||
ElemType *alphaScore,
|
||||
ElemType *phoneSeq,
|
||||
ElemType *phoneBound,
|
||||
const size_t *uttToChanInd,
|
||||
const size_t *uttFrameNum,
|
||||
const size_t *uttBeginFrame,
|
||||
const size_t *uttPhoneNum,
|
||||
size_t numChannels,
|
||||
const size_t uttNum,
|
||||
const size_t t,
|
||||
const size_t maxPhoneNum, // Maximum length of utterance in this MB
|
||||
const size_t totalPhoneNum, // Total number of phones
|
||||
const int delayConstraint)
|
||||
{
|
||||
LONG64 uttId = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
// Index of the label in the sequence
|
||||
LONG64 phoneSeqId = blockDim.y * blockIdx.y + threadIdx.y;
|
||||
|
||||
// Number of phones and frames in this utterance
|
||||
LONG64 phoneNum = uttPhoneNum[uttId];
|
||||
LONG64 frameNum = uttFrameNum[uttId];
|
||||
|
||||
if (uttId >= uttNum || phoneSeqId >= phoneNum - 1 || t >= frameNum || phoneSeqId == 0) return;
|
||||
|
||||
// Current and previous phone indices in phoneSeq matrix
|
||||
LONG64 labelid = uttId*maxPhoneNum + phoneSeqId;
|
||||
LONG64 labelid_2 = labelid - 2;
|
||||
|
||||
// Actual current phone label
|
||||
LONG64 phoneId = (LONG64)(phoneSeq[labelid]);
|
||||
|
||||
// Index of the current frame in minibatch
|
||||
LONG64 timeId = (t + uttBeginFrame[uttId])*numChannels + uttToChanInd[uttId];
|
||||
|
||||
// Index of probability of observing phoneId at frame timeId
|
||||
LONG64 probId = timeId*totalPhoneNum + phoneId;
|
||||
|
||||
LONG64 alphaId = maxPhoneNum* timeId + phoneSeqId; // alpha_t(s)
|
||||
// Previous time frame
|
||||
LONG64 timeId_1 = timeId - numChannels; // Index corresponding to (t-1)
|
||||
LONG64 alphaId_0 = maxPhoneNum* timeId_1 + phoneSeqId; // alpha_{t-1}(s)
|
||||
LONG64 alphaId_1 = alphaId_0 - 1; // alpha_{t-1}(s-1)
|
||||
LONG64 alphaId_2 = alphaId_0 - 2; // alpha_{t-1}(s-2)
|
||||
|
||||
if (t == 0)
|
||||
{
|
||||
// Initialize recursion
|
||||
if (phoneSeqId == 1 || phoneSeqId == 2)
|
||||
{
|
||||
alphaScore[alphaId] = prob[probId];
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (phoneSeqId >= 1)
|
||||
{
|
||||
ElemType x = LZERO;
|
||||
|
||||
ElemType ascore;
|
||||
if (phoneSeqId > 2)
|
||||
{
|
||||
// if current label is not blank and not equal prev non-blank label
|
||||
if ((LONG64)(phoneSeq[labelid]) != totalPhoneNum - 1 && phoneId != (LONG64)(phoneSeq[labelid_2]))
|
||||
{
|
||||
x = logaddk(x, alphaScore[alphaId_2]);
|
||||
}
|
||||
}
|
||||
|
||||
if (phoneSeqId > 1)
|
||||
{
|
||||
x = logaddk(x, alphaScore[alphaId_1]);
|
||||
}
|
||||
|
||||
x = logaddk(x, alphaScore[alphaId_0]);
|
||||
|
||||
if (phoneId != SIZE_MAX)
|
||||
ascore = prob[probId]; // Probability of observing given label at given time
|
||||
else
|
||||
ascore = 0;
|
||||
alphaScore[alphaId] = (ElemType)x + ascore;
|
||||
if (delayConstraint != -1)
|
||||
{
|
||||
LONG64 labelid_r = labelid + 2;
|
||||
LONG64 phoneBoundId_r = (LONG64)(phoneBound[labelid_r]);
|
||||
if (phoneId == totalPhoneNum - 1)
|
||||
{
|
||||
// only constraint right side
|
||||
if (t > phoneBoundId_r + delayConstraint - 1)
|
||||
alphaScore[alphaId] = LZERO;
|
||||
}
|
||||
else if (phoneId != totalPhoneNum - 1)
|
||||
{
|
||||
if (t > phoneBoundId_r + delayConstraint)
|
||||
alphaScore[alphaId] = LZERO;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Calculate beta in forward-backward calculation, equation (10), (11) in http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf
|
||||
// See _assignAlphaScore for the explanation of parameters
|
||||
template<class ElemType>
|
||||
__global__ void _assignBetaScore(
|
||||
const ElemType *prob,
|
||||
ElemType *betaScore,
|
||||
ElemType *phoneSeq,
|
||||
ElemType *phoneBound,
|
||||
const size_t *uttToChanInd,
|
||||
const size_t *uttFrameNum,
|
||||
const size_t *uttBeginFrame,
|
||||
const size_t *uttPhoneNum,
|
||||
const size_t numChannels,
|
||||
const size_t uttNum,
|
||||
const size_t t,
|
||||
const size_t maxPhoneNum,
|
||||
const size_t totalPhoneNum,
|
||||
const int delayConstraint)
|
||||
{
|
||||
LONG64 uttId = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
// Index of the label in the sequence
|
||||
LONG64 phoneSeqId = blockDim.y * blockIdx.y + threadIdx.y;
|
||||
LONG64 phoneNum = uttPhoneNum[uttId];
|
||||
LONG64 frameNum = uttFrameNum[uttId];
|
||||
|
||||
if (uttId >= uttNum || phoneSeqId >= phoneNum - 1 || t >= frameNum || phoneSeqId == 0) return;
|
||||
|
||||
LONG64 labelid = uttId*maxPhoneNum + phoneSeqId;
|
||||
LONG64 labelid_2 = labelid + 2;
|
||||
LONG64 phoneId = (LONG64)(phoneSeq[labelid]);
|
||||
LONG64 timeId = (t + uttBeginFrame[uttId])*numChannels + uttToChanInd[uttId];
|
||||
LONG64 probId = timeId*totalPhoneNum + phoneId;
|
||||
LONG64 betaid = maxPhoneNum* timeId + phoneSeqId;
|
||||
LONG64 timeId_1 = timeId + numChannels;
|
||||
LONG64 betaid_0 = maxPhoneNum* timeId_1 + phoneSeqId;
|
||||
LONG64 betaid_1 = betaid_0 + 1;
|
||||
LONG64 betaid_2 = betaid_0 + 2;
|
||||
|
||||
if (t == frameNum - 1)
|
||||
{
|
||||
if (phoneSeqId == phoneNum - 3 || phoneSeqId == phoneNum - 2)
|
||||
{
|
||||
betaScore[betaid] = prob[probId];
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (phoneSeqId >= 1)
|
||||
{
|
||||
ElemType x = LZERO;
|
||||
ElemType ascore;
|
||||
if (phoneSeqId < phoneNum - 3)
|
||||
{
|
||||
if (phoneSeq[labelid] != totalPhoneNum - 1 && phoneId != phoneSeq[labelid_2])
|
||||
{
|
||||
x = logaddk(x, betaScore[betaid_2]);
|
||||
}
|
||||
}
|
||||
|
||||
if (phoneSeqId < phoneNum - 2)
|
||||
{
|
||||
x = logaddk(x, betaScore[betaid_1]);
|
||||
}
|
||||
|
||||
x = logaddk(x, betaScore[betaid_0]);
|
||||
|
||||
if (phoneId != SIZE_MAX)
|
||||
ascore = prob[probId];
|
||||
else
|
||||
ascore = 0;
|
||||
betaScore[betaid] = (ElemType)x + ascore;
|
||||
if (delayConstraint != -1)
|
||||
{
|
||||
LONG64 phoneBoundId_r = (LONG64)(phoneBound[labelid_2]);
|
||||
if (phoneId == totalPhoneNum - 1)
|
||||
{
|
||||
if (t > phoneBoundId_r + delayConstraint - 1)
|
||||
betaScore[betaid] = LZERO;
|
||||
}
|
||||
else if (phoneId != totalPhoneNum - 1)
|
||||
{
|
||||
if (t > phoneBoundId_r + delayConstraint)
|
||||
betaScore[betaid] = LZERO;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Calculate derivative, equation (15) in http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf
|
||||
// See _assignAlphaScore for the explanation of parameters
|
||||
template<class ElemType>
|
||||
__global__ void _assignCTCScore(
|
||||
ElemType *CTCscore,
|
||||
ElemType *prob,
|
||||
ElemType *alphaScore,
|
||||
ElemType *betaScore,
|
||||
ElemType *phoneSeq,
|
||||
const size_t uttNum,
|
||||
const size_t *uttToChanInd,
|
||||
const size_t *uttBeginFrame,
|
||||
const size_t *uttPhoneNum,
|
||||
const size_t *uttFrameNum,
|
||||
const long numChannels,
|
||||
const long maxPhoneNum,
|
||||
const long totalPhoneNum)
|
||||
{
|
||||
LONG64 uttId = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
LONG64 t = blockDim.y * blockIdx.y + threadIdx.y;
|
||||
|
||||
if (uttId < uttNum && t < uttFrameNum[uttId])
|
||||
{
|
||||
LONG64 phoneNum = uttPhoneNum[uttId];
|
||||
LONG64 alphaId_0 = (uttBeginFrame[uttId] * numChannels + uttToChanInd[uttId]) * maxPhoneNum;
|
||||
LONG64 timeId = (t + uttBeginFrame[uttId])*numChannels + uttToChanInd[uttId];
|
||||
ElemType P_lx = betaScore[alphaId_0];
|
||||
|
||||
for (int s = 1; s < phoneNum - 1; s++)
|
||||
{
|
||||
long phoneId = phoneSeq[uttId*maxPhoneNum + s];
|
||||
LONG64 alphaId = maxPhoneNum* timeId + s;
|
||||
LONG64 probId = timeId*totalPhoneNum + phoneId;
|
||||
|
||||
if (phoneId != SIZE_MAX)
|
||||
{
|
||||
ElemType logoccu = alphaScore[alphaId] + betaScore[alphaId] - prob[probId] - (ElemType)P_lx;
|
||||
CTCscore[probId] = logaddk(CTCscore[probId], logoccu);
|
||||
}
|
||||
}
|
||||
|
||||
for (int s = 0; s < totalPhoneNum; s++)
|
||||
{
|
||||
LONG64 probId = timeId*totalPhoneNum + s;
|
||||
ElemType logoccu = CTCscore[probId];
|
||||
if (logoccu < LZERO)
|
||||
CTCscore[probId] = 0.0f;
|
||||
else
|
||||
CTCscore[probId] = exp(logoccu);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Calculate CTC score. equation (8) in http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf
|
||||
template<class ElemType>
|
||||
__global__ void _assignTotalScore(ElemType *betaScore,
|
||||
ElemType *totalScore,
|
||||
const size_t uttNum,
|
||||
const size_t *uttToChanInd,
|
||||
const size_t *uttBeginFrame,
|
||||
const size_t numChannels,
|
||||
const size_t maxPhoneNum)
|
||||
{
|
||||
LONG64 uttId = blockIdx.x;
|
||||
if (uttId < uttNum)
|
||||
{
|
||||
LONG64 alphaId_0 = (uttBeginFrame[uttId] * numChannels + uttToChanInd[uttId]) * maxPhoneNum;
|
||||
|
||||
betaScore[alphaId_0] = logaddk(betaScore[alphaId_0 + 1], betaScore[alphaId_0 + 2]);
|
||||
totalScore[uttId] = betaScore[alphaId_0];
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -158,23 +158,6 @@ int GetMathLibTraceLevel()
|
|||
|
||||
MatrixBase::~MatrixBase() { }
|
||||
|
||||
#pragma region BufferManagement
|
||||
|
||||
// Definition of the static map holding one BufferManagement instance per device ID.
std::unordered_map<DEVICEID_TYPE, std::unique_ptr<BufferManagement>> BufferManagement::m_instances;
|
||||
|
||||
// Explicit specializations of BufferContainer<T>(): each returns a reference to the
// multimap member dedicated to element type T (keyed by size_t, mapped to T*).
// NOTE(review): the size_t key is presumably the buffer size -- confirm in the class declaration.
template <>
|
||||
std::multimap<size_t, float*>& BufferManagement::BufferContainer<float>() { return m_bufferFloatContainer; }
|
||||
template <>
|
||||
std::multimap<size_t, double*>& BufferManagement::BufferContainer<double>() { return m_bufferDoubleContainer; }
|
||||
template <>
|
||||
std::multimap<size_t, char*>& BufferManagement::BufferContainer<char>() { return m_bufferCharContainer; }
|
||||
template <>
|
||||
std::multimap<size_t, short*>& BufferManagement::BufferContainer<short>() { return m_bufferShortContainer; }
|
||||
template <>
|
||||
std::multimap<size_t, int*>& BufferManagement::BufferContainer<int>() { return m_bufferIntContainer; }
|
||||
|
||||
#pragma endregion
|
||||
|
||||
#pragma region Constructors, destructors and other static matrix builders
|
||||
|
||||
|
||||
|
@ -184,10 +167,6 @@ std::multimap<size_t, int*>& BufferManagement::BufferContainer<int>() { return m
|
|||
// { GPU code },
|
||||
// ...
|
||||
|
||||
// By default, the CachedMatrixBuffer is disabled.
// Definition of the static flag; it starts out false for every ElemType instantiation.
|
||||
template <class ElemType>
|
||||
bool Matrix<ElemType>::m_useCachedResize = false;
|
||||
|
||||
// Initialize members
|
||||
template <class ElemType>
|
||||
void Matrix<ElemType>::Init(DEVICEID_TYPE deviceId)
|
||||
|
@ -301,9 +280,6 @@ void Matrix<ElemType>::SetDataLocation(CurrentDataLocation location, MatrixType
|
|||
LogicError("SetDataLocation: New m_baseMatrix must not be NULL.");
|
||||
}
|
||||
|
||||
// Stores useCachedResize into m_useCachedResize. Nothing else happens here; the flag is
// only recorded for later use.
template <class ElemType>
|
||||
void Matrix<ElemType>::UseCachedResizeOrNot(bool useCachedResize) { m_useCachedResize = useCachedResize; }
|
||||
|
||||
//this is a private constructor only used internally to initialize a blank matrix
|
||||
template <class ElemType>
|
||||
Matrix<ElemType>::Matrix(const MatrixFlags matrixFlags, const MatrixType matrixType, const MatrixFormat matrixFormat, DEVICEID_TYPE deviceID)
|
||||
|
@ -1829,7 +1805,7 @@ void Matrix<ElemType>::Resize(const size_t numRows, const size_t numCols, const
|
|||
// TODO: should this function test whether the size is changing, and skip if it isn't? We have at least one explicit test for this code calling this (recurrent node)
|
||||
DISPATCH_MATRIX_ON_FLAG_USEBOTH_4BOTH(this,
|
||||
{ m_CPUMatrix->Resize(numRows, numCols, growOnly); },
|
||||
{ m_GPUMatrix->Resize(numRows, numCols, growOnly, m_useCachedResize); },
|
||||
{ m_GPUMatrix->Resize(numRows, numCols, growOnly); },
|
||||
{ m_CPUSparseMatrix->RequireSizeAndAllocate(numRows, numCols, numNZElemToReserve, growOnly, false); },
|
||||
{ m_GPUSparseMatrix->RequireSizeAndAllocate(numRows, numCols, numNZElemToReserve, growOnly, false); });
|
||||
#ifdef _DEBUG
|
||||
|
@ -5736,6 +5712,51 @@ Matrix<ElemType>& Matrix<ElemType>::AssignSequenceError(const ElemType hsmoothin
|
|||
NOT_IMPLEMENTED);
|
||||
return *this;
|
||||
}
|
||||
|
||||
// Calculate CTC score
|
||||
// prob (input): the posterior output from the network
|
||||
// alpha, beta (output): alpha and beta for forward-backward calculation.
|
||||
// phoneSeq (input): phone ID sequence for each utterance in this minibatch, each col is one utterance
|
||||
// phoneBound (input): phone boundary (frame index) of each phone for each utterance in this minibatch, each col is one utterance
|
||||
// totalScore (output): total CTC score
|
||||
// uttToChanInd (input): map from utterance ID to minibatch channel ID. We need this because each channel may contain more than one utterance.
|
||||
// uttBeginFrame (input): the position of the first frame of each utterance in the minibatch channel. We need this because each channel may contain more than one utterance.
|
||||
// uttFrameNum (input): the frame number of each utterance. The size of this vector = the number of all utterances in this minibatch
|
||||
// uttPhoneNum (input): the phone number of each utterance. The size of this vector = the number of all utterances in this minibatch
|
||||
// numParallelSequences (input): num of parallel sequences
|
||||
// mbsize (input): the maximum channel frame number
|
||||
// delayConstraint (input): label output delay constraint introduced during training that allows a shorter delay during inference. It uses the original time information to enforce that CTC tokens only get aligned within a time margin.
|
||||
// Setting this parameter smaller will result in a shorter delay between label outputs during decoding, yet may hurt accuracy.
|
||||
// delayConstraint=-1 means no constraint
|
||||
// Front-end for the CTC score computation: prepares this matrix, alpha and beta, then
// forwards every argument unchanged to the AssignCTCScore of the underlying CPU or GPU matrix.
// Returns *this.
template<class ElemType>
|
||||
Matrix<ElemType>& Matrix<ElemType>::AssignCTCScore(const Matrix<ElemType>& prob, Matrix<ElemType>& alpha, Matrix<ElemType>& beta,
|
||||
const Matrix<ElemType>& phoneSeq, const Matrix<ElemType>& phoneBound, ElemType &totalScore, const std::vector<size_t> & uttToChanInd,
|
||||
const std::vector<size_t> & uttBeginFrame, const std::vector<size_t> & uttFrameNum, const std::vector<size_t> & uttPhoneNum,
|
||||
const size_t numParallelSequences, const size_t mbsize, const int delayConstraint, const bool isColWise)
|
||||
{
|
||||
// Device placement is decided from prob and this matrix before anything is resized.
DecideAndMoveToRightDevice(prob, *this);
|
||||
// alpha and beta get one row per row of phoneSeq and one column per column of prob.
alpha.Resize(phoneSeq.GetNumRows(), prob.GetNumCols());
|
||||
beta.Resize(phoneSeq.GetNumRows(), prob.GetNumCols());
|
||||
// This matrix takes the same dimensions as prob.
Resize(prob.GetNumRows(), prob.GetNumCols());
|
||||
|
||||
// All three matrices are filled with LZERO before the computation.
// NOTE(review): LZERO is presumably the log-domain representation of zero -- confirm its definition.
alpha.SetValue(LZERO);
|
||||
beta.SetValue(LZERO);
|
||||
SetValue(LZERO);
|
||||
// Match prob's matrix type and format; the third argument (false) is presumably
// "keepValues" -- TODO confirm against SwitchToMatrixType's declaration.
SwitchToMatrixType(prob.GetMatrixType(), prob.GetFormat(), false);
|
||||
|
||||
// Dispatch on prob. The CPU and GPU matrix branches do the work; the remaining two
// branches are NOT_IMPLEMENTED (presumably the sparse cases -- confirm against the macro).
DISPATCH_MATRIX_ON_FLAG(&prob,
|
||||
this,
|
||||
this->m_CPUMatrix->AssignCTCScore(*prob.m_CPUMatrix, *alpha.m_CPUMatrix, *beta.m_CPUMatrix, *phoneSeq.m_CPUMatrix, *phoneBound.m_CPUMatrix, totalScore,
|
||||
uttToChanInd, uttBeginFrame, uttFrameNum, uttPhoneNum, numParallelSequences, mbsize, delayConstraint, isColWise),
|
||||
this->m_GPUMatrix->AssignCTCScore(*prob.m_GPUMatrix, *alpha.m_GPUMatrix, *beta.m_GPUMatrix, *phoneSeq.m_GPUMatrix, *phoneBound.m_GPUMatrix, totalScore,
|
||||
uttToChanInd, uttBeginFrame, uttFrameNum, uttPhoneNum, numParallelSequences, mbsize, delayConstraint, isColWise),
|
||||
NOT_IMPLEMENTED,
|
||||
NOT_IMPLEMENTED
|
||||
);
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
||||
#pragma endregion Static BLAS Functions
|
||||
|
||||
// TensorView currently does not interface with sparse matrices. For now, we just catch this and throw.
|
||||
|
|
|
@ -87,9 +87,6 @@ private:
|
|||
mutable size_t m_numTimesMatrixTypeChanged;
|
||||
mutable int m_devicesTransferedTo[2]; // TODO: what is this for? Seems only diagnostics
|
||||
|
||||
// whether to use cached memory Resize() or not
|
||||
static bool m_useCachedResize;
|
||||
|
||||
// Moves matrix from device id_from to device with id_to. This method doesn't change preferred device Id
|
||||
void _transferFromDeviceToDevice(int id_from, int id_to, bool isBeingMoved = true, bool emptyTransfer = false) const;
|
||||
// Moves matrix from current device to device with id_to. This method doesn't change preferred device Id
|
||||
|
@ -143,8 +140,6 @@ public:
|
|||
SetDataLocation(GetDeviceId() < 0 ? CurrentDataLocation::CPU : CurrentDataLocation::GPU, GetMatrixType());
|
||||
}
|
||||
|
||||
static void UseCachedResizeOrNot(bool useCachedResize);
|
||||
|
||||
private:
|
||||
Matrix(const MatrixFlags matrixFlags, const MatrixType matrixType, const MatrixFormat matrixFormat, DEVICEID_TYPE deviceID); // only used internally to initialize a blank matrix
|
||||
Matrix(const MatrixFlags matrixFlags, const MatrixType matrixType, DEVICEID_TYPE deviceID); // only used internally to initialize a blank matrix
|
||||
|
@ -382,6 +377,11 @@ public:
|
|||
// sequence training
|
||||
Matrix<ElemType>& DropFrame(const Matrix<ElemType>& label, const Matrix<ElemType>& gamma, const ElemType& threshhold);
|
||||
Matrix<ElemType>& AssignSequenceError(const ElemType hsmoothingWeight, const Matrix<ElemType>& label, const Matrix<ElemType>& dnnoutput, const Matrix<ElemType>& gamma, ElemType alpha);
|
||||
|
||||
Matrix<ElemType>& AssignCTCScore(const Matrix<ElemType>& prob, Matrix<ElemType>& alpha, Matrix<ElemType>& beta, const Matrix<ElemType>& phoneSeq, const Matrix<ElemType>& phoneBound, ElemType &totalScore,
|
||||
const vector<size_t> & extraUttMap, const vector<size_t> & uttBeginFrame, const vector<size_t> & uttFrameNum, const vector<size_t> & uttPhoneNum, const size_t samplesInRecurrentStep,
|
||||
const size_t mbSize, const int delayConstraint, const bool isColWise);
|
||||
|
||||
Matrix<ElemType>& InplaceSqrt();
|
||||
Matrix<ElemType>& AssignSqrtOf(const Matrix<ElemType>& a);
|
||||
|
||||
|
|
|
@ -26,10 +26,8 @@ NcclComm::NcclComm(int deviceId, const MPIWrapperPtr& mpi)
|
|||
return;
|
||||
|
||||
size_t numRanks = mpi->NumNodesInUse();
|
||||
MPI_Comm mpiComm = mpi->Communicator();
|
||||
std::vector<int> allDevs(numRanks);
|
||||
MPI_Allgather(&deviceId, 1, MPI_INT, allDevs.data(), 1, MPI_INT, mpiComm)
|
||||
|| MpiFail("NcclComm: MPI_Allgather");
|
||||
mpi->Allgather(&deviceId, 1, MPI_INT, allDevs.data(), 1, MPI_INT);
|
||||
|
||||
for (size_t r = 0; r<numRanks; r++)
|
||||
{
|
||||
|
@ -53,8 +51,7 @@ NcclComm::NcclComm(int deviceId, const MPIWrapperPtr& mpi)
|
|||
if (res != ncclSuccess)
|
||||
RuntimeError("NcclComm failed to obtain ncclUniqueId: %s", ncclGetErrorString(res));
|
||||
|
||||
MPI_Bcast(&ncclId, NCCL_UNIQUE_ID_BYTES, MPI_CHAR, 0, mpiComm)
|
||||
|| MpiFail("NcclComm: MPI_Bcase");
|
||||
mpi->Bcast(&ncclId, NCCL_UNIQUE_ID_BYTES, MPI_CHAR, 0);
|
||||
|
||||
PrepareDevice(deviceId);
|
||||
res = ncclCommInitRank(&m_ncclComm, numRanks, ncclId, mpi->CurrentNodeRank());
|
||||
|
|
Некоторые файлы не были показаны из-за слишком большого количества измененных файлов Показать больше
Загрузка…
Ссылка в новой задаче