This commit is contained in:
Frank Seide 2016-07-19 16:40:51 -07:00
Parents 078e30e6ca 234662c008
Commit 39a9175097
213 changed files: 100670 additions and 501140 deletions

View file

@ -27,6 +27,11 @@
<CudaVersion Condition="Exists('$(CUDA_PATH_V7_5)') And '$(CudaVersion)' == ''">7.5</CudaVersion>
<CudaVersion Condition="Exists('$(CUDA_PATH_V7_0)') And '$(CudaVersion)' == ''">7.0</CudaVersion>
<HasOpenCv>false</HasOpenCv>
<HasOpenCv Condition="Exists('$(OPENCV_PATH)') Or Exists('$(OPENCV_PATH_V31)')">true</HasOpenCv>
<UseZip>false</UseZip>
<UseZip Condition="Exists('$(ZLIB_PATH)')">true</UseZip>
</PropertyGroup>
<Choose>
@ -70,7 +75,33 @@
<UnitTestDlls>$(OutDir)mkl_cntk_s.dll;</UnitTestDlls>
</PropertyGroup>
</When>
</Choose>
</Choose>
<PropertyGroup Condition="$(UseZip)">
<ZipInclude>$(ZLIB_PATH)\include;$(ZLIB_PATH)\lib\libzip\include;</ZipInclude>
<ZipDefine>USE_ZIP</ZipDefine>
<ZipLibPath>$(ZLIB_PATH)\lib;</ZipLibPath>
<ZipLibs>zlib.lib;zip.lib;</ZipLibs>
</PropertyGroup>
<PropertyGroup Condition="Exists('$(OPENCV_PATH)')">
<OpenCvPath>$(OPENCV_PATH)</OpenCvPath>
<OpenCvVersion>300</OpenCvVersion>
</PropertyGroup>
<PropertyGroup Condition="Exists('$(OPENCV_PATH_V31)')">
<OpenCvPath>$(OPENCV_PATH_V31)</OpenCvPath>
<OpenCvVersion>310</OpenCvVersion>
</PropertyGroup>
<PropertyGroup Condition="$(HasOpenCv)">
<OpenCvInclude>$(OpenCvPath)\include;</OpenCvInclude>
<OpenCvWorld Condition="$(ReleaseBuild)">opencv_world$(OpenCvVersion)</OpenCvWorld>
<OpenCvWorld Condition="$(DebugBuild)">opencv_world$(OpenCvVersion)d</OpenCvWorld>
<OpenCvLib>$(OpenCvWorld).lib</OpenCvLib>
<OpenCvLibPath>$(OpenCvPath)\x64\vc12\lib</OpenCvLibPath>
<OpenCvBinPath>$(OpenCvPath)\x64\vc12\bin</OpenCvBinPath>
</PropertyGroup>
<PropertyGroup Condition="'$(CudaVersion)' == '7.5'">
<CudaPath>$(CUDA_PATH_V7_5)</CudaPath>

View file

@ -685,11 +685,6 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "EvalWrapper", "Source\Exten
{482999D1-B7E2-466E-9F8D-2119F93EAFD9} = {482999D1-B7E2-466E-9F8D-2119F93EAFD9}
EndProjectSection
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "CSEvalClient", "Source\Extensibility\CSEvalClient\CSEvalClient.csproj", "{41E11A59-62B2-4927-A4F8-F40B1B612C6C}"
ProjectSection(ProjectDependencies) = postProject
{EF766CAE-9CB1-494C-9153-0030631A6340} = {EF766CAE-9CB1-494C-9153-0030631A6340}
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Examples", "Examples", "{BD46CE02-3740-4526-80F6-CC7973B953E5}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Image", "Image", "{FC7E7EC7-6E6A-4518-81C6-DA60451C657A}"
@ -1051,8 +1046,6 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "ParallelBM", "ParallelBM",
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "SequenceToSequence", "SequenceToSequence", "{A1521DC4-C8EC-47BD-9E63-7BE30ED2EC26}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CPPEvalClient", "Source\Extensibility\CPPEvalClient\CPPEvalClient.vcxproj", "{578D52A0-3928-4405-A016-F016E8B49031}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "HtkDeserializers", "HtkDeserializers", "{977ECCB7-598D-4548-B95B-BACA9CC7D98B}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "DNN", "DNN", "{1DBB2575-F5C8-43F4-B982-D05D6ADC2F9B}"
@ -1140,6 +1133,18 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Scripts", "Scripts", "{6826
ProjectSection(SolutionItems) = preProject
Scripts\pytest.ini = Scripts\pytest.ini
Scripts\txt2ctf.py = Scripts\txt2ctf.py
Scripts\uci2ctf.py = Scripts\uci2ctf.py
EndProjectSection
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "ManagedEvalTests", "Tests\UnitTests\ManagedEvalTests\ManagedEvalTests.csproj", "{CC8DDDCB-D53A-4B30-8596-AEF1C493DB31}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Evaluation", "Evaluation", "{3385EBEA-5F97-4B2B-9F30-0E6D7F91B9CA}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "CSEvalClient", "Examples\Evaluation\CSEvalClient\CSEvalClient.csproj", "{1C6E6C53-1AA7-4B69-913E-B97BB5A872CF}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CPPEvalClient", "Examples\Evaluation\CPPEvalClient\CPPEvalClient.vcxproj", "{CCC07E8E-F33A-4AF7-9F60-93E2AA61C75E}"
ProjectSection(ProjectDependencies) = postProject
{482999D1-B7E2-466E-9F8D-2119F93EAFD9} = {482999D1-B7E2-466E-9F8D-2119F93EAFD9}
EndProjectSection
EndProject
Global
@ -1308,14 +1313,6 @@ Global
{EF766CAE-9CB1-494C-9153-0030631A6340}.Release_CpuOnly|x64.Build.0 = Release_CpuOnly|x64
{EF766CAE-9CB1-494C-9153-0030631A6340}.Release|x64.ActiveCfg = Release|x64
{EF766CAE-9CB1-494C-9153-0030631A6340}.Release|x64.Build.0 = Release|x64
{41E11A59-62B2-4927-A4F8-F40B1B612C6C}.Debug_CpuOnly|x64.ActiveCfg = Debug_CpuOnly|x64
{41E11A59-62B2-4927-A4F8-F40B1B612C6C}.Debug_CpuOnly|x64.Build.0 = Debug_CpuOnly|x64
{41E11A59-62B2-4927-A4F8-F40B1B612C6C}.Debug|x64.ActiveCfg = Debug|x64
{41E11A59-62B2-4927-A4F8-F40B1B612C6C}.Debug|x64.Build.0 = Debug|x64
{41E11A59-62B2-4927-A4F8-F40B1B612C6C}.Release_CpuOnly|x64.ActiveCfg = Release_CpuOnly|x64
{41E11A59-62B2-4927-A4F8-F40B1B612C6C}.Release_CpuOnly|x64.Build.0 = Release_CpuOnly|x64
{41E11A59-62B2-4927-A4F8-F40B1B612C6C}.Release|x64.ActiveCfg = Release|x64
{41E11A59-62B2-4927-A4F8-F40B1B612C6C}.Release|x64.Build.0 = Release|x64
{F0A9637C-20DA-42F0-83D4-23B4704DE602}.Debug_CpuOnly|x64.ActiveCfg = Debug_CpuOnly|x64
{F0A9637C-20DA-42F0-83D4-23B4704DE602}.Debug_CpuOnly|x64.Build.0 = Debug_CpuOnly|x64
{F0A9637C-20DA-42F0-83D4-23B4704DE602}.Debug|x64.ActiveCfg = Debug|x64
@ -1372,14 +1369,6 @@ Global
{7B7A563D-AA8E-4660-A805-D50235A02120}.Release_CpuOnly|x64.Build.0 = Release_CpuOnly|x64
{7B7A563D-AA8E-4660-A805-D50235A02120}.Release|x64.ActiveCfg = Release|x64
{7B7A563D-AA8E-4660-A805-D50235A02120}.Release|x64.Build.0 = Release|x64
{578D52A0-3928-4405-A016-F016E8B49031}.Debug_CpuOnly|x64.ActiveCfg = Debug_CpuOnly|x64
{578D52A0-3928-4405-A016-F016E8B49031}.Debug_CpuOnly|x64.Build.0 = Debug_CpuOnly|x64
{578D52A0-3928-4405-A016-F016E8B49031}.Debug|x64.ActiveCfg = Debug|x64
{578D52A0-3928-4405-A016-F016E8B49031}.Debug|x64.Build.0 = Debug|x64
{578D52A0-3928-4405-A016-F016E8B49031}.Release_CpuOnly|x64.ActiveCfg = Release_CpuOnly|x64
{578D52A0-3928-4405-A016-F016E8B49031}.Release_CpuOnly|x64.Build.0 = Release_CpuOnly|x64
{578D52A0-3928-4405-A016-F016E8B49031}.Release|x64.ActiveCfg = Release|x64
{578D52A0-3928-4405-A016-F016E8B49031}.Release|x64.Build.0 = Release|x64
{82125DA1-1CD7-45B5-9281-E6AE7C287CB7}.Debug_CpuOnly|x64.ActiveCfg = Debug_CpuOnly|x64
{82125DA1-1CD7-45B5-9281-E6AE7C287CB7}.Debug_CpuOnly|x64.Build.0 = Debug_CpuOnly|x64
{82125DA1-1CD7-45B5-9281-E6AE7C287CB7}.Debug|x64.ActiveCfg = Debug|x64
@ -1412,6 +1401,30 @@ Global
{F4CC3AB2-0DB2-4281-929A-2E68E30F0F6E}.Release_CpuOnly|x64.Build.0 = Release_CpuOnly|x64
{F4CC3AB2-0DB2-4281-929A-2E68E30F0F6E}.Release|x64.ActiveCfg = Release|x64
{F4CC3AB2-0DB2-4281-929A-2E68E30F0F6E}.Release|x64.Build.0 = Release|x64
{CC8DDDCB-D53A-4B30-8596-AEF1C493DB31}.Debug_CpuOnly|x64.ActiveCfg = Debug_CpuOnly|x64
{CC8DDDCB-D53A-4B30-8596-AEF1C493DB31}.Debug_CpuOnly|x64.Build.0 = Debug_CpuOnly|x64
{CC8DDDCB-D53A-4B30-8596-AEF1C493DB31}.Debug|x64.ActiveCfg = Debug|x64
{CC8DDDCB-D53A-4B30-8596-AEF1C493DB31}.Debug|x64.Build.0 = Debug|x64
{CC8DDDCB-D53A-4B30-8596-AEF1C493DB31}.Release_CpuOnly|x64.ActiveCfg = Release_CpuOnly|x64
{CC8DDDCB-D53A-4B30-8596-AEF1C493DB31}.Release_CpuOnly|x64.Build.0 = Release_CpuOnly|x64
{CC8DDDCB-D53A-4B30-8596-AEF1C493DB31}.Release|x64.ActiveCfg = Release|x64
{CC8DDDCB-D53A-4B30-8596-AEF1C493DB31}.Release|x64.Build.0 = Release|x64
{1C6E6C53-1AA7-4B69-913E-B97BB5A872CF}.Debug_CpuOnly|x64.ActiveCfg = Debug_CpuOnly|x64
{1C6E6C53-1AA7-4B69-913E-B97BB5A872CF}.Debug_CpuOnly|x64.Build.0 = Debug_CpuOnly|x64
{1C6E6C53-1AA7-4B69-913E-B97BB5A872CF}.Debug|x64.ActiveCfg = Debug|x64
{1C6E6C53-1AA7-4B69-913E-B97BB5A872CF}.Debug|x64.Build.0 = Debug|x64
{1C6E6C53-1AA7-4B69-913E-B97BB5A872CF}.Release_CpuOnly|x64.ActiveCfg = Release_CpuOnly|x64
{1C6E6C53-1AA7-4B69-913E-B97BB5A872CF}.Release_CpuOnly|x64.Build.0 = Release_CpuOnly|x64
{1C6E6C53-1AA7-4B69-913E-B97BB5A872CF}.Release|x64.ActiveCfg = Release|x64
{1C6E6C53-1AA7-4B69-913E-B97BB5A872CF}.Release|x64.Build.0 = Release|x64
{CCC07E8E-F33A-4AF7-9F60-93E2AA61C75E}.Debug_CpuOnly|x64.ActiveCfg = Debug_CpuOnly|x64
{CCC07E8E-F33A-4AF7-9F60-93E2AA61C75E}.Debug_CpuOnly|x64.Build.0 = Debug_CpuOnly|x64
{CCC07E8E-F33A-4AF7-9F60-93E2AA61C75E}.Debug|x64.ActiveCfg = Debug|x64
{CCC07E8E-F33A-4AF7-9F60-93E2AA61C75E}.Debug|x64.Build.0 = Debug|x64
{CCC07E8E-F33A-4AF7-9F60-93E2AA61C75E}.Release_CpuOnly|x64.ActiveCfg = Release_CpuOnly|x64
{CCC07E8E-F33A-4AF7-9F60-93E2AA61C75E}.Release_CpuOnly|x64.Build.0 = Release_CpuOnly|x64
{CCC07E8E-F33A-4AF7-9F60-93E2AA61C75E}.Release|x64.ActiveCfg = Release|x64
{CCC07E8E-F33A-4AF7-9F60-93E2AA61C75E}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
@ -1502,7 +1515,6 @@ Global
{E6DC3B7D-303D-4A54-B040-D8DCF8C56E17} = {8C128B1D-87E0-4643-AB93-2581589AE425}
{06D2C644-AE5F-4C30-A1F6-C78E2845AAB1} = {EF710C5A-E616-442A-889D-C997D39AF2E1}
{EF766CAE-9CB1-494C-9153-0030631A6340} = {60F87E25-BC87-4782-8E20-1621AAEBB113}
{41E11A59-62B2-4927-A4F8-F40B1B612C6C} = {60F87E25-BC87-4782-8E20-1621AAEBB113}
{BD46CE02-3740-4526-80F6-CC7973B953E5} = {6E565B48-1923-49CE-9787-9BBB9D96F4C5}
{FC7E7EC7-6E6A-4518-81C6-DA60451C657A} = {BD46CE02-3740-4526-80F6-CC7973B953E5}
{CEADE942-4077-4577-ACF9-41C04388DDC0} = {BD46CE02-3740-4526-80F6-CC7973B953E5}
@ -1552,7 +1564,6 @@ Global
{4D6F731C-4A6D-4E21-AC3C-9E1F26E5547E} = {6994C86D-A672-4254-824A-51F4DFEB807F}
{36C42845-0D48-4A46-9C67-2B593A80A09C} = {6994C86D-A672-4254-824A-51F4DFEB807F}
{A1521DC4-C8EC-47BD-9E63-7BE30ED2EC26} = {47755F2E-D674-4175-9E38-8EA053455072}
{578D52A0-3928-4405-A016-F016E8B49031} = {60F87E25-BC87-4782-8E20-1621AAEBB113}
{977ECCB7-598D-4548-B95B-BACA9CC7D98B} = {C47CDAA5-6D6C-429E-BC89-7CA0F868FDC8}
{1DBB2575-F5C8-43F4-B982-D05D6ADC2F9B} = {977ECCB7-598D-4548-B95B-BACA9CC7D98B}
{772A0DB3-4710-4281-8AA9-A9F1F7C543D3} = {977ECCB7-598D-4548-B95B-BACA9CC7D98B}
@ -1568,5 +1579,9 @@ Global
{731312A8-6DA3-4841-AFCD-57520BA1BF8E} = {6F19321A-65E7-4829-B00C-3886CD6C6EDE}
{E5606ECE-48CA-4464-BB12-09D81D02B9EF} = {DD043083-71A4-409A-AA91-F9C548DCF7EC}
{F4CC3AB2-0DB2-4281-929A-2E68E30F0F6E} = {6F19321A-65E7-4829-B00C-3886CD6C6EDE}
{CC8DDDCB-D53A-4B30-8596-AEF1C493DB31} = {6F19321A-65E7-4829-B00C-3886CD6C6EDE}
{3385EBEA-5F97-4B2B-9F30-0E6D7F91B9CA} = {47755F2E-D674-4175-9E38-8EA053455072}
{1C6E6C53-1AA7-4B69-913E-B97BB5A872CF} = {3385EBEA-5F97-4B2B-9F30-0E6D7F91B9CA}
{CCC07E8E-F33A-4AF7-9F60-93E2AA61C75E} = {3385EBEA-5F97-4B2B-9F30-0E6D7F91B9CA}
EndGlobalSection
EndGlobal

View file

@ -863,38 +863,27 @@ The dimension reduced matrix consisting of the maximum value within each pooling
This function is often associated with Convolution() operations.
### Delay
### PastValue, FutureValue
The Delay node is used in recurrent networks; it allows creating a loop in the computational network that repeats a specified number of times.
PastValue and FutureValue nodes are used in recurrent networks; they allow creating a loop in the computational network that repeats a specified number of times. PastValue retrieves the value of a node several time steps in the past, while FutureValue retrieves the value of a node from the future.
`Delay(rows, [cols], delayNode, delayTime=1, needGradient=true, defaultHiddenActivity=0.1)`
`PastValue(rows, [cols], node, timeStep=1, defaultHiddenActivity=0.1)`
`FutureValue(rows, [cols], node, timeStep=1, defaultHiddenActivity=0.1)`
#### Parameters
`cvweight` – convolution weight matrix, it has the dimensions of \[outputChannels, kernelWidth \* kernelHeight \* inputChannels\]
`rows` – number of rows in the node
`kernelWidth` – width of the kernel
`cols` – number of columns in the node. This value is often omitted since the length of a sequence varies
`kernelHeight` – height of the kernel
`timeStep` – \[default = 1\] number of time steps toward the past and future
`outputChannels` – number of output channels
`horizontalSubsample` – subsamples in the horizontal direction
`verticalSubsample` – subsamples in the vertical direction
#### Optional Parameters
`delayTime` – \[default = 1\] the amount of delay that will be introduced (number of times the loop will happen)
`needGradient` – \[default = true\] does the gradient need to be computed for this node
`defaultHiddenActivity` – \[default = 0.1\] the numerical amount for the defaultHiddenActivity
`defaultHiddenActivity` – \[default = 0.1\] default value to use when crossing the sequence boundary or when the value is missing.
#### Returns
The results of the completed Delay loop
Either the past or future value of a node
#### Notes
This node is used in recurrent networks, where a delay is introduced to examine values from a previous time, such as the prior value (t-1). This has the effect of creating a loop in the computational network that will repeat delayTime number of iterations.
This node is used in recurrent networks, where a past value is introduced to examine values from a previous time, such as the prior value (t-1). This has the effect of creating a loop in the computational network.
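#### Example
A minimal recurrence sketch, for illustration only; `hiddenDim`, `features`, and the parameters `W`, `R`, and `b` are assumed to be defined elsewhere in the network description:

`prevH = PastValue(hiddenDim, h, timeStep=1)`

`h = Sigmoid(Plus(Plus(Times(W, features), Times(R, prevH)), b))`

Here `h` is defined in terms of its own PastValue, which is what creates the recurrent loop; FutureValue is used the same way to reference values from upcoming time steps.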

View file

@ -4,9 +4,10 @@
//
// CPPEvalClient.cpp : Sample application using the evaluation interface from C++
//
#include "stdafx.h"
#include "eval.h"
#include "Eval.h"
#ifdef _WIN32
#include "Windows.h"
#endif
using namespace Microsoft::MSR::CNTK;
@ -23,41 +24,38 @@ typedef std::map<std::wstring, std::vector<float>*> Layer;
/// <description>
/// This program is a native C++ client using the native evaluation interface
/// located in the <see cref="eval.h"/> file.
The CNTK evaluation DLL (EvalDLL.dll) must be found through the system's path.
The CNTK evaluation library (EvalDLL.dll on Windows, LibEval.so on Linux) must be found through the system's path.
/// The other requirement is that Eval.h be included
/// In order to run this program the model must already exist in the example. To create the model,
/// first run the example in <CNTK>/Examples/Image/MNIST. Once the model file 01_OneHidden is created,
/// you can run this client.
/// This program demonstrates the usage of the Evaluate method requiring the input and output layers as parameters.
int _tmain(int argc, _TCHAR* argv[])
int main(int argc, char* argv[])
{
// Get the binary path (current working directory)
argc = 0;
std::wstring wapp(argv[0]);
std::string app(wapp.begin(), wapp.end());
std::string path = app.substr(0, app.rfind("\\"));
// Load the eval library
auto hModule = LoadLibrary(L"evaldll.dll");
if (hModule == nullptr)
{
const std::wstring msg(L"Cannot find evaldll.dll library");
const std::string ex(msg.begin(), msg.end());
throw new std::exception(ex.c_str());
}
// Get the factory method to the evaluation engine
std::string func = "GetEvalF";
auto procAddress = GetProcAddress(hModule, func.c_str());
auto getEvalProc = (GetEvalProc<float>)procAddress;
// Native model evaluation instance
argc = 0;
std::string app = argv[0];
std::string path;
IEvaluateModel<float> *model;
getEvalProc(&model);
size_t pos;
// This relative path assumes launching from CNTK's binary folder
const std::string modelWorkingDirectory = path + "\\..\\..\\Examples\\Image\\MNIST\\Data\\";
const std::string modelFilePath = modelWorkingDirectory + "..\\Output\\Models\\01_OneHidden";
#ifdef _WIN32
pos = app.rfind("\\");
path = (pos == std::string::npos) ? "." : app.substr(0, pos);
// This relative path assumes launching from CNTK's binary folder, e.g. x64\Release
const std::string modelWorkingDirectory = path + "/../../Examples/Image/MNIST/Data/";
#else // on Linux
pos = app.rfind("/");
path = (pos == std::string::npos) ? "." : app.substr(0, pos);
// This relative path assumes launching from CNTK's binary folder, e.g. build/release/bin/
const std::string modelWorkingDirectory = path + "/../../../Examples/Image/MNIST/Data/";
#endif
GetEvalF(&model);
const std::string modelFilePath = modelWorkingDirectory + "../Output/Models/01_OneHidden";
// Load model with desired outputs
std::string networkConfiguration;
@ -97,7 +95,7 @@ int _tmain(int argc, _TCHAR* argv[])
// Output the results
fprintf(stderr, "Layer '%ls' output:\n", outputLayerName.c_str());
for each (auto& value in outputs)
for (auto& value : outputs)
{
fprintf(stderr, "%f\n", value);
}

View file

@ -19,7 +19,7 @@
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{578D52A0-3928-4405-A016-F016E8B49031}</ProjectGuid>
<ProjectGuid>{CCC07E8E-F33A-4AF7-9F60-93E2AA61C75E}</ProjectGuid>
<Keyword>Win32Proj</Keyword>
<RootNamespace>CPPEvalClient</RootNamespace>
</PropertyGroup>
@ -69,7 +69,7 @@
<AdditionalLibraryDirectories>$(OutDir)</AdditionalLibraryDirectories>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>EvalDLL.lib;%(AdditionalDependencies)</AdditionalDependencies>
<DelayLoadDLLs>%(DelayLoadDLLs)</DelayLoadDLLs>
<Profile>true</Profile>
</Link>
@ -104,15 +104,8 @@
<LinkLibraryDependencies>true</LinkLibraryDependencies>
</ProjectReference>
</ItemDefinitionGroup>
<ItemGroup>
<ClInclude Include="stdafx.h" />
<ClInclude Include="targetver.h" />
</ItemGroup>
<ItemGroup>
<ClCompile Include="CPPEvalClient.cpp" />
<ClCompile Include="stdafx.cpp">
<PrecompiledHeader>Create</PrecompiledHeader>
</ClCompile>
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets" />

View file

@ -15,17 +15,6 @@
</Filter>
</ItemGroup>
<ItemGroup>
<ClInclude Include="stdafx.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="targetver.h">
<Filter>Header Files</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<ClCompile Include="stdafx.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="CPPEvalClient.cpp">
<Filter>Source Files</Filter>
</ClCompile>

View file

@ -0,0 +1,84 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="12.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<Import Project="$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props" Condition="Exists('$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props')" />
<PropertyGroup>
<Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
<Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
<ProjectGuid>{1C6E6C53-1AA7-4B69-913E-B97BB5A872CF}</ProjectGuid>
<OutputType>Exe</OutputType>
<AppDesignerFolder>Properties</AppDesignerFolder>
<RootNamespace>Microsoft.MSR.CNTK.Extensibility.Managed.CSEvalClient</RootNamespace>
<AssemblyName>CSEvalClient</AssemblyName>
<TargetFrameworkVersion>v4.5</TargetFrameworkVersion>
<FileAlignment>512</FileAlignment>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)' == 'Debug|x64'">
<DebugSymbols>true</DebugSymbols>
<OutputPath>..\..\..\x64\Debug\</OutputPath>
<DefineConstants>DEBUG;TRACE</DefineConstants>
<DebugType>full</DebugType>
<PlatformTarget>x64</PlatformTarget>
<ErrorReport>prompt</ErrorReport>
<CodeAnalysisRuleSet>MinimumRecommendedRules.ruleset</CodeAnalysisRuleSet>
<Prefer32Bit>true</Prefer32Bit>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)' == 'Debug_CpuOnly|x64'">
<DebugSymbols>true</DebugSymbols>
<OutputPath>..\..\..\x64\Debug_CpuOnly\</OutputPath>
<DefineConstants>DEBUG;TRACE</DefineConstants>
<DebugType>full</DebugType>
<PlatformTarget>x64</PlatformTarget>
<ErrorReport>prompt</ErrorReport>
<CodeAnalysisRuleSet>MinimumRecommendedRules.ruleset</CodeAnalysisRuleSet>
<Prefer32Bit>true</Prefer32Bit>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)' == 'Release|x64'">
<OutputPath>..\..\..\x64\Release\</OutputPath>
<DefineConstants>TRACE</DefineConstants>
<Optimize>true</Optimize>
<DebugType>pdbonly</DebugType>
<PlatformTarget>x64</PlatformTarget>
<ErrorReport>prompt</ErrorReport>
<CodeAnalysisRuleSet>MinimumRecommendedRules.ruleset</CodeAnalysisRuleSet>
<Prefer32Bit>true</Prefer32Bit>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)' == 'Release_CpuOnly|x64'">
<OutputPath>..\..\..\x64\Release_CpuOnly\</OutputPath>
<DefineConstants>TRACE</DefineConstants>
<Optimize>true</Optimize>
<DebugType>pdbonly</DebugType>
<PlatformTarget>x64</PlatformTarget>
<ErrorReport>prompt</ErrorReport>
<CodeAnalysisRuleSet>MinimumRecommendedRules.ruleset</CodeAnalysisRuleSet>
<Prefer32Bit>true</Prefer32Bit>
</PropertyGroup>
<ItemGroup>
<Reference Include="System" />
<Reference Include="System.Core" />
<Reference Include="Microsoft.CSharp" />
<Reference Include="System.Drawing" />
</ItemGroup>
<ItemGroup>
<Compile Include="CntkBitmapExtensions.cs" />
<Compile Include="ModelEvaluator.cs" />
<Compile Include="Program.cs" />
<Compile Include="Properties\AssemblyInfo.cs" />
</ItemGroup>
<ItemGroup>
<None Include="App.config" />
</ItemGroup>
<ItemGroup>
<ProjectReference Include="..\..\..\Source\Extensibility\EvalWrapper\EvalWrapper.vcxproj">
<Project>{ef766cae-9cb1-494c-9153-0030631a6340}</Project>
<Name>EvalWrapper</Name>
</ProjectReference>
</ItemGroup>
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
<!-- To modify your build process, add your task inside one of the targets below and uncomment it.
Other similar extension points exist, see Microsoft.Common.targets.
<Target Name="BeforeBuild">
</Target>
<Target Name="AfterBuild">
</Target>
-->
</Project>

View file

@ -0,0 +1,212 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
// CntkBitmapExtensions.cs -- extension methods for transforming images used in CNTK.
//
using System;
using System.Collections.Generic;
using System.Drawing;
using System.Drawing.Imaging;
using System.Linq;
using System.Runtime.InteropServices;
using System.Threading.Tasks;
namespace Microsoft.MSR.CNTK.Extensibility.Managed.CSEvalClient
{
public static class CntkBitmapExtensions
{
/// <summary>
/// Resizes an image
/// </summary>
/// <param name="image">The image to resize</param>
/// <param name="width">New width in pixels</param>
/// <param name="height">New height in pixesl</param>
/// <param name="useHighQuality">Resize quality</param>
/// <returns>The resized image</returns>
public static Bitmap Resize(this Bitmap image, int width, int height, bool useHighQuality)
{
var newImg = new Bitmap(width, height);
newImg.SetResolution(image.HorizontalResolution, image.VerticalResolution);
using (var g = Graphics.FromImage(newImg))
{
g.CompositingMode = System.Drawing.Drawing2D.CompositingMode.SourceCopy;
if (useHighQuality)
{
g.InterpolationMode = System.Drawing.Drawing2D.InterpolationMode.HighQualityBicubic;
g.CompositingQuality = System.Drawing.Drawing2D.CompositingQuality.HighQuality;
g.SmoothingMode = System.Drawing.Drawing2D.SmoothingMode.HighQuality;
g.PixelOffsetMode = System.Drawing.Drawing2D.PixelOffsetMode.HighQuality;
}
else
{
g.InterpolationMode = System.Drawing.Drawing2D.InterpolationMode.Default;
g.CompositingQuality = System.Drawing.Drawing2D.CompositingQuality.Default;
g.SmoothingMode = System.Drawing.Drawing2D.SmoothingMode.Default;
g.PixelOffsetMode = System.Drawing.Drawing2D.PixelOffsetMode.Default;
}
var attributes = new ImageAttributes();
attributes.SetWrapMode(System.Drawing.Drawing2D.WrapMode.TileFlipXY);
g.DrawImage(image, new Rectangle(0, 0, width, height), 0, 0, image.Width, image.Height, GraphicsUnit.Pixel, attributes);
}
return newImg;
}
/// <summary>
/// Extracts image pixels in CHW
/// </summary>
/// <param name="image">The bitmap image to extract features from</param>
/// <returns>A list of pixels in CHW order</returns>
public static List<float> ExtractCHW(this Bitmap image)
{
var features = new List<float>(image.Width * image.Height * 3);
for (int c = 0; c < 3; c++)
{
for (int h = 0; h < image.Height; h++)
{
for (int w = 0; w < image.Width; w++)
{
var pixel = image.GetPixel(w, h);
float v = c == 0 ? pixel.B : c == 1 ? pixel.G : pixel.R;
features.Add(v);
}
}
}
return features;
}
/// <summary>
/// Extracts image pixels in CHW using parallelization
/// </summary>
/// <param name="image">The bitmap image to extract features from</param>
/// <returns>A list of pixels in CHW order</returns>
public static List<float> ParallelExtractCHW(this Bitmap image)
{
// We use local variables to avoid contention on the image object through the multiple threads.
int channelStride = image.Width * image.Height;
int imageWidth = image.Width;
int imageHeight = image.Height;
var features = new byte[imageWidth * imageHeight * 3];
var bitmapData = image.LockBits(new Rectangle(0, 0, imageWidth, imageHeight), ImageLockMode.ReadOnly, image.PixelFormat);
IntPtr ptr = bitmapData.Scan0;
int bytes = Math.Abs(bitmapData.Stride) * bitmapData.Height;
byte[] rgbValues = new byte[bytes];
int stride = bitmapData.Stride;
// Copy the RGB values into the array.
System.Runtime.InteropServices.Marshal.Copy(ptr, rgbValues, 0, bytes);
// The mapping depends on the pixel format
// The mapPixel lambda will return the right color channel for the desired pixel
Func<int, int, int, int> mapPixel = GetPixelMapper(image.PixelFormat, stride);
Parallel.For(0, imageHeight, (int h) =>
{
Parallel.For(0, imageWidth, (int w) =>
{
Parallel.For(0, 3, (int c) =>
{
features[channelStride * c + imageWidth * h + w] = rgbValues[mapPixel(h, w, c)];
});
});
});
image.UnlockBits(bitmapData);
return features.Select(b => (float)b).ToList();
}
/// <summary>
/// Extracts image pixels in HWC
/// </summary>
/// <param name="image">The bitmap image to extract features from</param>
/// <returns>A list of pixels in HWC order</returns>
public static List<float> ExtractHWC(this Bitmap image)
{
var features = new List<float>(image.Width * image.Height * 3);
for (int w = 0; w < image.Width; w++)
{
for (int h = 0; h < image.Height; h++)
{
for (int c = 0; c < 3; c++)
{
var pixel = image.GetPixel(w, h);
float v = c == 0 ? pixel.B : c == 1 ? pixel.G : pixel.R;
features.Add(v);
}
}
}
return features;
}
/// <summary>
/// Extracts image pixels in HWC using multiple threads
/// </summary>
/// <param name="image">The bitmap image to extract features from</param>
/// <returns>A list of pixels in HWC order</returns>
public static List<float> ParallelExtractHWC(this Bitmap image)
{
int heightStride = image.Width * 3;
int widthStride = image.Height * 3;
int imageWidth = image.Width;
int imageHeight = image.Height;
var features = new byte[image.Width * image.Height * 3];
var bitmapData = image.LockBits(new Rectangle(0, 0, image.Width, image.Height), ImageLockMode.ReadOnly, image.PixelFormat);
IntPtr ptr = bitmapData.Scan0;
int bytes = Math.Abs(bitmapData.Stride) * bitmapData.Height;
byte[] rgbValues = new byte[bytes];
int stride = bitmapData.Stride;
// Copy the RGB values into the array.
System.Runtime.InteropServices.Marshal.Copy(ptr, rgbValues, 0, bytes);
// The mapping depends on the pixel format
// The mapPixel lambda will return the right color channel for the desired pixel
Func<int, int, int, int> mapPixel = GetPixelMapper(image.PixelFormat, stride);
Parallel.For(0, 3, (int c) =>
{
Parallel.For(0, imageHeight, (int h) =>
{
Parallel.For(0, imageWidth, (int w) =>
{
features[w * widthStride + h * 3 + c] = rgbValues[mapPixel(h, w, c)];
});
});
});
image.UnlockBits(bitmapData);
return features.Select(b => (float)b).ToList();
}
/// <summary>
/// Returns a function for extracting the R-G-B values properly from an image based on its pixel format
/// </summary>
/// <param name="pixelFormat">The image's pixel format</param>
/// <param name="heightStride">The stride (row byte count)</param>
/// <returns>A function with signature (height, width, channel) returning the corresponding color value</returns>
private static Func<int, int, int, int> GetPixelMapper(PixelFormat pixelFormat, int heightStride)
{
switch (pixelFormat)
{
case PixelFormat.Format32bppArgb:
return (h, w, c) => h * heightStride + w * 4 + c; // bytes are B-G-R-A
case PixelFormat.Format24bppRgb:
default:
return (h, w, c) => h * heightStride + w * 3 + c; // bytes are B-G-R
}
}
}
}

View file

@ -0,0 +1,214 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
// ModelEvaluator.cs -- wrapper for a network so it can be evaluated one call at a time.
//
// THIS CODE IS FOR ILLUSTRATION PURPOSES ONLY. NOT FOR PRODUCTION.
//
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Linq;
using System.Threading;
namespace Microsoft.MSR.CNTK.Extensibility.Managed.CSEvalClient
{
/// <summary>
/// This class provides an Eval model wrapper to restrict model evaluation calls to one at a time.
/// </summary>
/// <remarks>
/// This class is not thread-safe except through the static methods.
/// Each ModelEvaluator instance wraps an Eval model, and exposes the Evaluate method for either
/// a vector of inputs or a record string.
/// The static interface provides the management of the concurrency of the models and restricts
/// the evaluations to a single thread.
/// </remarks>
public sealed class ModelEvaluator
{
/// <summary>
/// The cntk model evaluation instance
/// </summary>
private readonly IEvaluateModelManagedF m_model;
/// <summary>
/// The input layer key
/// </summary>
private readonly string m_inKey;
/// <summary>
/// The output layer key
/// </summary>
private readonly string m_outKey;
/// <summary>
/// The model instance number
/// </summary>
private readonly int m_modelInstance;
/// <summary>
/// The input buffer
/// </summary>
private Dictionary<string, List<float>> m_inputs;
/// <summary>
/// Indicates if the object is disposed
/// </summary>
private static bool Disposed
{
get;
set;
}
/// <summary>
/// The ModelEvaluator's models to manage
/// </summary>
private static readonly BlockingCollection<ModelEvaluator> Models = new BlockingCollection<ModelEvaluator>();
/// <summary>
/// Initializes the Model Evaluator to process multiple models concurrently
/// </summary>
/// <param name="numConcurrentModels">The number of concurrent models</param>
/// <param name="modelFilePath">The model file path to load the model from</param>
/// <param name="numThreads"></param>
public static void Initialize(int numConcurrentModels, string modelFilePath, int numThreads = 1)
{
if (Disposed)
{
throw new CNTKRuntimeException("Model Evaluator has been disposed", string.Empty);
}
for (int i = 0; i < numConcurrentModels; i++)
{
Models.Add(new ModelEvaluator(modelFilePath, numThreads, i));
}
Disposed = false;
}
/// <summary>
/// Disposes of all models
/// </summary>
public static void DisposeAll()
{
Disposed = true;
foreach (var model in Models)
{
model.Dispose();
}
Models.Dispose();
}
/// <summary>
/// Evaluates a record containing the input data and the expected outcome value
/// </summary>
/// <param name="record">A tab-delimited string with the first entry being the expected value.</param>
/// <returns>true if the outcome is as expected, false otherwise</returns>
public static bool Evaluate(string record)
{
var model = Models.Take();
try
{
var outcome = model.EvaluateRecord(record);
return outcome;
}
finally
{
Models.Add(model);
}
}
/// <summary>
/// Evaluates a vector and returns the output vector
/// </summary>
/// <param name="inputs">The input vector</param>
/// <returns>The output vector</returns>
public static List<float> Evaluate(List<float> inputs)
{
var model = Models.Take();
try
{
var outcome = model.EvaluateInput(inputs);
return outcome;
}
finally
{
Models.Add(model);
}
}
/// <summary>
/// Creates an instance of the <see cref="ModelEvaluator"/> class.
/// </summary>
/// <param name="modelFilePath">The model file path</param>
/// <param name="numThreads">The number of concurrent threads for the model</param>
/// <param name="id">A unique id for the model</param>
/// <remarks>The id is used only for debugging purposes</remarks>
private ModelEvaluator(string modelFilePath, int numThreads, int id)
{
m_modelInstance = id;
m_model = new IEvaluateModelManagedF();
// Configure the model to run with a specific number of threads
m_model.Init(string.Format("numCPUThreads={0}", numThreads));
// Load model
m_model.CreateNetwork(string.Format("modelPath=\"{0}\"", modelFilePath), deviceId: -1);
// Generate random input values in the appropriate structure and size
var inDims = m_model.GetNodeDimensions(NodeGroup.Input);
m_inKey = inDims.First().Key;
m_inputs = new Dictionary<string, List<float>>() { { m_inKey, null } };
// We request the output layer name(s) and dimensions; we'll use the first one.
var outDims = m_model.GetNodeDimensions(NodeGroup.Output);
m_outKey = outDims.First().Key;
}
/// <summary>
/// Evaluates a test record
/// </summary>
/// <param name="record">A tab-delimited string containing as the first entry the expected outcome, values after that are the input data</param>
/// <returns>true if the record's expected outcome value matches the computed value</returns>
private bool EvaluateRecord(string record)
{
// The first value in the line is the expected label index for the record's outcome
int expected = int.Parse(record.Substring(0, record.IndexOf('\t')));
m_inputs[m_inKey] =
record.Substring(record.IndexOf('\t') + 1).Split('\t').Select(float.Parse).ToList();
// We can call the evaluate method and get back the results (single layer)...
var outputs = m_model.Evaluate(m_inputs, m_outKey);
// Retrieve the outcome index (so we can compare it with the expected index)
int index = 0;
var max = outputs.Select(v => new { Value = v, Index = index++ })
.Aggregate((a, b) => (a.Value > b.Value) ? a : b)
.Index;
return (expected == max);
}
/// <summary>
/// Evaluates an input vector against the model as the first defined input layer, and returns the first defined output layer
/// </summary>
/// <param name="inputs">Input vector</param>
/// <returns>The output vector</returns>
private List<float> EvaluateInput(List<float> inputs)
{
return m_model.Evaluate(new Dictionary<string, List<float>>() { { m_inKey, inputs } }, m_outKey);
}
/// <summary>
/// Disposes of the resources
/// </summary>
private void Dispose()
{
m_model.Dispose();
}
}
}

View file

@ -7,9 +7,14 @@
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Linq.Expressions;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.MSR.CNTK.Extensibility.Managed;
namespace Microsoft.MSR.CNTK.Extensibility.Managed.CSEvalClient
{
@ -20,6 +25,9 @@ namespace Microsoft.MSR.CNTK.Extensibility.Managed.CSEvalClient
/// This program is a managed client using the CLIWrapper to run the model evaluator in CNTK.
/// There are four cases shown in this program related to model loading, network creation and evaluation.
///
/// To run this program from the CNTK binary drop, you must add the NuGet package for model evaluation first.
/// Refer to <see cref="https://github.com/Microsoft/CNTK/wiki/NuGet-Package"/> for information regarding the NuGet package for model evaluation.
///
/// EvaluateModelSingleLayer and EvaluateModelMultipleLayers
/// --------------------------------------------------------
/// These two cases require the 01_OneHidden model which is part of the <CNTK>/Examples/Image/MNIST example.
@ -30,6 +38,19 @@ namespace Microsoft.MSR.CNTK.Extensibility.Managed.CSEvalClient
/// ----------------------------------------------------------------
/// These two cases do not require a trained model (just the network description). These cases show how to extract values from a single forward-pass
/// without any input to the model.
///
/// EvaluateMultipleModels
/// ----------------------
/// This case requires the 02_Convolution model and the Test-28x28_cntk_text.txt test file which are part of the <CNTK>/Examples/Image/MNIST example.
/// Refer to <see cref="https://github.com/Microsoft/CNTK/blob/master/Examples/Image/MNIST/README.md"/> for how to train
/// the model used in this example.
///
/// EvaluateImageClassificationModel
/// -----------------------
/// This case requires the ResNet_18 trained model which can be downloaded from <see cref="https://www.cntk.ai/resnet/ResNet_18.model"/>.
/// This case shows how to evaluate a model that was trained with the ImageReader.
/// The input for evaluation needs to be transformed in a manner similar to what the ImageReader did during training.
///
/// </description>
class Program
{
@ -42,7 +63,7 @@ namespace Microsoft.MSR.CNTK.Extensibility.Managed.CSEvalClient
private static void Main(string[] args)
{
initialDirectory = Environment.CurrentDirectory;
Console.WriteLine("====== EvaluateModelSingleLayer ========");
EvaluateModelSingleLayer();
@ -55,6 +76,15 @@ namespace Microsoft.MSR.CNTK.Extensibility.Managed.CSEvalClient
Console.WriteLine("\n====== EvaluateNetworkSingleLayerNoInput ========");
EvaluateNetworkSingleLayerNoInput();
Console.WriteLine("\n====== EvaluateExtendedNetworkSingleLayerNoInput ========");
EvaluateExtendedNetworkSingleLayerNoInput();
Console.WriteLine("\n====== EvaluateMultipleModels ========");
EvaluateMultipleModels();
Console.WriteLine("\n====== EvaluateModelImageInput ========");
EvaluateImageClassificationModel();
Console.WriteLine("Press <Enter> to terminate.");
Console.ReadLine();
}
@ -83,11 +113,11 @@ namespace Microsoft.MSR.CNTK.Extensibility.Managed.CSEvalClient
model.CreateNetwork(string.Format("modelPath=\"{0}\"", modelFilePath), deviceId: -1);
// Generate random input values in the appropriate structure and size
var inDims = model.GetNodeDimensions(NodeGroup.nodeInput);
var inDims = model.GetNodeDimensions(NodeGroup.Input);
var inputs = GetDictionary(inDims.First().Key, inDims.First().Value, 255);
// We request the output layer name(s) and dimensions; we'll use the first one.
var outDims = model.GetNodeDimensions(NodeGroup.nodeOutput);
var outDims = model.GetNodeDimensions(NodeGroup.Output);
outputLayerName = outDims.First().Key;
// We can call the evaluate method and get back the results (single layer)...
outputs = model.Evaluate(inputs, outputLayerName);
@ -124,20 +154,20 @@ namespace Microsoft.MSR.CNTK.Extensibility.Managed.CSEvalClient
using (var model = new IEvaluateModelManagedF())
{
// Desired output layers
string hiddenLayerName = "h1.z";
string outputLayerName = "ol.z";
const string hiddenLayerName = "h1.z";
const string outputLayerName = "ol.z";
// Load model
string modelFilePath = Path.Combine(Environment.CurrentDirectory, @"..\Output\Models\01_OneHidden");
List<string> desiredOutputLayers = new List<string>() { hiddenLayerName, outputLayerName };
var desiredOutputLayers = new List<string>() { hiddenLayerName, outputLayerName };
model.CreateNetwork(string.Format("modelPath=\"{0}\"", modelFilePath), deviceId: -1, outputNodeNames: desiredOutputLayers);
// Generate random input values in the appropriate structure and size
var inDims = model.GetNodeDimensions(NodeGroup.nodeInput);
var inDims = model.GetNodeDimensions(NodeGroup.Input);
var inputs = GetDictionary(inDims.First().Key, inDims.First().Value, 255);
// We request the output layer name(s) and dimensions; we'll get both the hidden layer and the output layer
var outDims = model.GetNodeDimensions(NodeGroup.nodeOutput);
var outDims = model.GetNodeDimensions(NodeGroup.Output);
// We can preallocate the output structure and pass it in (multiple output layers)
outputs = new Dictionary<string, List<float>>()
@ -187,7 +217,7 @@ namespace Microsoft.MSR.CNTK.Extensibility.Managed.CSEvalClient
var inputs = new Dictionary<string, List<float>>() { { "features", new List<float>() { 1.0f } } };
// We can call the evaluate method and get back the results (single layer output)...
var outDims = model.GetNodeDimensions(NodeGroup.nodeOutput);
var outDims = model.GetNodeDimensions(NodeGroup.Output);
outputLayerName = outDims.First().Key;
outputs = model.Evaluate(inputs, outputLayerName);
}
@ -242,6 +272,209 @@ namespace Microsoft.MSR.CNTK.Extensibility.Managed.CSEvalClient
}
}
/// <summary>
/// Evaluates an extended network (without a model and without input) and obtains a single layer output
/// </summary>
private static void EvaluateExtendedNetworkSingleLayerNoInput()
{
const string modelDefinition = @"precision = ""float""
traceLevel = 1
run=NDLNetworkBuilder
NDLNetworkBuilder=[
v1 = Constant(1)
v2 = Constant(2, tag=""output"")
ol = Plus(v1, v2, tag=""output"")
FeatureNodes = (v1)
]";
try
{
// The examples assume the executable is running from the data folder
// We switch the current directory to the data folder (assuming the executable is in the <CNTK>/x64/Debug|Release folder)
string workingDirectory = Path.Combine(initialDirectory, @"..\..\Examples\Other\Simple2d\Config");
Environment.CurrentDirectory = initialDirectory;
using (var model = new ModelEvaluationExtendedF())
{
// Create the network
// This network (AddOperatorConstantNoInput.cntk) is a simple network consisting of a single binary operator (Plus)
// operating over two constants; therefore no input is necessary.
model.CreateNetwork(modelDefinition);
VariableSchema outputSchema = model.GetOutputSchema();
var outputNodeNames = outputSchema.Select(s => s.Name).ToList<string>();
model.StartForwardEvaluation(outputNodeNames);
var outputBuffer = outputSchema.CreateBuffers<float>();
var inputBuffer = new ValueBuffer<float>[0];
// We can call the evaluate method and get back the results...
model.ForwardPass(inputBuffer, outputBuffer);
// We expect two outputs: the v2 constant, and the ol Plus result
var expected = new float[][] { new float[] { 2 }, new float[] { 3 } };
Console.WriteLine("Expected values: {0}", string.Join(" - ", expected.Select(b => string.Join(", ", b)).ToList<string>()));
Console.WriteLine("Actual Values : {0}", string.Join(" - ", outputBuffer.Select(b => string.Join(", ", b.Buffer)).ToList<string>()));
}
}
catch (CNTKException ex)
{
Console.WriteLine("Error: {0}\nNative CallStack: {1}\n Inner Exception: {2}", ex.Message, ex.NativeCallStack, ex.InnerException != null ? ex.InnerException.Message : "No Inner Exception");
}
catch (Exception ex)
{
Console.WriteLine("Error: {0}\nCallStack: {1}\n Inner Exception: {2}", ex.Message, ex.StackTrace, ex.InnerException != null ? ex.InnerException.Message : "No Inner Exception");
}
}
/// <summary>
/// Evaluates multiple instances of a model in the same process.
/// </summary>
/// <remarks>
/// Although all models execute concurrently (multiple tasks), each model is evaluated with a single task at a time.
/// </remarks>
private static void EvaluateMultipleModels()
{
// Specifies the number of models in memory as well as the number of parallel tasks feeding these models (1 to 1)
int numConcurrentModels = 4;
// Specifies the number of times to iterate through the test file (epochs)
int numRounds = 1;
// Counts the number of evaluations across all models
int count = 0;
// Counts the number of failed evaluations (output != expected) across all models
int errorCount = 0;
// The examples assume the executable is running from the data folder
// We switch the current directory to the data folder (assuming the executable is in the <CNTK>/x64/Debug|Release folder)
Environment.CurrentDirectory = Path.Combine(initialDirectory, @"..\..\Examples\Image\MNIST\Data\");
// Load model
string modelFilePath = Path.Combine(Environment.CurrentDirectory, @"..\Output\Models\02_Convolution");
// Initializes the model instances
ModelEvaluator.Initialize(numConcurrentModels, modelFilePath);
string testfile = Path.Combine(Environment.CurrentDirectory, @"Test-28x28_cntk_text.txt");
Stopwatch sw = new Stopwatch();
sw.Start();
try
{
for (int i = 0; i < numRounds; i++)
{
// Feed each line to a single model in parallel
Parallel.ForEach(File.ReadLines(testfile), new ParallelOptions() { MaxDegreeOfParallelism = numConcurrentModels }, (line) =>
{
Interlocked.Increment(ref count);
// The file format corresponds to the CNTK Text Format Reader format (https://github.com/Microsoft/CNTK/wiki/CNTKTextFormat-Reader)
var sets = line.Split('|');
var labels = sets[1].Trim().Split(' ').Skip(1);
var features = sets[2].Trim().Split(' ').Skip(1);
// Retrieve the 1-hot vector with the label index
var expected = labels.Select(float.Parse).Select((v, index) => new { Value = v, Index = index })
.Aggregate((a, b) => (a.Value > b.Value) ? a : b)
.Index;
// Retrieve the features
var inputs = features.Select(float.Parse).ToList();
// We can call the evaluate method and get back the results (single layer)...
var outputs = ModelEvaluator.Evaluate(inputs);
// Retrieve the outcome index (so we can compare it with the expected index)
var max = outputs.Select((v, index) => new { Value = v, Index = index })
.Aggregate((a, b) => (a.Value > b.Value) ? a : b)
.Index;
// Count the errors
if (expected != max)
{
Interlocked.Increment(ref errorCount);
}
});
}
}
catch (CNTKException ex)
{
Console.WriteLine("Error: {0}\nNative CallStack: {1}\n Inner Exception: {2}", ex.Message, ex.NativeCallStack, ex.InnerException != null ? ex.InnerException.Message : "No Inner Exception");
}
catch (Exception ex)
{
Console.WriteLine("Error: {0}\nCallStack: {1}\n Inner Exception: {2}", ex.Message, ex.StackTrace, ex.InnerException != null ? ex.InnerException.Message : "No Inner Exception");
}
sw.Stop();
ModelEvaluator.DisposeAll();
Console.WriteLine("The file {0} was processed using {1} concurrent model(s) with an error rate of: {2:P2} ({3} error(s) out of {4} record(s)), and a throughput of {5:N2} records/sec", @"Test-28x28_cntk_text.txt",
numConcurrentModels, (float)errorCount / count, errorCount, count, (count + errorCount) * 1000.0 / sw.ElapsedMilliseconds);
}
/// <summary>
/// This method shows how to evaluate a trained image classification model
/// </summary>
public static void EvaluateImageClassificationModel()
{
try
{
// This example requires the ResNet_18 model.
// The model can be downloaded from <see cref="https://www.cntk.ai/resnet/ResNet_18.model"/>
// The model is assumed to be located at: <CNTK>\Examples\Image\Miscellaneous\ImageNet\ResNet
// along with a sample image file named "zebra.jpg".
string workingDirectory = Path.Combine(initialDirectory, @"..\..\Examples\Image\Miscellaneous\ImageNet\ResNet");
Environment.CurrentDirectory = initialDirectory;
List<float> outputs;
using (var model = new IEvaluateModelManagedF())
{
string modelFilePath = Path.Combine(workingDirectory, "ResNet_18.model");
model.CreateNetwork(string.Format("modelPath=\"{0}\"", modelFilePath), deviceId: -1);
// Prepare input value in the appropriate structure and size
var inDims = model.GetNodeDimensions(NodeGroup.Input);
if (inDims.First().Value != 224 * 224 * 3)
{
throw new CNTKRuntimeException(string.Format("The input dimension for {0} is {1} which is not the expected size of {2}.", inDims.First(), inDims.First().Value, 224 * 224 * 3), string.Empty);
}
// Transform the image
string imageFileName = Path.Combine(workingDirectory, "zebra.jpg");
Bitmap bmp = new Bitmap(Bitmap.FromFile(imageFileName));
var resized = bmp.Resize(224, 224, true);
var resizedCHW = resized.ParallelExtractCHW();
var inputs = new Dictionary<string, List<float>>() { {inDims.First().Key, resizedCHW } };
// We can call the evaluate method and get back the results (single layer output)...
var outDims = model.GetNodeDimensions(NodeGroup.Output);
outputs = model.Evaluate(inputs, outDims.First().Key);
}
// Retrieve the outcome index (so we can compare it with the expected index)
var max = outputs.Select((value, index) => new { Value = value, Index = index })
.Aggregate((a, b) => (a.Value > b.Value) ? a : b)
.Index;
Console.WriteLine("Outcome: {0}", max);
}
catch (CNTKException ex)
{
Console.WriteLine("Error: {0}\nNative CallStack: {1}\n Inner Exception: {2}", ex.Message, ex.NativeCallStack, ex.InnerException != null ? ex.InnerException.Message : "No Inner Exception");
}
catch (Exception ex)
{
Console.WriteLine("Error: {0}\nCallStack: {1}\n Inner Exception: {2}", ex.Message, ex.StackTrace, ex.InnerException != null ? ex.InnerException.Message : "No Inner Exception");
}
}
/// <summary>
/// Dumps the output to the console
/// </summary>

View file

@ -49,25 +49,21 @@ train = [
maxEpochs = 10
]
# Note: this reader crashes if randomization is turned on.
reader = [
readerType = "UCIFastReader"
# To get the data (Train-28x28.txt) please run `python mnist_convert.py`
# from the 'AdditionalFiles' folder. See README.md for details.
file = "$DataDir$/Train-28x28.txt"
features = [
dim = 784
start = 1
readerType = "CNTKTextFormatReader"
# See ../README.md for details on getting the data (Train-28x28_cntk_text.txt).
file = "$DataDir$/Train-28x28_cntk_text.txt"
input = [
features = [
dim = 784
format = "dense"
]
labels = [
dim = 10
format = "dense"
]
]
labels = [
dim = 1
start = 0
labelDim = 10
labelMappingFile = "$DataDir$/labelsmap.txt"
]
]
]
]
#######################################
@ -83,19 +79,17 @@ test = [
]
reader = [
readerType = "UCIFastReader"
file = "$DataDir$/Test-28x28.txt"
features = [
dim = 784
start = 1
]
labels = [
dim = 1
start = 0
labelDim = 10
labelMappingFile = "$DataDir$/labelsmap.txt"
readerType = "CNTKTextFormatReader"
file = "$DataDir$/Test-28x28_cntk_text.txt"
input = [
features = [
dim = 784
format = "dense"
]
labels = [
dim = 10
format = "dense"
]
]
]
]

View file

@ -7,7 +7,7 @@
|Purpose |This example demonstrates usage of the NDL (Network Description Language) to define networks.
|Network |NDLNetworkBuilder, simple feed forward and convolutional networks, cross entropy with softmax.
|Training |Stochastic gradient descent both with and without momentum.
|Comments |There are two config files, details are provided below.
|Comments |There are four config files, details are provided below.
## Running the example
@ -57,7 +57,7 @@ The output folder will be created inside Image/MNIST/.
### Config files
There are three config files and corresponding network description files in the 'Config' folder:
There are four config files and the corresponding network description files in the 'Config' folder:
1. 01_OneHidden.ndl is a simple, one-hidden-layer network that achieves a 2.3% error rate.
To run the sample, navigate to the Data folder and run the following command:
@ -74,7 +74,11 @@ As a result, it achieves around 0.8% of error after training for just 2 epochs (
To run the sample, navigate to the Data folder and run the following command:
`cntk configFile=../Config/03_ConvBatchNorm.cntk`
For more details, refer to .ndl and corresponding .cntk files.
4. 04_DeConv.ndl illustrates the usage of Deconvolution and Unpooling. It is a network with one Convolution, one Pooling, one Unpooling and one Deconvolution layer. In fact it is an auto-encoder network in which the Rectified Linear Unit (ReLU) or Sigmoid layer is replaced with Convolutional ReLU (for encoding) and Deconvolutional ReLU (for decoding) layers. The goal of the network is to reconstruct the original signal, with Mean Squared Error (MSE) used to minimize the reconstruction error. Generally such networks are used in semantic segmentation.
To run the sample, navigate to the Data folder and run the following command:
`cntk configFile=../Config/04_DeConv.cntk`
For more details, refer to .ndl and the corresponding .cntk files.
### Additional files

Binary data
Examples/Image/Miscellaneous/ImageNet/ResNet/zebra.jpg (new file, 92 KiB; binary file not displayed)

View file

@ -0,0 +1,226 @@
# The configuration file to build a language understanding model with the ATIS corpus.
# An LSTM model is built to tag each word in sentences with its semantic label.
WorkDir = work
DataDir = data
modelPath = $WorkDir$/ATIS.slot.lstm
parallelTrain = true
#stderr = $WorkDir$/log
command = Train:Output:Test
precision = "float"
deviceId = "-1" # change to "auto" to use GPUs
wordCount = 944 # number of words
labelCount = 127 # number of labels
# The command to train the LSTM model
Train = [
action = train
BrainScriptNetworkBuilder = [
inputDim = $wordCount$
labelDim = $labelCount$
featDim = inputDim*3 # contextual words are used as features: previous word, current word, next word.
embDim = 150
hiddenDim = 300
maxLayer = 1
initScale = 6
featuresPW = Input(inputDim) # the previous word
featuresCW = Input(inputDim) # the current word
featuresNW = Input(inputDim) # the next word
features = RowStack(featuresPW : featuresCW : featuresNW)
labels = Input(labelDim, tag = "label")
# embedding layer
emb = Parameter(embDim, featDim)
featEmbedded = emb * features
# build the LSTM stack
lstmDims[i:0..maxLayer-1] = hiddenDim
NoAuxInputHook (input, lstmState) = BS.Constants.None
lstmStack = BS.RNNs.RecurrentLSTMPStack (lstmDims,
cellDims=lstmDims,
featEmbedded,
inputDim=embDim,
previousHook=BS.RNNs.PreviousHC,
augmentInputHook=BS.RNNs.NoAuxInputHook,
augmentInputDim=0,
enableSelfStabilization=false)
lstmOutputLayer = Length (lstmStack)-1
LSTMoutput = lstmStack[lstmOutputLayer].h
W = Parameter(labelDim, hiddenDim, init = "uniform", initValueScale=initScale)
b = Parameter(labelDim, 1, init = "fixedValue", value=0)
outputs = W * LSTMoutput + b
cr = CrossEntropyWithSoftmax(labels, outputs)
criterionNodes = (cr)
evaluationNodes = (cr)
outputNodes = (outputs)
]
SGD = [
# maximum number of epochs
maxEpochs = 1 # set to 1 so this can be added to the regression test. Increase to 20 to get good accuracy
# for each epoch, maximum number of input samples(words) is set below
epochSize = 36000
# minibatchSize should be larger than the maximum sentence length
minibatchSize = 70
learningRatesPerSample = 0.01*2:0.005*12:0.001
gradUpdateType = "FSAdaGrad"
gradientClippingWithTruncation = true
clippingThresholdPerSample = 15.0
# number of minibatches to report progress
numMBsToShowResult = 100
firstMBsToShowResult = 10
# if validation shows that the model has no improvement, then back up to the previously
# estimated model and reduce the learning rate
loadBestModel = true
parallelTrain = [
parallelizationMethod = "DataParallelSGD"
parallelizationStartEpoch = 2
distributedMBReading = true
dataParallelSGD = [
gradientBits = 1
]
]
]
reader = [
readerType = "CNTKTextFormatReader"
file = "$DataDir$/ATIS.train.cntk.sparse"
randomize = true
input = [
featuresPW = [
alias = "PW" # previous word
dim = $wordCount$
format = "sparse"
]
featuresCW = [
alias = "CW" # current word
dim = $wordCount$
format = "sparse"
]
featuresNW = [
alias = "NW" # next word
dim = $wordCount$
format = "sparse"
]
labels = [
alias = "L" # label
dim = $labelCount$
format = "sparse"
]
]
]
]
# Evaluate the model to predict labels
Output = [
action = "write"
traceLevel = 1
epochSize = 0
defaultHiddenActivity = 0.1
BrainScriptNetworkBuilder = [
modelAsTrained = BS.Network.Load ("$modelPath$")
final = Hardmax(modelAsTrained.outputs)
]
outputPath = $WorkDir$/model.writeaction
outputNodeNames = final
reader = [
readerType = "CNTKTextFormatReader"
file = "$DataDir$/ATIS.test.cntk.sparse"
randomize = false
input = [
featuresPW = [
alias = "PW" # previous word
dim = $wordCount$
format = "sparse"
]
featuresCW = [
alias = "CW" # current word
dim = $wordCount$
format = "sparse"
]
featuresNW = [
alias = "NW" # next word
dim = $wordCount$
format = "sparse"
]
labels = [
alias = "L" # label
dim = $labelCount$
format = "sparse"
]
]
]
]
# Evaluate the model's accuracy
Test = [
action = "test"
traceLevel = 1
epochSize = 0
defaultHiddenActivity = 0.1
BrainScriptNetworkBuilder = [
labels = Input($labelCount$, tag = "label")
modelAsTrained = BS.Network.Load ("$modelPath$")
final = Hardmax(modelAsTrained.outputs)
errorRate = ErrorPrediction(labels, final, tag='evaluation')
]
evalNodeNames = errorRate
reader = [
readerType = "CNTKTextFormatReader"
file = "$DataDir$/ATIS.test.cntk.sparse"
randomize = false
input = [
featuresPW = [
alias = "PW" # previous word
dim = $wordCount$
format = "sparse"
]
featuresCW = [
alias = "CW" # current word
dim = $wordCount$
format = "sparse"
]
featuresNW = [
alias = "NW" # next word
dim = $wordCount$
format = "sparse"
]
labels = [
alias = "L" # label
dim = $labelCount$
format = "sparse"
]
]
]
]

View file

@ -0,0 +1,168 @@
# Build Language Understanding Models with CNTK
This example demonstrates how to use build language understanding model with CNTK using ATIS data set. This example is similar to
[SLU example](https://github.com/Microsoft/CNTK/tree/master/Examples/Text/Miscellaneous/SLU). They are different in that
- CNTKTextFormatReader is used here, instead of LUSequenceReader
- With CNTKTextFormatReader, the input format is much more flexible. In the example setting, sparse contextual feature vectors are explored
- Sparse label input is used.
The Air Travel Information System (ATIS) corpus is used for training and testing.
## Download the example
The data and configuration are checked in to GitHub. You can get them with the command:
`git clone https://github.com/Microsoft/cntk`
The example is under the folder:
`<cntk_root>\Examples\Text\ATIS`
## Data File Format
There are four files under the `data` sub-folder:
|Files |Content |
|:----------------------|:--------|
|ATIS.train.cntk.sparse |featurized training data set
|ATIS.test.cntk.sparse |featurized test data set
|ATIS.vocab |all words extracted from training data. Vocab size: 944
|ATIS.labels |all semantic labels extracted from training data. Total labels: 127
We preprocess ATIS data by converting words into word indexes, and labels into label IDs in order to use
[CNTKTextFormatReader](https://github.com/Microsoft/CNTK/wiki/CNTKTextFormat-Reader). You can use any
script/tool to preprocess your text data files. In this example, the data is already preprocessed.
The last two files, ATIS.vocab and ATIS.labels, are not strictly required to run the example. They are included for evaluation and debugging purposes:
e.g. they can be used to convert the .sparse files back to the original text files.
To understand the data format (two .sparse files), let's start with a sample sentence:
```
BOS i would like to find a flight from charlotte to Las Vegas that makes a stop in St. Louis EOS
```
It is converted into the following text:
```
1 |PW 1:1 |CW 1:1 |NW 12:1 |L 126:1
1 |PW 1:1 |CW 12:1 |NW 39:1 |L 126:1
1 |PW 12:1 |CW 39:1 |NW 28:1 |L 126:1
1 |PW 39:1 |CW 28:1 |NW 3:1 |L 126:1
1 |PW 28:1 |CW 3:1 |NW 86:1 |L 126:1
1 |PW 3:1 |CW 86:1 |NW 15:1 |L 126:1
1 |PW 86:1 |CW 15:1 |NW 10:1 |L 126:1
1 |PW 15:1 |CW 10:1 |NW 4:1 |L 126:1
1 |PW 10:1 |CW 4:1 |NW 101:1 |L 126:1
1 |PW 4:1 |CW 101:1 |NW 3:1 |L 48:1
1 |PW 101:1 |CW 3:1 |NW 92:1 |L 126:1
1 |PW 3:1 |CW 92:1 |NW 90:1 |L 78:1
1 |PW 92:1 |CW 90:1 |NW 33:1 |L 123:1
1 |PW 90:1 |CW 33:1 |NW 338:1 |L 126:1
1 |PW 33:1 |CW 338:1 |NW 15:1 |L 126:1
1 |PW 338:1 |CW 15:1 |NW 132:1 |L 126:1
1 |PW 15:1 |CW 132:1 |NW 17:1 |L 126:1
1 |PW 132:1 |CW 17:1 |NW 72:1 |L 126:1
1 |PW 17:1 |CW 72:1 |NW 144:1 |L 71:1
1 |PW 72:1 |CW 144:1 |NW 2:1 |L 119:1
1 |PW 144:1 |CW 2:1 |NW 2:1 |L 126:1
```
Here the first column identifies the sequence (sentence) ID, which is the same for all words of the same sentence. There are four input streams: PW, CW, NW, and L.
The input "PW" carries the previous word ID, "CW" the current word, and "NW" the next word. The input "L" carries the labels. The input names can be anything you
like, and you can add more inputs as needed, e.g. words in a bigger window.
The words "BOS" and "EOS" denote the beginning and end of a sentence, respectively.
Each line above represents one sample (word). For example, the line `1 |PW 4:1 |CW 101:1 |NW 3:1 |L 48:1` means:
* the sequence ID is 1
* the current word is "charlotte" whose word ID is 101
* the previous word is "from" whose ID is 4
* the next word is "to" whose ID is 3
* the semantic label is "B-fromloc.city_name" whose label ID is 48.
All word IDs, label IDs and corresponding words and labels are stored in ATIS.vocab and ATIS.labels.
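As a quick sanity check, one can decode a line of the .sparse files back into words and a label with a few lines of Python. The sketch below is only an illustration (it is not part of the checked-in example) and assumes that a token's ID is its zero-based line number in ATIS.vocab / ATIS.labels, as in the sample above.
```python
# Illustration only: decode one CTF sparse sample using ATIS.vocab and ATIS.labels.
# Assumes a token's ID equals its zero-based line number in the corresponding file.
def load_list(path):
    with open(path, encoding="utf-8") as f:
        return [line.rstrip("\n") for line in f]

vocab  = load_list("ATIS.vocab")    # index -> word
labels = load_list("ATIS.labels")   # index -> semantic label

def decode(ctf_line):
    seq_id, *streams = ctf_line.strip().split("|")
    decoded = {"seq": seq_id.strip()}
    for stream in streams:
        name, value = stream.split(None, 1)   # e.g. name="CW", value="101:1"
        index = int(value.split(":")[0])      # sparse entry is "index:value"
        decoded[name] = labels[index] if name == "L" else vocab[index]
    return decoded

print(decode("1 |PW 4:1 |CW 101:1 |NW 3:1 |L 48:1"))
# {'seq': '1', 'PW': 'from', 'CW': 'charlotte', 'NW': 'to', 'L': 'B-fromloc.city_name'}
```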
## CNTK Configuration
In this example, we use BrainScript to create a one-layer LSTM with embedding for slot tagging. The consolidated config file is ATIS.cntk. One can check the file (which contains comments)
for details, especially how the reader is configured.
reader=[
readerType = "CNTKTextFormatReader"
file = "$DataDir$/ATIS.train.cntk.sparse"
miniBatchMode = "partial"
randomize = true
input = [
featuresPW = [
alias = "PW" # previous word
dim = $wordCount$
format = "sparse"
]
featuresCW = [
alias = "CW" # current word
dim = $wordCount$
format = "sparse"
]
featuresNW = [
alias = "NW" # next word
dim = $wordCount$
format = "sparse"
]
labels = [
alias = "L" # label
dim = $labelCount$
format = "sparse"
]
]
]
The above section tells CNTK to use CNTKTextFormatReader to read data from the file "$DataDir$/ATIS.train.cntk.sparse". The same input names (PW, CW, NW, L) are used to refer to the inputs (features and labels) provided in the data files. The input is read into different
feature vectors: featuresPW, featuresCW, featuresNW and labels. These vectors are later used to build the LSTM network with BrainScript as follows.
```
featuresPW = Input(inputDim)
featuresCW = Input(inputDim)
featuresNW = Input(inputDim)
features = RowStack(featuresPW : featuresCW : featuresNW)
labels=Input(labelDim, tag="label")
# embedding layer
emb = LearnableParameter(embDim, featDim)
featEmbedded = Times(emb, features)
# build the LSTM stack
lstmDims[i:0..maxLayer] = hiddenDim
NoAuxInputHook (input, lstmState) = BS.Constants.None
lstmStack = BS.RNNs.RecurrentLSTMPStack (lstmDims,
cellDims=lstmDims,
featEmbedded,
inputDim=embDim,
previousHook=BS.RNNs.PreviousHC,
augmentInputHook=BS.RNNs.NoAuxInputHook,
augmentInputDim=0,
enableSelfStabilization=false)
lstmOutputLayer = Length (lstmStack)-1
LSTMoutput = lstmStack[lstmOutputLayer].h
```
A few other notes about the config:
- it is important to specify that the format is "sparse".
- gradUpdateType is set to FSAdaGrad. This setting yields better model accuracy than the other update methods.
- multiple LSTM layers can be used by changing the value of maxLayer.
Three commands are configured: Train, Output and Test. The command "Train" is used to train a model, "Output" is used to evaluate the model against a test set and store
the model output, and the command "Test" calculates the model's accuracy.
## Run the example
One can run the example locally or on Philly (for Microsoft internal users).
To run locally,
```sh
> mkdir work # the default work_dir
> # edit ATIS.cntk and set deviceId: -1 for CPU, auto for GPU
> cntk.exe configFile=ATIS.cntk
```
By default, maxEpochs is set to 1 to save training time. One can change it to a larger value, such as 20, in order to get good model accuracy.
Depending on the GPU, it normally takes about 20 minutes to run 20 epochs on a single GPU, and the slot F1 score is about 93.
**For Microsoft users only**, to run the job on Philly:
- first upload the data folder to the Philly cloud, e.g. `\\storage.gcr.philly.selfhost.corp.microsoft.com\pnrsy\<your_alias>\ATIS`
- then upload the config file to the Philly cloud, e.g. `\\storage.gcr.philly.selfhost.corp.microsoft.com\pnrsy_scratch\<your_alias>\ATIS`
- go to http://philly/ to create a new job by specifying the data folder and config file, and start the job.
More details about Philly, including how to upload data to Philly and start jobs, can be found [here](https://microsoft.sharepoint.com/teams/ATISG/SitePages/Philly%20Users%20Guide.aspx)


@ -0,0 +1,127 @@
B-aircraft_code
B-airline_code
B-airline_name
B-airport_code
B-airport_name
B-arrive_date.date_relative
B-arrive_date.day_name
B-arrive_date.day_number
B-arrive_date.month_name
B-arrive_date.today_relative
B-arrive_time.end_time
B-arrive_time.period_mod
B-arrive_time.period_of_day
B-arrive_time.start_time
B-arrive_time.time
B-arrive_time.time_relative
B-booking_class
B-city_name
B-class_type
B-compartment
B-connect
B-cost_relative
B-day_name
B-day_number
B-days_code
B-depart_date.date_relative
B-depart_date.day_name
B-depart_date.day_number
B-depart_date.month_name
B-depart_date.today_relative
B-depart_date.year
B-depart_time.end_time
B-depart_time.period_mod
B-depart_time.period_of_day
B-depart_time.start_time
B-depart_time.time
B-depart_time.time_relative
B-economy
B-fare_amount
B-fare_basis_code
B-flight
B-flight_days
B-flight_mod
B-flight_number
B-flight_stop
B-flight_time
B-fromloc.airport_code
B-fromloc.airport_name
B-fromloc.city_name
B-fromloc.state_code
B-fromloc.state_name
B-meal
B-meal_code
B-meal_description
B-mod
B-month_name
B-or
B-period_of_day
B-restriction_code
B-return_date.date_relative
B-return_date.day_name
B-return_date.day_number
B-return_date.month_name
B-return_date.today_relative
B-return_time.period_mod
B-return_time.period_of_day
B-round_trip
B-state_code
B-state_name
B-stoploc.airport_code
B-stoploc.airport_name
B-stoploc.city_name
B-stoploc.state_code
B-time
B-time_relative
B-today_relative
B-toloc.airport_code
B-toloc.airport_name
B-toloc.city_name
B-toloc.country_name
B-toloc.state_code
B-toloc.state_name
B-transport_type
I-airline_name
I-airport_name
I-arrive_date.day_number
I-arrive_time.end_time
I-arrive_time.period_of_day
I-arrive_time.start_time
I-arrive_time.time
I-arrive_time.time_relative
I-city_name
I-class_type
I-cost_relative
I-depart_date.day_number
I-depart_date.today_relative
I-depart_time.end_time
I-depart_time.period_of_day
I-depart_time.start_time
I-depart_time.time
I-depart_time.time_relative
I-economy
I-fare_amount
I-fare_basis_code
I-flight_mod
I-flight_number
I-flight_stop
I-flight_time
I-fromloc.airport_name
I-fromloc.city_name
I-fromloc.state_name
I-meal_code
I-meal_description
I-restriction_code
I-return_date.date_relative
I-return_date.day_number
I-return_date.today_relative
I-round_trip
I-state_name
I-stoploc.city_name
I-time
I-today_relative
I-toloc.airport_name
I-toloc.city_name
I-toloc.state_name
I-transport_type
O

Diff not shown because of its large size.

Diff not shown because of its large size.


@ -0,0 +1,944 @@
</s>
BOS
EOS
to
from
flights
the
on
what
me
flight
show
i
boston
san
a
denver
in
and
francisco
atlanta
is
pittsburgh
dallas
all
baltimore
list
philadelphia
like
are
airlines
of
between
that
washington
pm
leaving
please
morning
would
fly
for
city
fare
wednesday
first
need
after
trip
oakland
there
ground
round
does
transportation
'd
which
cheapest
you
arriving
class
before
available
american
new
fares
milwaukee
with
give
have
afternoon
york
st.
one
dc
at
way
monday
leave
arrive
airport
thursday
how
want
tuesday
nonstop
find
am
earliest
go
vegas
miami
las
united
information
orlando
phoenix
chicago
sunday
saturday
evening
charlotte
twenty
newark
can
delta
toronto
seattle
diego
kansas
indianapolis
houston
airline
noon
any
friday
lake
salt
's
next
us
o'clock
cleveland
continental
air
angeles
los
august
worth
do
fort
july
stop
code
5
seventh
early
memphis
tell
aircraft
downtown
or
june
6
louis
montreal
cincinnati
around
tomorrow
cost
going
latest
petersburg
tampa
many
minneapolis
nashville
8
get
mean
jose
detroit
10
an
departing
stopover
tacoma
by
about
twa
much
7
leaves
may
long
type
burbank
see
expensive
ticket
international
12
travel
could
dollars
than
daily
columbus
service
beach
'm
california
9
night
least
know
economy
time
4
depart
into
meal
paul
coach
book
april
airports
northwest
la
lowest
now
december
less
westchester
day
serves
it
serve
november
okay
arrives
used
field
love
last
ontario
second
county
return
kind
september
mitchell
general
as
stops
flying
2
third
be
direct
fifth
eighth
stopping
times
breakfast
out
make
capacity
car
take
schedule
seating
sixth
1000
number
goes
cities
dinner
connecting
3
dl
fourth
airfare
possible
this
has
served
meals
ninth
looking
also
restriction
week
late
eastern
returning
back
today
interested
price
business
most
prices
1991
two
types
flies
twentieth
will
through
limousine
ua
bwi
via
tenth
using
stand
plane
ap
fifteenth
guardia
same
1
should
other
arrangements
f
only
rental
then
display
your
shortest
wednesdays
listing
canadian
classes
again
numbers
thirtieth
florida
express
midwest
tickets
where
twelfth
sixteenth
h
north
eleventh
carolina
seventeenth
under
smallest
mco
distance
lunch
either
makes
if
qx
transport
far
hp
57
october
no
my
m80
thank
arizona
jfk
colorado
jersey
q
weekday
airplane
y
planes
some
departure
use
ewr
their
ohio
thirty
nineteenth
when
fourteenth
explain
layover
alaska
march
stopovers
live
people
traveling
serving
rent
hi
offer
later
yes
january
area
logan
right
booking
sfo
midnight
yn
but
during
landings
february
dfw
abbreviation
630
both
're
230
qw
boeing
coming
passengers
arrange
hours
qo
codes
trying
tower
466
canada
each
530
over
uses
arrivals
11
southwest
281
trips
838
days
those
takeoffs
lufthansa
west
1100
arrival
757
minnesota
anywhere
america
430
thrift
let
mornings
nationair
'll
kinds
cheap
close
seats
pennsylvania
name
quebec
indiana
michigan
saturdays
different
taxi
provided
rates
utah
these
starting
sometime
costs
making
bh
eighteenth
following
another
ff
near
747
ea
1992
connect
help
choices
sa
maximum
wish
1115
six
weekdays
more
total
s
dc10
d9s
2100
snack
1245
georgia
72s
73s
f28
heading
departures
amount
825
737
813
ap57
sixteen
m
sorry
serviced
three
miles
departs
1700
requesting
718
land
nevada
100
so
tennessee
tuesdays
hello
destination
reservation
texas
rentals
co
meaning
ap80
1500
270
thursdays
philly
thirteenth
services
sundays
turboprop
stands
415
provide
cars
we
great
mondays
include
sure
't
well
2134
fn
555
ord
934
connection
296
abbreviations
755
highest
hold
720
fit
80
soon
four
ten
noontime
too
offers
options
within
difference
c
restrictions
plan
originating
describe
nw
1110
connections
dulles
21
733
say
approximately
define
852
1291
rate
who
proper
beginning
being
329
352
don
1024
such
wanted
615
mealtime
provides
prefer
1288
257
across
continent
overnight
local
route
746
off
j31
closest
19
lax
l10
be1
1994
red
eye
not
aa
dca
determine
1200
1205
dtw
airfares
capacities
200
town
lga
300
1993
database
1765
eight
up
originate
look
cp
carries
here
201
located
dinnertime
1039
lastest
1222
they
just
d
limo
3724
210
stapleton
343
1145
schedules
932
nonstops
without
landing
b
midway
217
bound
727
takeoff
324
train
along
friends
transcontinental
missouri
reservations
lives
767
269
ac
atl
month
taking
repeat
845
airplanes
buy
still
itinerary
actually
earlier
various
reaching
very
names
505
grounds
ap68
must
kennedy
operation
4400
1201
297
question
combination
basis
laying
1133
650
tonight
43
ls
sam
ap58
once
nighttime
yx
kw
212
1600
tpa
prior
good
1800
819
inform
k
dc9
305
anything
771
459
calling
designate
417
spend
hou
1220
directly
jet
reverse
staying
l1011
belong
445
515
travels
order
mci
150
110
connects
charges
minimum
intercontinental
497766
sounds
811
seat
final
phl
20
start
823
1059
271
382
able
put
locate
hartfield
scheduled
run
225
1158
equipment
begins
lands
reaches
carried
wn
bn
try
included
130
continuing
india
lester
pearson
listings
1209
everywhere
sd
whether
offered
486
1300
950
usa
1045
al
currently
enroute
visit
them
takes
55
thing
705
fridays
catch
straight
advertises
having
planning
listed
1055
405
468
equal
working
sb
hopefully
dh8
symbols
sort
cover
810
operating
320
639
seventeen
1207
608
besides
companies
've
got
somebody
else
wants
level
vicinity
1940
311
mia
instead
priced
eleven
comes
greatest
summer
economic
bay
402
gets
date
1020
730
400
doesn
toward
home
1850
1505
runs
673
723
thanks
bring
zone
yyz
afternoons
non
largest
500
come
428
98
qualify
279
137338
d10
539
fine
while
665
concerning
iah
1230
oak
preferably
twelve
3357
323
nights
229
regarding
seven
inexpensive
420
416
repeating
scenario
139
82
kindly
limousines
345
afterwards
734
place
includes
106
1026
124
fifteen
bna
supper
oh
71
thereafter
2153
year
discount
1130
1030
world
trans
including
represented
o
'hare
exceeding
815
928
163
bur
419
cvg
1017
315
842
1083
0900
longest
called
snacks
645
ever
single

Makefile

@ -35,6 +35,9 @@
# defaults to /usr/local/
# These can be overridden on the command line, e.g. make BUILDTYPE=debug
# TODO: Build static libraries for common dependencies that are shared by multiple
# targets, e.g. eval and CNTK.
ARCH=$(shell uname)
ifndef BUILD_TOP
@ -68,7 +71,7 @@ INCLUDEPATH:= $(addprefix $(SOURCEDIR)/, Common/Include CNTKv2LibraryDll CNTKv2L
# COMMON_FLAGS include settings that are passed both to NVCC and C++ compilers.
COMMON_FLAGS:= -D_POSIX_SOURCE -D_XOPEN_SOURCE=600 -D__USE_XOPEN2K -std=c++11
CPPFLAGS:=
CXXFLAGS:= -msse3 -std=c++0x -fopenmp -fpermissive -fPIC -Werror -fcheck-new
CXXFLAGS:= -msse4.1 -mssse3 -std=c++0x -fopenmp -fpermissive -fPIC -Werror -fcheck-new
LIBPATH:=
LIBS:=
LDFLAGS:=
@ -87,7 +90,7 @@ SRC:=
all : buildall
# Set up basic nvcc options and add CUDA targets from above
CUFLAGS = -m 64
CUFLAGS = -m 64
ifdef CUDA_PATH
ifndef GDK_PATH
@ -107,7 +110,7 @@ ifdef CUDA_PATH
# This is a suggested/default location for NVML
INCLUDEPATH+=$(GDK_PATH)/include/nvidia/gdk
INCLUDEPATH+=$(CUB_PATH)
NVMLPATH=$(GDK_PATH)/src/gdk/nvml/lib
NVMLLIBPATH=$(GDK_PATH)/src/gdk/nvml/lib
# Set up CUDA includes and libraries
INCLUDEPATH += $(CUDA_PATH)/include
@ -167,6 +170,10 @@ ifdef KALDI_PATH
KALDI_LIBS += -lkaldi-util -lkaldi-matrix -lkaldi-base -lkaldi-hmm -lkaldi-cudamatrix -lkaldi-nnet -lkaldi-lat
endif
ifdef SUPPORT_AVX2
CPPFLAGS += -mavx2
endif
# Set up nvcc target architectures (will generate code to support them all, i.e. fat-binary, in release mode)
# In debug mode we will rely on JIT to create code "on the fly" for the underlying architecture
GENCODE_SM30 := -gencode arch=compute_30,code=\"sm_30,compute_30\"
@ -225,6 +232,7 @@ ORIGINDIR:='$$ORIGIN'
CNTKMATH:=cntkmath
RPATH=-Wl,-rpath,
########################################
# Build info
@ -239,7 +247,6 @@ ifneq ("$(BUILDINFO_OUTPUT)","Success")
$(error Could not generate $(BUILDINFO))
endif
########################################
# Math library
########################################
@ -269,6 +276,7 @@ COMMON_SRC =\
$(SOURCEDIR)/Common/fileutil.cpp \
MATH_SRC =\
$(SOURCEDIR)/Math/BlockHandlerSSE.cpp \
$(SOURCEDIR)/Math/CPUMatrix.cpp \
$(SOURCEDIR)/Math/CPUSparseMatrix.cpp \
$(SOURCEDIR)/Math/CPURNGHandle.cpp \
@ -282,6 +290,12 @@ MATH_SRC =\
$(SOURCEDIR)/Math/ConvolutionEngine.cpp \
$(SOURCEDIR)/Math/BatchNormalizationEngine.cpp \
ifdef SUPPORT_AVX2
MATH_SRC +=\
$(SOURCEDIR)/Math/BlockHandlerAVX.cpp \
endif
ifdef CUDA_PATH
MATH_SRC +=\
$(SOURCEDIR)/Math/GPUMatrix.cu \
@ -310,14 +324,13 @@ CNTKMATH_LIB:= $(LIBDIR)/lib$(CNTKMATH).so
ALL += $(CNTKMATH_LIB)
SRC+=$(MATH_SRC)
RPATH=-Wl,-rpath,
$(CNTKMATH_LIB): $(MATH_OBJ)
@echo $(SEPARATOR)
@echo creating $@ for $(ARCH) with build type $(BUILDTYPE)
@mkdir -p $(dir $@)
$(CXX) $(LDFLAGS) -shared $(patsubst %,-L%, $(LIBPATH) $(NVMLPATH)) $(patsubst %,$(RPATH)%, $(ORIGINDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -fopenmp
$(CXX) $(LDFLAGS) -shared $(patsubst %,-L%, $(LIBPATH) $(NVMLLIBPATH)) $(patsubst %,$(RPATH)%, $(ORIGINDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -fopenmp
########################################
# CNTKLibrary
########################################
@ -362,6 +375,8 @@ CNTKLIBRARY_SRC =\
$(SOURCEDIR)/CNTKv2LibraryDll/Utils.cpp \
$(SOURCEDIR)/CNTKv2LibraryDll/Value.cpp \
$(SOURCEDIR)/CNTKv2LibraryDll/Variable.cpp \
$(SOURCEDIR)/CNTKv2LibraryDll/Learner.cpp \
CNTKLIBRARY_SRC+=$(CNTK_COMMON_SRC)
CNTKLIBRARY_SRC+=$(COMPUTATION_NETWORK_LIB_SRC)
@ -376,14 +391,13 @@ CNTKLIBRARY_LIB:=$(LIBDIR)/lib$(CNTKLIBRARY).so
ALL+=$(CNTKLIBRARY_LIB)
SRC+=$(CNTKLIBRARY_SRC)
RPATH=-Wl,-rpath,
$(CNTKLIBRARY_LIB): $(CNTKLIBRARY_OBJ) | $(CNTKMATH_LIB)
@echo $(SEPARATOR)
@mkdir -p $(dir $@)
@echo building output for $(ARCH) with build type $(BUILDTYPE)
$(CXX) $(LDFLAGS) -shared $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(NVMLPATH)) $(patsubst %,$(RPATH)%, $(ORIGINDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKMATH)
$(CXX) $(LDFLAGS) -shared $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(NVMLLIBPATH)) $(patsubst %,$(RPATH)%, $(ORIGINDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKMATH)
########################################
# CNTKLibrary tests
########################################
@ -400,14 +414,70 @@ CNTKLIBRARY_TESTS_OBJ := $(patsubst %.cu, $(OBJDIR)/%.o, $(patsubst %.cpp, $(OBJ
ALL+=$(CNTKLIBRARY_TESTS)
SRC+=$(CNTKLIBRARY_TESTS_SRC)
RPATH=-Wl,-rpath,
$(CNTKLIBRARY_TESTS): $(CNTKLIBRARY_TESTS_OBJ) | $(CNTKLIBRARY_LIB)
@echo $(SEPARATOR)
@mkdir -p $(dir $@)
@echo building output for $(ARCH) with build type $(BUILDTYPE)
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(NVMLPATH)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKLIBRARY) -l$(CNTKMATH)
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(NVMLLIBPATH)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKLIBRARY) -l$(CNTKMATH)
########################################
# LibEval
########################################
EVAL:=eval
SGDLIB_SRC=\
$(SOURCEDIR)/SGDLib/Profiler.cpp \
$(SOURCEDIR)/SGDLib/SGD.cpp
EVAL_SRC=\
$(SOURCEDIR)/EvalDll/CNTKEval.cpp \
$(SOURCEDIR)/CNTK/BrainScript/BrainScriptEvaluator.cpp \
$(SOURCEDIR)/CNTK/BrainScript/BrainScriptParser.cpp \
$(SOURCEDIR)/CNTK/ModelEditLanguage.cpp \
$(SOURCEDIR)/ActionsLib/EvalActions.cpp \
$(SOURCEDIR)/ActionsLib/NetworkFactory.cpp \
$(SOURCEDIR)/ActionsLib/NetworkDescriptionLanguage.cpp \
$(SOURCEDIR)/ActionsLib/SimpleNetworkBuilder.cpp \
$(SOURCEDIR)/ActionsLib/NDLNetworkBuilder.cpp \
EVAL_SRC+=$(SGDLIB_SRC)
EVAL_SRC+=$(COMPUTATION_NETWORK_LIB_SRC)
EVAL_SRC+=$(CNTK_COMMON_SRC)
EVAL_SRC+=$(SEQUENCE_TRAINING_LIB_SRC)
EVAL_OBJ:=$(patsubst %.cu, $(OBJDIR)/%.o, $(patsubst %.cpp, $(OBJDIR)/%.o, $(EVAL_SRC)))
EVAL_LIB:=$(LIBDIR)/lib$(EVAL).so
ALL+=$(EVAL_LIB)
SRC+=$(EVAL_SRC)
$(EVAL_LIB): $(EVAL_OBJ)
@echo $(SEPARATOR)
@mkdir -p $(dir $@)
@echo Building $(EVAL_LIB) for $(ARCH) with build type $(BUILDTYPE)
$(CXX) $(LDFLAGS) -shared $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(NVMLLIBPATH)) $(patsubst %,$(RPATH)%, $(ORIGINDIR) $(LIBPATH)) -o $@ $^ $(LIBS)
########################################
# Eval Sample client
########################################
EVAL_SAMPLE_CLIENT:=$(BINDIR)/cppevalclient
EVAL_SAMPLE_CLIENT_SRC=\
$(SOURCEDIR)/../Examples/Evaluation/CPPEvalClient/CPPEvalClient.cpp
EVAL_SAMPLE_CLIENT_OBJ:=$(patsubst %.cpp, $(OBJDIR)/%.o, $(EVAL_SAMPLE_CLIENT_SRC))
ALL+=$(EVAL_SAMPLE_CLIENT)
SRC+=$(EVAL_SAMPLE_CLIENT_SRC)
$(EVAL_SAMPLE_CLIENT): $(EVAL_SAMPLE_CLIENT_OBJ) | $(EVAL_LIB) $(CNTKMATH_LIB)
@echo $(SEPARATOR)
@mkdir -p $(dir $@)
@echo building $(EVAL_SAMPLE_CLIENT) for $(ARCH) with build type $(BUILDTYPE)
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ -l$(EVAL) -l$(CNTKMATH)
########################################
# BinaryReader plugin
########################################
@ -692,8 +762,6 @@ CNTK_SRC =\
$(SOURCEDIR)/CNTK/CNTK.cpp \
$(SOURCEDIR)/CNTK/ModelEditLanguage.cpp \
$(SOURCEDIR)/CNTK/tests.cpp \
$(SOURCEDIR)/SGDLib/Profiler.cpp \
$(SOURCEDIR)/SGDLib/SGD.cpp \
$(SOURCEDIR)/ActionsLib/TrainActions.cpp \
$(SOURCEDIR)/ActionsLib/EvalActions.cpp \
$(SOURCEDIR)/ActionsLib/OtherActions.cpp \
@ -706,7 +774,7 @@ CNTK_SRC =\
$(SOURCEDIR)/CNTK/BrainScript/BrainScriptParser.cpp \
$(SOURCEDIR)/CNTK/BrainScript/BrainScriptTest.cpp \
CNTK_SRC+=$(SGDLIB_SRC)
CNTK_SRC+=$(CNTK_COMMON_SRC)
CNTK_SRC+=$(COMPUTATION_NETWORK_LIB_SRC)
CNTK_SRC+=$(SEQUENCE_TRAINING_LIB_SRC)
@ -721,7 +789,7 @@ $(CNTK): $(CNTK_OBJ) | $(CNTKMATH_LIB)
@echo $(SEPARATOR)
@mkdir -p $(dir $@)
@echo building output for $(ARCH) with build type $(BUILDTYPE)
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(NVMLPATH)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKMATH) -fopenmp
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(NVMLLIBPATH)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKMATH) -fopenmp
# deployable resources: standard library of BS
CNTK_CORE_BS:=$(BINDIR)/cntk.core.bs
@ -731,6 +799,127 @@ $(CNTK_CORE_BS): $(SOURCEDIR)/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs
@echo bin-placing deployable resource files
cp -f $^ $@
########################################
# Unit Tests
########################################
# use system pre-installed Boost libraries
# Todo: use our own version of boost libraries
BOOSTLIB_PATH = /usr/lib/x86_64-linux-gnu
BOOSTLIBS := boost_unit_test_framework boost_filesystem boost_system
UNITTEST_EVAL_SRC = \
$(SOURCEDIR)/../Tests/UnitTests/EvalTests/EvalExtendedTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/EvalTests/stdafx.cpp
UNITTEST_EVAL_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(UNITTEST_EVAL_SRC))
UNITTEST_EVAL := $(BINDIR)/evaltests
# Temporarily do not build unit tests, as the docker image does not include boost.
#ALL += $(UNITTEST_EVAL)
#SRC += $(UNITTEST_EVAL_SRC)
$(UNITTEST_EVAL) : $(UNITTEST_EVAL_OBJ) | $(EVAL_LIB) $(CNTKMATH_LIB)
@echo $(SEPARATOR)
@mkdir -p $(dir $@)
@echo building $@ for $(ARCH) with build type $(BUILDTYPE)
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(BOOSTLIB_PATH)) $(patsubst %, $(RPATH)%, $(LIBDIR) $(BOOSTLIB_PATH)) -o $@ $^ $(patsubst %, -l%, $(BOOSTLIBS)) -l$(EVAL) -l$(CNTKMATH)
#TODO: create project specific makefile or rules to avoid adding project specific path to the global path
INCLUDEPATH += $(SOURCEDIR)/Readers/CNTKTextFormatReader
UNITTEST_READER_SRC = \
$(SOURCEDIR)/../Tests/UnitTests/ReaderTests/CNTKTextFormatReaderTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/ReaderTests/HTKLMFReaderTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/ReaderTests/ImageReaderTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/ReaderTests/ReaderLibTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/ReaderTests/UCIFastReaderTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/ReaderTests/stdafx.cpp \
$(SOURCEDIR)/Readers/CNTKTextFormatReader/Indexer.cpp \
$(SOURCEDIR)/Readers/CNTKTextFormatReader/TextParser.cpp \
UNITTEST_READER_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(UNITTEST_READER_SRC))
UNITTEST_READER := $(BINDIR)/readertests
# Temporarily do not build unit tests, as the docker image does not include boost.
#ALL += $(UNITTEST_READER)
#SRC += $(UNITTEST_READER_SRC)
$(UNITTEST_READER): $(UNITTEST_READER_OBJ) | $(HTKMLFREADER) $(HTKDESERIALIZERS) $(UCIFASTREADER) $(COMPOSITEDATAREADER) $(IMAGEREADER) $(CNTKMATH_LIB)
@echo $(SEPARATOR)
@mkdir -p $(dir $@)
@echo building $@ for $(ARCH) with build type $(BUILDTYPE)
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(BOOSTLIB_PATH)) $(patsubst %, $(RPATH)%, $(LIBDIR) $(BOOSTLIB_PATH)) -o $@ $^ $(patsubst %, -l%, $(BOOSTLIBS)) -l$(CNTKMATH)
UNITTEST_NETWORK_SRC = \
$(SOURCEDIR)/../Tests/UnitTests/NetworkTests/OperatorEvaluation.cpp \
$(SOURCEDIR)/../Tests/UnitTests/NetworkTests/stdafx.cpp \
$(SOURCEDIR)/CNTK/ModelEditLanguage.cpp \
$(SOURCEDIR)/ActionsLib/TrainActions.cpp \
$(SOURCEDIR)/ActionsLib/EvalActions.cpp \
$(SOURCEDIR)/ActionsLib/OtherActions.cpp \
$(SOURCEDIR)/ActionsLib/SpecialPurposeActions.cpp \
$(SOURCEDIR)/ActionsLib/NetworkFactory.cpp \
$(SOURCEDIR)/ActionsLib/NetworkDescriptionLanguage.cpp \
$(SOURCEDIR)/ActionsLib/SimpleNetworkBuilder.cpp \
$(SOURCEDIR)/ActionsLib/NDLNetworkBuilder.cpp \
$(SOURCEDIR)/CNTK/BrainScript/BrainScriptEvaluator.cpp \
$(SOURCEDIR)/CNTK/BrainScript/BrainScriptParser.cpp \
$(SOURCEDIR)/CNTK/BrainScript/BrainScriptTest.cpp \
UNITTEST_NETWORK_SRC += $(COMPUTATION_NETWORK_LIB_SRC)
UNITTEST_NETWORK_SRC += $(CNTK_COMMON_SRC)
UNITTEST_NETWORK_SRC += $(SEQUENCE_TRAINING_LIB_SRC)
UNITTEST_NETWORK_SRC += $(SGDLIB_SRC)
UNITTEST_NETWORK_OBJ := $(patsubst %.cu, $(OBJDIR)/%.o, $(patsubst %.cpp, $(OBJDIR)/%.o, $(UNITTEST_NETWORK_SRC)))
UNITTEST_NETWORK := $(BINDIR)/networktests
# Temporarily do not build unit tests, as the docker image does not include boost.
#ALL += $(UNITTEST_NETWORK)
#SRC += $(UNITTEST_NETWORK_SRC)
$(UNITTEST_NETWORK): $(UNITTEST_NETWORK_OBJ) | $(CNTKMATH_LIB) $(CNTKTEXTFORMATREADER)
@echo $(SEPARATOR)
@mkdir -p $(dir $@)
@echo building $@ for $(ARCH) with build type $(BUILDTYPE)
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(NVMLLIBPATH) $(BOOSTLIB_PATH)) $(patsubst %, $(RPATH)%, $(LIBDIR) $(LIBPATH) $(BOOSTLIB_PATH)) -o $@ $^ $(patsubst %, -l%, $(BOOSTLIBS)) -l$(CNTKMATH) $(LIBS)
UNITTEST_MATH_SRC = \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/BatchNormalizationEngineTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/BlockMultiplierTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/constants.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/ConvolutionEngineTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/CPUMatrixTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/CPUSparseMatrixTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/fixtures.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/GPUMatrixCudaBlasTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/GPUMatrixTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/GPUSparseMatrixTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/MatrixBlasTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/MatrixDataSynchronizationTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/MatrixFileWriteReadTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/MatrixQuantizerTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/MatrixSparseDenseInteractionsTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/MatrixTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/stdafx.cpp \
UNITTEST_MATH_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(UNITTEST_MATH_SRC))
UNITTEST_MATH := $(BINDIR)/mathtests
# Temporarily do not build unit tests, as the docker image does not include boost.
#ALL += $(UNITTEST_MATH)
#SRC += $(UNITTEST_MATH_SRC)
$(UNITTEST_MATH): $(UNITTEST_MATH_OBJ) | $(CNTKMATH_LIB)
@echo $(SEPARATOR)
@mkdir -p $(dir $@)
@echo building $@ for $(ARCH) with build type $(BUILDTYPE)
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(NVMLLIBPATH) $(BOOSTLIB_PATH)) $(patsubst %, $(RPATH)%, $(LIBDIR) $(LIBPATH) $(BOOSTLIB_PATH)) -o $@ $^ $(patsubst %, -l%, $(BOOSTLIBS)) $(LIBS) -l$(CNTKMATH)
unittests: $(UNITTEST_EVAL) $(UNITTEST_READER) $(UNITTEST_NETWORK) $(UNITTEST_MATH)
########################################
# General compile and dependency rules
########################################
@ -755,13 +944,13 @@ $(OBJDIR)/%.o : %.cu $(BUILD_CONFIGURATION)
@mkdir -p $(dir $@)
$(NVCC) -c $< -o $@ $(COMMON_FLAGS) $(CUFLAGS) $(INCLUDEPATH:%=-I%) -Xcompiler "-fPIC -Werror"
$(OBJDIR)/%.o : %.cpp $(BUILD_CONFIGURATION)
$(OBJDIR)/%.o : %.cpp $(BUILD_CONFIGURATION)
@echo $(SEPARATOR)
@echo creating $@ for $(ARCH) with build type $(BUILDTYPE)
@mkdir -p $(dir $@)
$(CXX) -c $< -o $@ $(COMMON_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(INCLUDEPATH:%=-I%) -MD -MP -MF ${@:.o=.d}
.PHONY: clean buildall all
.PHONY: clean buildall all unittests
clean:
@echo $(SEPARATOR)


@ -1,18 +1,17 @@
# CNTK
## Latest news
*2016-06-16.* V 1.5 Binary release. NuGet Package with CNTK Model Evaluation Libraries.
NuGet Package is added to CNTK v.1.5 binaries. See [CNTK Releases page](https://github.com/Microsoft/CNTK/releases) and [NuGet Package description](https://github.com/Microsoft/CNTK/wiki/Nuget-Package-for-Evaluation).
*2016-07-15.* V 1.6 Binary release
CNTK v.1.6 binaries are on the [CNTK Releases page](https://github.com/Microsoft/CNTK/releases)
*2016-06-15.* CNTK now supports building against a custom Intel® Math Kernel Library (MKL).
See [setup instructions](https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-your-machine) on how to set this up for your platform.
*2016-07-12.* We have further expanded Licensing options for CNTK 1bit-SGD and related components. See the details at the [Wiki page](https://github.com/microsoft/cntk/wiki/CNTK-1bit-SGD-License). These new options are an extension of the new CNTK 1bit-SGD License that we have announced on Jun 23, 2016.
*2016-06-10.* See CNTK v.1.5 binary release announcement in the official [Microsoft Research Blog](https://blogs.msdn.microsoft.com/msr_er/2016/06/10/microsoft-improves-programming-flexibility-of-its-ai-toolkit/)
*2016-07-05.* CNTK now supports *Deconvolution* and *Unpooling*. See the usage example in the Network number 4 in [MNIST Sample](https://github.com/Microsoft/CNTK/blob/master/Examples/Image/MNIST/README.md).
*2016-06-08.* V 1.5 Binary release
CNTK v.1.5 binaries are on the [CNTK Releases page](https://github.com/Microsoft/CNTK/releases)
*2016-06-23.* New License Terms for CNTK 1bit-SGD and related components.
Effective immediately the License Terms for CNTK 1bit-SGD and related components have changed. The new Terms provide more flexibility and enable new usage scenarios, especially in commercial environments. Read the new Terms at the [standard location](https://cntk1bitsgd.codeplex.com/license). Please note, that while the new Terms are significantly more flexible comparing to the previous ones, they are still **more restrictive** than the main CNTK License. Consequently everything described in [Enabling 1bit-SGD](https://github.com/Microsoft/CNTK/wiki/Enabling-1bit-SGD) section of the Wiki remains valid.
*2016-06-01.* An updated version of the network-description language has been made available under the new [BrainScript Network Builder](https://github.com/Microsoft/CNTK/wiki/BrainScript-Network-Builder), which features full expression parsing, recursive functions, and more.
*2016-06-20.* A [post](http://itpeernetwork.intel.com/accelerating-the-computational-network-tool-kit-with-intel-mkl/) on Intel MKL and CNTK is published in the [Intel IT Peer Network](http://itpeernetwork.intel.com/accelerating-the-computational-network-tool-kit-with-intel-mkl/)
See [all news](https://github.com/Microsoft/CNTK/wiki/News).
@ -45,3 +44,8 @@ Amit Agarwal, Eldar Akchurin, Chris Basoglu, Guoguo Chen, Scott Cyphers, Jasha D
## Disclaimer
CNTK is in active use at Microsoft and constantly evolving. There will be bugs.
## Microsoft Open Source Code of Conduct
This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.

Scripts/README.md Normal file

@ -0,0 +1,24 @@
This directory contains scripts that help with using different components of CNTK.
### CNTK Text format Converters
Two Python scripts for converting data to CNTK Text format for use as input to the CNTK Text Format Reader (see https://github.com/microsoft/cnTK/wiki/CNTKTextFormat-Reader).
```
txt2ctf.py
```
Converts a set of dictionary files and a plain text file to CNTK Text format. Run ```python txt2ctf.py -h``` to see usage instructions. See the comments at the beginning of the script file for a specific usage example.
```
uci2ctf.py
```
Converts data stored in a text file in UCI format to CNTK Text format. Run ```python uci2ctf.py -h``` to see usage instructions, or see the usage example below:
```
python Scripts/uci2ctf.py --input_file Examples/Image/MNIST/Data/Train-28x28.txt --features_start 1 --features_dim 784 --labels_start 0 --labels_dim 1 --num_labels 10 --output_file Examples/Image/MNIST/Data/Train-28x28_cntk_text.txt
```
```input_file``` – original dataset in the (columnar) UCI format
```features_start``` – index of the first feature column (start parameter in the UCIFastReader config, see https://github.com/Microsoft/CNTK/wiki/UCI-Fast-Reader)
```features_dim``` – number of feature columns (dim parameter in the UCIFastReader config)
```labels_start``` - index of the first label column
```labels_dim``` – number of label columns
```num_labels``` – number of possible label values (labelDim parameter in the UCIFastReader config)
```output_file``` – path and filename of the resulting dataset.
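For reference, a minimal sketch of the transformation these parameters describe is shown below. It is an illustration only, not the actual script: the helper name `uci_to_ctf_line` is made up, the defaults mirror the MNIST command above, and the output layout (a dense one-hot `|labels` vector followed by dense `|features`) follows the MNIST CNTK text file; uci2ctf.py itself supports more options.
```python
# Illustration only (hypothetical helper, not uci2ctf.py itself): convert one UCI row
# with the label in column 0 and 784 feature columns starting at column 1 into a
# CNTK Text format line with a one-hot label vector and dense features.
def uci_to_ctf_line(row, labels_start=0, features_start=1, features_dim=784, num_labels=10):
    cols = row.split()
    label = int(cols[labels_start])
    one_hot = ["1" if i == label else "0" for i in range(num_labels)]
    features = cols[features_start:features_start + features_dim]
    return "|labels " + " ".join(one_hot) + " |features " + " ".join(features)

with open("Train-28x28.txt") as src, open("Train-28x28_cntk_text.txt", "w") as dst:
    for row in src:
        dst.write(uci_to_ctf_line(row) + "\n")
```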

@ -1 +1 @@
Subproject commit 18fcb1a9378432ae179948b0f1e281115a2c7d86
Subproject commit f7afb8c6a08a6652d84de1b62377175788be5284


@ -191,6 +191,7 @@ bool CheckFunction(std::string& p_nodeType, bool* allowUndeterminedVariable)
else if (EqualInsensitive(nodeType, OperationNameOf(KhatriRaoProductNode), L"ColumnwiseCrossProduct")) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(LearnableParameter), L"Parameter")) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(LogNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(LogPlusNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(LogSoftmaxNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(LogisticNode), L"Logistic")) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(LookupTableNode))) ret = true;


@ -53,7 +53,6 @@ public:
__declspec_noreturn static inline void EvaluationError(const wstring &msg, TextLocation where)
{
//Microsoft::MSR::CNTK::DebugUtil::PrintCallStack();
throw EvaluationException(msg, where);
}


@ -89,9 +89,18 @@ struct Issue
// Because it is often hard to recognize an issue only from the point where it occurred, we also report the history in compact visual form.
// Since often multiple contexts are on the same source line, we only print each source line once in a consecutive row of contexts.
/*static*/ void TextLocation::PrintIssue(const vector<TextLocation>& locations, const wchar_t* errorKind, const wchar_t* kind, const wchar_t* what)
{
wstring error = CreateIssueMessage(locations, errorKind, kind, what);
fprintf(stderr, "%ls", error.c_str());
fflush(stderr);
}
/*static*/ wstring TextLocation::CreateIssueMessage(const vector<TextLocation>& locations, const wchar_t* errorKind, const wchar_t* kind, const wchar_t* what)
{
vector<Issue> issues; // tracing the error backwards
size_t symbolIndex = 0;
wstring message;
for (size_t n = 0; n < locations.size(); n++)
{
let& location = locations[n];
@ -125,20 +134,23 @@ struct Issue
if (!locations.empty()) // (be resilient to some throwers not having a TextLocation; to be avoided)
{
let& firstLoc = issues.front().location;
fprintf(stderr, "[CALL STACK]\n");
message += wstrprintf(L"[CALL STACK]\n");
for (auto i = issues.rbegin(); i != issues.rend(); i++)
{
let& issue = *i;
auto& where = issue.location;
const auto& lines = where.GetSourceFile().lines;
const auto line = (where.lineNo == lines.size()) ? L"(end)" : lines[where.lineNo].c_str();
fprintf(stderr, " %ls\n %ls\n", line, issue.markup.c_str());
message += wstrprintf(L" %ls\n %ls\n", line, issue.markup.c_str());
}
fprintf(stderr, "%ls while %ls: %ls(%d)", errorKind, kind, firstLoc.GetSourceFile().path.c_str(), (int)firstLoc.lineNo + 1 /*report 1-based*/);
message += wstrprintf(L"%ls while %ls: %ls(%d)", errorKind, kind, firstLoc.GetSourceFile().path.c_str(), (int)firstLoc.lineNo + 1 /*report 1-based*/);
}
else
fprintf(stderr, "%ls while %ls", errorKind, kind);
fprintf(stderr, ": %ls\n", what), fflush(stderr);
{
message += wstrprintf(L"%ls while %ls", errorKind, kind);
}
message += wstrprintf(L": %ls\n", what);
return message;
}
/*static*/ vector<SourceFile> TextLocation::sourceFileMap;


@ -37,6 +37,7 @@ struct TextLocation // position in the text. Lightweight value struct that we ca
// helpers for pretty-printing errors: Show source-code line with ...^ under it to mark up the point of error
static void PrintIssue(const vector<TextLocation>& locations, const wchar_t* errorKind, const wchar_t* kind, const wchar_t* what);
static std::wstring CreateIssueMessage(const vector<TextLocation>& locations, const wchar_t* errorKind, const wchar_t* kind, const wchar_t* what);
static void Trace(TextLocation, const wchar_t* traceKind, const wchar_t* op, const wchar_t* exprPath);
// construction
@ -77,8 +78,12 @@ public:
} // where the error happened
virtual const wchar_t* kind() const = 0; // e.g. "warning" or "error"
wstring GetError(const std::wstring& linePrefix) const override
{
return TextLocation::CreateIssueMessage(locations, linePrefix.c_str(), kind(), msra::strfun::utf16(what()).c_str());
}
// pretty-print this as an error message
void /*ScriptingException::*/ PrintError(const std::wstring& linePrefix) const
void /*ScriptingException::*/ PrintError(const std::wstring& linePrefix) const override
{
TextLocation::PrintIssue(locations, linePrefix.c_str(), kind(), msra::strfun::utf16(what()).c_str());
}


@ -26,15 +26,36 @@ IntDiv(x, y) = new NumericFunction [ what = 'IntDiv' ; args = (x:y) ]
##############################################################################
# comparison functions
# aliases
##############################################################################
Less = CNTK2.Less
Equal = CNTK2.Equal
Greater = CNTK2.Greater
GreaterEqual = CNTK2.GreaterEqual
NotEqual = CNTK2.NotEqual
LessEqual = CNTK2.LessEqual
Less = CNTK2.Less
Equal = CNTK2.Equal
Greater = CNTK2.Greater
GreaterEqual = CNTK2.GreaterEqual
NotEqual = CNTK2.NotEqual
LessEqual = CNTK2.LessEqual
Splice = CNTK2.Splice
TransposeDimensions = CNTK2.TransposeDimensions
Times = CNTK2.Times
Abs = CNTK2.Abs
Ceil = CNTK2.Ceil
CrossEntropyWithSoftmax = CNTK2.CrossEntropyWithSoftmax
Dropout = CNTK2.Dropout
ElementTimes = CNTK2.ElementTimes
ElementDivide = CNTK2.ElementDivide
ErrorPrediction = CNTK2.ErrorPrediction
Exp = CNTK2.Exp
Floor = CNTK2.Floor
Log = CNTK2.Log
Minus = CNTK2.Minus
Pass = CNTK2.Identity
Plus = CNTK2.Plus
RectifiedLinear = CNTK2.Relu
ReduceSum = CNTK2.ReduceSum
ReduceLogSum = CNTK2.ReduceLogSum
Round = CNTK2.Round
Sigmoid = CNTK2.Sigmoid
##############################################################################
# ComputationNodes
@ -87,14 +108,14 @@ CNTK2 = [
else new ComputationNode [ operation = 'Slice' ; inputs = _ /*plus the function args*/ ] # non-time axis
Splice (_, axis=1, tag='') = # TODO: This is a workaround. RowStack itself shall interpret 'axis' and be renamed to Splice().
if axis < 1 then Fail('Splice does not yet implement splicing the time axis.')
else if axis == 1 then [tag1=tag; out = RowStack (_, tag=tag1)].out
else [ # workaround: swap 'axis' to first position, RowStack, swap back
ArrayTransposeDimensions (_, axis1, axis2) = [ # transpose each element of a BS array
inputsT[i:0..Length(_)-1] = TransposeDimensions (_[i], axis1, axis2)
].inputsT
out = [tag1=tag; out=TransposeDimensions (RowStack (ArrayTransposeDimensions (_, 1, axis)), 1, axis, tag=tag)].out
].out
if axis < 1 then Fail('Splice does not yet implement splicing the time axis.')
else if axis == 1 then [tag1=tag; out = RowStack (_, tag=tag1)].out
else [ # workaround: swap 'axis' to first position, RowStack, swap back
ArrayTransposeDimensions (_, axis1, axis2) = [ # transpose each element of a BS array
inputsT[i:0..Length(_)-1] = TransposeDimensions (_[i], axis1, axis2)
].inputsT
out = [tag1=tag; out=TransposeDimensions (RowStack (ArrayTransposeDimensions (_, 1, axis)), 1, axis, tag=tag)].out
].out
// Swap two axes of a tensor
TransposeDimensions(_, axis1, axis2, tag='') = new ComputationNode [ operation = 'TransposeDimensions' ; inputs = _ /*plus the function args*/ ]
@ -120,9 +141,11 @@ CNTK2 = [
Square(_, tag='') = ElementTimes(_, _, tag=tag)
Tanh(_, tag='') = new ComputationNode [ operation = 'Tanh' ; inputs = _ /*plus the function args*/ ]
// 6. Reductions
// None so far
// 6. Reductions
# the following is a temporary workaround until we have the C++ version
ReduceLogSum (_, axis=0, tag='') = if axis != 0 then Fail("ReduceLogSum for now only supports axis=0.")
else [ tag1=tag ; axis1=axis ; out = RowSlice (0, 1, _ - LogSoftmax (_), tag=tag1) ].out
ReduceSum (_, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = _ ; reductionOp = "Sum" /*plus the function args*/ ]
// 7. Control flow (if, composite etc.)
// None so far
@ -147,7 +170,7 @@ CNTK2 = [
CrossEntropyWithSoftmax(_, outProbVectorSequence, tag='') = new ComputationNode [ operation = 'CrossEntropyWithSoftmax' ; inputs = (_ : outProbVectorSequence) /*plus the function args*/ ]
ErrorPrediction(_, outVectorSequence, topN=1, tag='') = new ComputationNode [ operation = 'ErrorPrediction' ; inputs = if topN == 1 then (_ : outVectorSequence) else (_ : outVectorSequence : Constant (topN)) /*plus the function args*/ ]
// 13. Comparison nodes
// 12. Comparison nodes
Less(_, y, tag='') = new ComputationNode [ operation = 'Less' ; inputs = (_ : y) /*plus the function args*/ ]
Equal(_, y, tag='') = new ComputationNode [ operation = 'Equal' ; inputs = (_ : y) /*plus the function args*/ ]
Greater(_, y, tag='') = new ComputationNode [ operation = 'Greater' ; inputs = (_ : y) /*plus the function args*/ ]
@ -155,8 +178,7 @@ CNTK2 = [
NotEqual(_, y, tag='') = new ComputationNode [ operation = 'NotEqual' ; inputs = (_ : y) /*plus the function args*/ ]
LessEqual(_, y, tag='') = new ComputationNode [ operation = 'LessEqual' ; inputs = (_ : y) /*plus the function args*/ ]
// 13. Others
// 12. Others
// 13. Others
Identity(_, tag='') = new ComputationNode [ operation = 'Pass' ; inputs = _ /*plus the function args*/ ]
]
@ -181,19 +203,6 @@ Shift(input, fromOffset, boundaryValue, boundaryMode=-1/*context*/, dim=-1, tag=
RowSlice(beginIndex, numRows, input, tag='') = Slice(beginIndex, beginIndex + numRows, input, axis = 1)
RowRepeat(input, numRepeats, tag='') = new ComputationNode [ operation = 'RowRepeat' ; inputs = input /*plus the function args*/ ]
RowStack(inputs, tag='') = new ComputationNode [ operation = 'RowStack' /*plus the function args*/ ]
Splice (inputs, axis=1, tag='') = # TODO: This is a workaround. RowStack itself shall interpret 'axis' and be renamed to Splice().
if axis < 1 then Fail('Splice does not yet implement splicing the time axis.')
else if axis == 1 then [tag1=tag; out = RowStack (inputs, tag=tag1)].out
else [ # workaround: swap 'axis' to first position, RowStack, swap back
ArrayTransposeDimensions (inputs, axis1, axis2) = [ # transpose each element of a BS array
inputsT[i:0..Length(inputs)-1] = TransposeDimensions (inputs[i], axis1, axis2)
].inputsT
out = [tag1=tag; out=TransposeDimensions (RowStack (ArrayTransposeDimensions (inputs, 1, axis)), 1, axis, tag=tag)].out
].out
Reshape(input, numRows, imageWidth = 0, imageHeight = 0, imageChannels = 0, tag='') = new ComputationNode [ operation = 'LegacyReshape' ; inputs = input /*plus the function args*/ ]
NewReshape(input, dims, beginAxis=0, endAxis=0, tag='') = new ComputationNode [ operation = 'Reshape' ; inputs = input ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ]
ReshapeDimension(x, axis, tensorShape) = NewReshape(x, tensorShape, beginAxis=axis, endAxis=axis + 1)
FlattenDimensions(x, axis, num) = NewReshape(x, 0, beginAxis=axis, endAxis=axis + num)
Slice(beginIndex, endIndex, input, axis=1, tag='') =
if axis < 0 then [ # time axis: specify -1
beginFlags = if beginIndex > 0 then BS.Boolean.Not (BS.Loop.IsFirstN (beginIndex, input)) else BS.Loop.IsLastN (-beginIndex, input)
@ -206,11 +215,13 @@ Slice(beginIndex, endIndex, input, axis=1, tag='') =
else BS.Sequences.Gather (flags, input)
].out
else new ComputationNode [ operation = 'Slice' ; inputs = input /*plus the function args*/ ] # non-time axis
Reshape(input, numRows, imageWidth = 0, imageHeight = 0, imageChannels = 0, tag='') = new ComputationNode [ operation = 'LegacyReshape' ; inputs = input /*plus the function args*/ ]
NewReshape(input, dims, beginAxis=0, endAxis=0, tag='') = new ComputationNode [ operation = 'Reshape' ; inputs = input ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ]
ReshapeDimension(x, axis, tensorShape) = NewReshape(x, tensorShape, beginAxis=axis, endAxis=axis + 1)
FlattenDimensions(x, axis, num) = NewReshape(x, 0, beginAxis=axis, endAxis=axis + num)
SplitDimension(x, axis, N) = ReshapeDimension(x, axis, 0:N)
TransposeDimensions(input, axis1, axis2, tag='') = new ComputationNode [ operation = 'TransposeDimensions' ; inputs = input /*plus the function args*/ ]
# TODO: make input the last arg!
Transpose(x) = TransposeDimensions(x, 1, 2)
Times(A, B, outputRank=1, tag='') = new ComputationNode [ operation = 'Times' ; inputs = ( A : B ) /*plus the function args*/ ]
Logistic(label, probability, tag='') = new ComputationNode [ operation = 'Logistic' ; inputs = (label : probability) /*plus the function args*/ ]
WeightedLogistic(label, probability, instanceWeight, tag='') = new ComputationNode [ operation = 'Logistic' ; inputs = (label : probability : instanceWeight) /*plus the function args*/ ]
ReconcileDynamicAxis(dataInput, layoutInput, tag='') = new ComputationNode [ operation = 'ReconcileDynamicAxis' ; inputs = (dataInput : layoutInput) /*plus the function args*/ ]
@ -228,8 +239,6 @@ ClassificationError = ErrorPrediction
Delay = PastValue
BatchNormalization(input, scale, bias, runMean, runInvStdDev, spatial, normalizationTimeConstant = 0, blendTimeConstant = 0, epsilon = 0.00001, useCntkEngine = true, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'BatchNormalization' ; inputs = (input : scale : bias : runMean : runInvStdDev) /*plus the function args*/ ]
Abs(x, tag='') = new ComputationNode [ operation = 'Abs' ; inputs = x /*plus the function args*/ ]
Ceil(x, tag='') = Negate(Floor(Negate(x)), tag=tag)
ClassBasedCrossEntropyWithSoftmax(labelClassDescriptorVectorSequence, mainInputInfo, mainWeight, classLogProbsBeforeSoftmax, tag='') = new ComputationNode [ operation = 'ClassBasedCrossEntropyWithSoftmax' ; inputs = (labelClassDescriptorVectorSequence : mainInputInfo : mainWeight : classLogProbsBeforeSoftmax) /*plus the function args*/ ]
Clip(minValue, maxValue, x, tag='') = new ComputationNode [ operation = 'Clip' ; inputs = (minValue : maxValue : x) /* plus the function args*/ ]
ColumnElementTimes(aVectorSequence, anotherVectorSequence, tag='') = new ComputationNode [ operation = 'ColumnElementTimes' ; inputs = (aVectorSequence : anotherVectorSequence) /*plus the function args*/ ]
@ -238,50 +247,33 @@ CosDistance(aVectorSequence, anotherVectorSequence, tag='') = new ComputationNod
CosDistanceWithNegativeSamples(aVectorSequence, anotherVectorSequence, numShifts, numNegSamples, tag='') = new ComputationNode [ operation = 'CosDistanceWithNegativeSamples' ; inputs = (aVectorSequence : anotherVectorSequence : numShifts : numNegSamples) /*plus the function args*/ ]
Cosine(x, tag='') = new ComputationNode [ operation = 'Cosine' ; inputs = x /*plus the function args*/ ]
CrossEntropy(refProbVectorSequence, outProbVectorSequence, tag='') = new ComputationNode [ operation = 'CrossEntropy' ; inputs = (refProbVectorSequence : outProbVectorSequence) /*plus the function args*/ ]
CrossEntropyWithSoftmax(labelVectorSequence, outProbVectorSequence, tag='') = new ComputationNode [ operation = 'CrossEntropyWithSoftmax' ; inputs = (labelVectorSequence : outProbVectorSequence) /*plus the function args*/ ]
# once ReduceLogSum becomes proper C++, CrossEntropyWithSoftmax() will become this:
NewCrossEntropyWithSoftmax (labelSequence, z, tag='') = [ tag1 = tag; out = Minus (ReduceLogSum (z), ReduceSum (labelSequence .* z), tag=tag1) ].out
DiagTimes(diagonalMatrixAsColumnVector, matrix, tag='') = new ComputationNode [ operation = 'DiagTimes' ; inputs = (diagonalMatrixAsColumnVector : matrix) /*plus the function args*/ ]
// TODO: DiagTimes = ElementTimes
Dropout(activationVectorSequence, tag='') = new ComputationNode [ operation = 'Dropout' ; inputs = activationVectorSequence /*plus the function args*/ ]
ElementTimes(aMatrix, anotherMatrix, tag='') = new ComputationNode [ operation = 'ElementTimes' ; inputs = (aMatrix : anotherMatrix) /*plus the function args*/ ]
ElementDivide(aMatrix, anotherMatrix, tag='') = ElementTimes(aMatrix, Reciprocal(anotherMatrix), tag=tag)
ErrorPrediction = CNTK2.ErrorPrediction
Exp(x, tag='') = new ComputationNode [ operation = 'Exp' ; inputs = x /*plus the function args*/ ]
Floor(x, tag='') = new ComputationNode [ operation = 'Floor' ; inputs = x /*plus the function args*/ ]
GatherPacked(indexSequence, sourceData, tag='') = new ComputationNode [ operation = 'GatherPacked' ; inputs = (indexSequence : sourceData) /*plus the function args*/ ]
GMMLogLikelihood(unnormalizedPriorVector, meansAsRows, logStdDevAsRows, dataVectorSequence, tag='') = new ComputationNode [ operation = 'GMMLogLikelihood' ; inputs = (unnormalizedPriorVector : meansAsRows : logStdDevAsRows : dataVectorSequence) /*plus the function args*/ ]
InvStdDev(dataVectorSequence, tag='') = new ComputationNode [ operation = 'InvStdDev' ; inputs = dataVectorSequence /*plus the function args*/ ]
KhatriRaoProduct(leftMatrix, rightMatrix, tag='') = new ComputationNode [ operation = 'KhatriRaoProduct' ; inputs = (leftMatrix : rightMatrix) /*plus the function args*/ ]
Log(x, tag='') = new ComputationNode [ operation = 'Log' ; inputs = x /*plus the function args*/ ]
LogPlus(leftMatrix, rightMatrix, tag='') = new ComputationNode [ operation = 'LogPlus' ; inputs = (leftMatrix : rightMatrix) /*plus the function args*/ ]
LogSoftmax(z, tag='') = new ComputationNode [ operation = 'LogSoftmax' ; inputs = z /*plus the function args*/ ]
# TODO: ^^ along axis, like Softmax
MatrixL1Reg(matrix, tag='') = new ComputationNode [ operation = 'MatrixL1Reg' ; inputs = matrix /*plus the function args*/ ]
MatrixL2Reg(matrix, tag='') = new ComputationNode [ operation = 'MatrixL2Reg' ; inputs = matrix /*plus the function args*/ ]
Mean(dataVectorSequence, tag='') = new ComputationNode [ operation = 'Mean' ; inputs = dataVectorSequence /*plus the function args*/ ]
Minus(leftMatrix, rightMatrix, tag='') = new ComputationNode [ operation = 'Minus' ; inputs = (leftMatrix : rightMatrix) /*plus the function args*/ ]
Negate(input, tag='') = new ComputationNode [ operation = 'Negate' ; inputs = input /*plus the function args*/ ]
PackedIndex(targetObject, indexSequence, tag='') = new ComputationNode [ operation = 'PackedIndex' ; inputs = (targetObject : indexSequence) /*plus the function args*/ ]
Pass(x, tag='') = new ComputationNode [ operation = 'Pass' ; inputs = x /*plus the function args*/ ]
PerDimMeanVarDeNormalization(dataVectorSequence, meanVector, invStdDevVector, tag='') = new ComputationNode [ operation = 'PerDimMeanVarDeNormalization' ; inputs = (dataVectorSequence : meanVector : invStdDevVector) /*plus the function args*/ ]
PerDimMeanVarNormalization(dataVectorSequence, meanVector, invStdDevVector, tag='') = new ComputationNode [ operation = 'PerDimMeanVarNormalization' ; inputs = (dataVectorSequence : meanVector : invStdDevVector) /*plus the function args*/ ]
Plus(leftMatrix, rightMatrix, tag='') = new ComputationNode [ operation = 'Plus' ; inputs = (leftMatrix : rightMatrix) /*plus the function args*/ ]
Reciprocal(z, tag='') = new ComputationNode [ operation = 'Reciprocal' ; inputs = z /*plus the function args*/ ]
RectifiedLinear(z, tag='') = new ComputationNode [ operation = 'RectifiedLinear' ; inputs = z /*plus the function args*/ ]
ReduceSum (z, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = z ; reductionOp = "Sum" /*plus the function args*/ ]
# the following is a temporary workaround until we have the C++ version
ReduceLogSum (z, axis=0, tag='') = if axis != 0 then Fail("ReduceLogSum for now only supports axis=0.")
else [ tag1=tag ; axis1=axis ; out = RowSlice (0, 1, z - LogSoftmax (z), tag=tag1) ].out
//# the following is a temporary workaround until we have the C++ version
#ReduceLogSum (z, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = z ; reductionOp = "LogSum" /*plus the function args*/ ]
#ReduceMean (z, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = z ; reductionOp = "Mean" /*plus the function args*/ ]
#ReduceMax (z, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = z ; reductionOp = "Max" /*plus the function args*/ ]
#ReduceMin (z, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = z ; reductionOp = "Min" /*plus the function args*/ ]
Round(x, tag='') = Floor(Plus(x, ConstantTensor(0.5, (1))), tag=tag)
Scale(scalarScalingFactor, matrix, tag='') = new ComputationNode [ operation = 'Scale' ; inputs = (scalarScalingFactor : matrix) /*plus the function args*/ ]
# TODO: Scale = ElementTimes
ScatterPacked(cond, indexSequence, sourceData, tag='') = new ComputationNode [ operation = 'ScatterPacked' ; inputs = (cond : indexSequence : sourceData) /*plus the function args*/ ]
Sigmoid(z, tag='') = new ComputationNode [ operation = 'Sigmoid' ; inputs = z /*plus the function args*/ ]
Sin(z, tag='') = new ComputationNode [ operation = 'Sin' ; inputs = z /*plus the function args*/ ]
Softmax (z, axis=0, tag='') = # TODO: replace this with more efficient version below once we have ReduceLogSum
if axis == 0 then new ComputationNode [ operation = 'Softmax' ; inputs = z /*plus the function args*/ ]


@ -117,6 +117,23 @@ size_t GetMaxEpochs(const ConfigParameters& configParams)
return maxEpochs;
}
#ifndef CPUONLY
// abort execution if the GPU is not supported (e.g. compute capability not supported)
void CheckSupportForGpu(DEVICEID_TYPE deviceId)
{
auto gpuData = GetGpuData(deviceId);
if (gpuData.validity == GpuValidity::ComputeCapabilityNotSupported)
{
InvalidArgument("CNTK: The GPU (%s) has compute capability %d.%d. CNTK is only supported on GPUs with compute capability 3.0 or greater",
gpuData.name.c_str(), gpuData.versionMajor, gpuData.versionMinor);
}
else if (gpuData.validity == GpuValidity::UnknownDevice)
{
InvalidArgument("CNTK: Unknown GPU with Device ID %d.", gpuData.deviceId);
}
}
#endif
// special temporary function to guard against a now invalid usage of "truncated" which exists in some IPG production setups
static void DisableLegacyTruncationSettings(const ConfigParameters& TopLevelConfig, const ConfigParameters& commandConfig)
{
@ -370,6 +387,30 @@ void PrintUsageInfo()
LOGPRINTF(stderr, "-------------------------------------------------------------------\n");
}
// print gpu info for current gpu devices (e.g. Device[0]: cores = 2496; computeCapability = 5.2; type = "Quadro M4000"; memory = 8192 MB)
void PrintGpuInfo()
{
#ifndef CPUONLY
std::vector<GpuData> gpusData = GetAllGpusData();
if (gpusData.empty())
{
LOGPRINTF(stderr, "No GPUs found\n");
return;
}
LOGPRINTF(stderr, "-------------------------------------------------------------------\n");
LOGPRINTF(stderr, "GPU info:\n\n");
for (GpuData& data : gpusData)
{
LOGPRINTF(stderr, "\t\tDevice[%d]: cores = %d; computeCapability = %d.%d; type = \"%s\"; memory = %lu MB\n",
data.deviceId, data.cudaCores, data.versionMajor, data.versionMinor, data.name.c_str(), data.totalMemory);
}
LOGPRINTF(stderr, "-------------------------------------------------------------------\n");
#endif
}
// ---------------------------------------------------------------------------
// main() for use with BrainScript
// ---------------------------------------------------------------------------
@ -461,6 +502,21 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
let valp = BS::Evaluate(expr); // evaluate parse into a dictionary
let& config = valp.AsRef<ScriptableObjects::IConfigRecord>(); // this is the dictionary
#ifndef CPUONLY
auto valpp = config.Find(L"deviceId");
if (valpp)
{
auto valp = *valpp;
if (!valp.Is<ScriptableObjects::String>()) // if it's not string 'auto' or 'cpu', then it's a gpu
{
if (static_cast<int>(valp) >= 0) // gpu (id >= 0)
{
CheckSupportForGpu(valp); // throws if gpu is not supported
}
}
}
#endif
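// (For example: deviceId may be the string "auto" or "cpu", or a non-negative GPU id such as 0;
//  only the numeric form reaches CheckSupportForGpu above.)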
// legacy parameters that have changed spelling
if (config.Find(L"DoneFile")) // variables follow camel case (start with lower-case letters)
InvalidArgument("Legacy spelling of 'DoneFile' no longer allowed. Use 'doneFile'.");
@ -499,6 +555,9 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
// echo config info to log
PrintBuiltInfo();
// echo gpu info to log
PrintGpuInfo();
// execute the actions
// std::string type = config(L"precision", "float");
int numCPUThreads = config(L"numCPUThreads", 0);
@ -556,6 +615,18 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[])
{
ConfigParameters config;
std::string rawConfigString = ConfigParameters::ParseCommandLine(argc, argv, config); // get the command param set they want
#ifndef CPUONLY
ConfigValue val = config("deviceId", "auto");
if (!EqualCI(val, "cpu") && !EqualCI(val, "auto"))
{
if (static_cast<int>(val) >= 0) // gpu (id >= 0)
{
CheckSupportForGpu(static_cast<int>(val)); // throws if gpu is not supported
}
}
#endif
bool timestamping = config(L"timestamping", false);
if (timestamping)
{
@ -599,6 +670,8 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[])
}
PrintBuiltInfo(); // this one goes to log file
PrintGpuInfo();
std::string timestamp = TimeDateStamp();
// dump config info

View file

@ -144,6 +144,7 @@
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\Common\CrossProcessMutex.h" />
<ClInclude Include="..\Common\Include\basetypes.h" />
<ClInclude Include="..\Common\Include\Basics.h" />
<ClInclude Include="..\Common\Include\BestGpu.h" />
<ClInclude Include="..\Common\Include\DataReader.h" />
@ -222,4 +223,4 @@
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets" />
</Project>
</Project>

Diff not shown because of its large size. Load diff

View file

@ -47,12 +47,18 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template <typename ElementType>
class ComputationNode;
class File;
}}}
// TODO: The following should be reconciled with the equivalent code in the CNTK implementation
#ifndef _MSC_VER
#define _countof(_Array) (sizeof(_Array) / sizeof(_Array[0]))
static inline wchar_t* _wcsdup(const wchar_t *s)
{
return ::wcsdup(s);
}
#endif
namespace CNTK
@ -131,386 +137,30 @@ namespace CNTK
// Forward declarations
class CompositeFunction;
class Function;
class Variable;
namespace _Internal
// Similar to make_shared except that it associates a custom deleter with the shared_ptr to ensure
// that objects are deleted on the same side of the library DLL where they are allocated
template <typename T, typename ...CtorArgTypes>
inline std::shared_ptr<T> MakeSharedObject(CtorArgTypes&& ...ctorArgs)
{
// A reference counter to be used as the base class for all reference counted types.
class _ReferenceCounter
{
public:
// Constructor.
_ReferenceCounter() : m_rc(0) {}
// Destructor.
virtual ~_ReferenceCounter() {}
// Add a reference.
// Thread-safe.
size_t AddReference()
{
return ++m_rc;
}
// Remove a reference.
// Thread-safe.
size_t RemoveReference()
{
assert(m_rc.load() > 0);
return --m_rc;
}
// Return the reference count value
size_t GetReferenceCount()
{
return m_rc.load();
}
private:
std::atomic<size_t> m_rc;
};
// A smart pointer to a reference counted object
// T must be a type derived from _Reference_counter
template <class T>
class _ReferenceCounterSharedPtr final
{
typedef void(*_ReferenceCounterDeleter)(_ReferenceCounter* obj);
public:
// Constructor
_ReferenceCounterSharedPtr(T* ptr = nullptr, _ReferenceCounterDeleter deleter = nullptr) : m_objPtr(ptr), m_deleter(deleter)
{
Init();
}
// Copy constructor
_ReferenceCounterSharedPtr(const _ReferenceCounterSharedPtr& other) : m_objPtr(nullptr), m_deleter(nullptr)
{
*this = other;
}
// Move constructor
_ReferenceCounterSharedPtr(_ReferenceCounterSharedPtr&& other) : m_objPtr(nullptr), m_deleter(nullptr)
{
*this = std::move(other);
}
// Destructor
~_ReferenceCounterSharedPtr()
{
UnInitialize(m_objPtr, m_deleter);
}
// Assignment operator
_ReferenceCounterSharedPtr& operator=(const _ReferenceCounterSharedPtr& other)
{
if (this != &other)
{
T* oldPtr = m_objPtr;
_ReferenceCounterDeleter oldDeleter = m_deleter;
m_objPtr = other.m_objPtr;
m_deleter = other.m_deleter;
Init();
UnInitialize(oldPtr, oldDeleter);
}
return *this;
}
// Move-assignment operator
_ReferenceCounterSharedPtr& operator=(_ReferenceCounterSharedPtr&& other)
{
assert(this != &other);
T* oldPtr = m_objPtr;
_ReferenceCounterDeleter oldDeleter = m_deleter;
m_objPtr = other.m_objPtr;
m_deleter = other.m_deleter;
// No change to ref-count of the adopted pointer.
other.m_objPtr = nullptr;
other.m_deleter = nullptr;
UnInitialize(oldPtr, oldDeleter);
return *this;
}
// Conversion to a ReferenceCountedSharedPtr instance of a base type
template <typename Base, typename std::enable_if<std::is_base_of<Base, T>::value>::type* = nullptr>
operator _ReferenceCounterSharedPtr<Base>()
{
return _ReferenceCounterSharedPtr<Base>(m_objPtr, m_deleter);
}
T* operator->() const
{
return m_objPtr;
}
T& operator*() const
{
return *m_objPtr;
}
operator T*() const
{
return m_objPtr;
}
T* GetPtr() const
{
return m_objPtr;
}
private:
void Init()
{
static_assert(std::is_base_of<_ReferenceCounter, T>::value, "_ReferenceCounterSharedPtr<T> can only be used when _ReferenceCounter is a base type of T!");
if (m_objPtr != nullptr)
reinterpret_cast<_ReferenceCounter*>(m_objPtr)->AddReference();
}
static void UnInitialize(T* objPtr, _ReferenceCounterDeleter deleter)
{
static_assert(std::is_base_of<_ReferenceCounter, T>::value, "_ReferenceCounterSharedPtr<T> can only be used when _ReferenceCounter is a base type of T!");
if (objPtr != nullptr)
{
size_t refCountRemaining = reinterpret_cast<_ReferenceCounter*>(objPtr)->RemoveReference();
if (refCountRemaining == 0)
{
if (deleter != nullptr)
deleter(reinterpret_cast<_ReferenceCounter*>(objPtr));
else
delete objPtr;
}
}
}
private:
T* m_objPtr;
_ReferenceCounterDeleter m_deleter;
};
template <typename T>
bool operator==(const _ReferenceCounterSharedPtr<T>& first, const _ReferenceCounterSharedPtr<T>& second)
{
return first.GetPtr() == second.GetPtr();
}
// A simple vector implementation with a C ABI to allow usage across the library DLL boundary
// as STL vectors cannot be used across the DLL boundary
template <typename T>
class CNTK_API _SimpleVector final
{
template <typename ValueType>
friend CNTK_API bool operator==(const _SimpleVector<ValueType>& first, const _SimpleVector<ValueType>& second);
friend class CNTK::Function;
public:
_SimpleVector();
_SimpleVector(size_t numElements, const T& initVal = T());
~_SimpleVector();
_SimpleVector(const _SimpleVector& other);
_SimpleVector& operator=(const _SimpleVector& other);
_SimpleVector(_SimpleVector&& other);
_SimpleVector& operator=(_SimpleVector&& other);
T& operator[](size_t idx);
const T& operator[](size_t idx) const;
size_t Size() const;
T* Data();
const T* Data() const;
void PushBack(const T& value);
void PushBack(T&& value);
operator std::vector<T>() const
{
std::vector<T> retVector(Size());
for (size_t i = 0; i < Size(); ++i)
retVector[i] = this->operator[](i);
return retVector;
}
std::unordered_set<T> GetAsUnorderedSet(bool ensureUnique = true)
{
std::unordered_set<T> retSet;
for (size_t i = 0; i < Size(); ++i)
{
auto insertRet = retSet.insert(this->operator[](i));
if (ensureUnique && !insertRet.second)
RuntimeError("A _SimpleVector with duplicate elements cannot be converted to an unordered_set");
}
return retSet;
}
template <typename ContainerType, typename std::enable_if<std::is_same<ContainerType, std::vector<T>>::value ||
std::is_same<ContainerType, std::initializer_list<T>>::value ||
std::is_same<ContainerType, std::array<T, sizeof(ContainerType) / sizeof(T)>>::value>::type* = nullptr>
static _SimpleVector<T> CreateSimpleVector(const ContainerType& initList)
{
_SimpleVector<T> simpleVector(initList.size());
std::copy(initList.begin(), initList.end(), simpleVector.Data());
return simpleVector;
}
private:
std::vector<T>* m_vector;
};
template <typename ValueType>
CNTK_API bool operator==(const _SimpleVector<ValueType>& first, const _SimpleVector<ValueType>& second);
template <typename ValueType>
bool operator!=(const _SimpleVector<ValueType>& first, const _SimpleVector<ValueType>& second)
{
return !(first == second);
}
// A simple set implementation with a C ABI to allow usage across the library DLL boundary
// as STL sets cannot be used across the DLL boundary
template <typename KeyType>
class CNTK_API _SimpleSet final
{
friend class CNTK::CompositeFunction;
template <typename T>
friend CNTK_API bool operator==(const _SimpleSet<T>& first, const _SimpleSet<T>& second);
public:
_SimpleSet();
~_SimpleSet();
_SimpleSet(const _SimpleSet& other);
_SimpleSet& operator=(const _SimpleSet& other);
_SimpleSet(_SimpleSet&& other);
_SimpleSet& operator=(_SimpleSet&& other);
bool Insert(const KeyType& key);
bool Contains(const KeyType& key) const;
size_t Size() const;
operator _SimpleVector<KeyType>() const;
operator std::unordered_set<KeyType>() const
{
return ((_SimpleVector<KeyType>)(*this)).GetAsUnorderedSet();
}
static _SimpleSet<KeyType> CreateSimpleSet(const std::unordered_set<KeyType>& initSet)
{
_SimpleSet<KeyType> simpleSet;
for (auto iter = initSet.begin(); iter != initSet.end(); ++iter)
simpleSet.Insert(*iter);
return simpleSet;
}
private:
std::unordered_set<KeyType>* m_set;
};
template <typename KeyType>
CNTK_API bool operator==(const _SimpleSet<KeyType>& first, const _SimpleSet<KeyType>& second);
template <typename KeyType>
bool operator!=(const _SimpleSet<KeyType>& first, const _SimpleSet<KeyType>& second)
{
return !(first == second);
}
// A simple map implementation with a C ABI to allow usage across the library DLL boundary
// as STL maps cannot be used across the DLL boundary
template <typename KeyType, typename ValueType>
class CNTK_API _SimpleMap final
{
friend class CNTK::CompositeFunction;
friend class CNTK::Function;
public:
_SimpleMap();
~_SimpleMap();
_SimpleMap(const _SimpleMap& other);
_SimpleMap& operator=(const _SimpleMap& other);
_SimpleMap(_SimpleMap&& other);
_SimpleMap& operator=(_SimpleMap&& other);
ValueType& operator[](const KeyType& key);
const ValueType& operator[](const KeyType& key) const;
bool Insert(const KeyType& key, const ValueType& value);
bool Contains(const KeyType& key) const;
size_t Size() const;
_SimpleSet<KeyType> Keys() const;
static _SimpleMap<KeyType, ValueType> CreateSimpleMap(const std::unordered_map<KeyType, ValueType>& initMap)
{
_SimpleMap<KeyType, ValueType> simpleMap;
for (auto iter = initMap.begin(); iter != initMap.end(); ++iter)
simpleMap.Insert(iter->first, iter->second);
return simpleMap;
}
private:
std::unordered_map<KeyType, ValueType>* m_map;
};
auto objPtr = new T(std::forward<CtorArgTypes>(ctorArgs)...);
return std::shared_ptr<T>(objPtr, [](T* ptr) { delete ptr; });
}
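// A minimal usage sketch (the 'Widget' type is illustrative, not from this commit): the deleter
// lambda is captured at the MakeSharedObject call site and travels with the shared_ptr, so the
// matching 'delete' runs in the same module (and thus against the same CRT heap) as the 'new',
// even if the last reference is released inside another DLL.
static void MakeSharedObjectUsageSketch()
{
    struct Widget { int value; explicit Widget(int v) : value(v) {} };
    std::shared_ptr<Widget> w = CNTK::MakeSharedObject<Widget>(42);
    // 'w' may safely cross the DLL boundary; its deleter was baked in at the line above.
}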
// Forward declarations
class NDArrayView;
typedef _Internal::_ReferenceCounterSharedPtr<NDArrayView> NDArrayViewPtr;
typedef std::shared_ptr<NDArrayView> NDArrayViewPtr;
class NDMask;
typedef _Internal::_ReferenceCounterSharedPtr<NDMask> NDMaskPtr;
typedef std::shared_ptr<NDMask> NDMaskPtr;
class Value;
typedef _Internal::_ReferenceCounterSharedPtr<Value> ValuePtr;
typedef std::shared_ptr<Value> ValuePtr;
class Function;
typedef _Internal::_ReferenceCounterSharedPtr<Function> FunctionPtr;
typedef std::shared_ptr<Function> FunctionPtr;
inline wchar_t* CopyString(const wchar_t* source)
{
size_t len = wcslen(source) + 1;
wchar_t* copy = new wchar_t[len];
#ifdef _WIN32
wcscpy_s(copy, len, source);
#else
wcscpy(copy, source);
#endif
return copy;
}
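// A minimal usage sketch (the string literal is illustrative): CopyString returns raw
// 'new wchar_t[]' storage, so the receiver owns the buffer and must release it with delete[].
static void CopyStringUsageSketch()
{
    wchar_t* name = CNTK::CopyString(L"features");
    // ... hand 'name' across the DLL boundary ...
    delete[] name;
}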
}
namespace std {
template <typename T>
struct hash<CNTK::_Internal::_ReferenceCounterSharedPtr<T>>
{
size_t operator()(const CNTK::_Internal::_ReferenceCounterSharedPtr<T>& x) const
{
return std::hash<const void*>()(x.GetPtr());
}
};
class Learner;
typedef std::shared_ptr<Learner> LearnerPtr;
}

View file

@ -128,6 +128,7 @@
<ClInclude Include="API\CNTKLibrary.h" />
<ClInclude Include="API\CNTKLibraryInternals.h" />
<ClInclude Include="Function.h" />
<ClInclude Include="Learner.h" />
<ClInclude Include="Utils.h" />
<ClInclude Include="stdafx.h" />
<ClInclude Include="targetver.h" />
@ -140,6 +141,7 @@
</PrecompiledHeader>
</ClCompile>
<ClCompile Include="Function.cpp" />
<ClCompile Include="Learner.cpp" />
<ClCompile Include="NDArrayView.cpp" />
<ClCompile Include="NDMask.cpp" />
<ClCompile Include="stdafx.cpp">

View file

@ -10,6 +10,7 @@
<ClCompile Include="Variable.cpp" />
<ClCompile Include="Utils.cpp" />
<ClCompile Include="NDMask.cpp" />
<ClCompile Include="Learner.cpp" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="stdafx.h" />
@ -22,6 +23,7 @@
<Filter>API</Filter>
</ClInclude>
<ClInclude Include="Function.h" />
<ClInclude Include="Learner.h" />
</ItemGroup>
<ItemGroup>
<Filter Include="API">

View file

@ -14,7 +14,21 @@ namespace CNTK
return GPUDevice(0);
}
/*static*/ Axis Axis::DefaultDynamicAxis = Axis(L"defaultDynamicAxis");
/*static*/ Axis Axis::BatchAxis = Axis(L"batchAxis");
/*static*/ Axis Axis::AllAxes = Axis(L"allAxes");
/*static*/ const Axis& Axis::DefaultDynamicAxis()
{
static Axis s_defaultDynamicAxis(L"defaultDynamicAxis");
return s_defaultDynamicAxis;
}
/*static*/ const Axis& Axis::BatchAxis()
{
static Axis s_batchAxis(L"batchAxis");
return s_batchAxis;
}
/*static*/ const Axis& Axis::AllAxes()
{
static Axis s_allAxes(L"allAxes");
return s_allAxes;
}
}
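// A usage sketch: the well-known axes are now reached through accessors that return
// function-local statics rather than exported static data members (presumably to sidestep
// static initialization order and DLL export issues; that motivation is an assumption,
// the commit does not state it).
static void AxisUsageSketch()
{
    const CNTK::Axis& dynamicAxis = CNTK::Axis::DefaultDynamicAxis();
    const CNTK::Axis& batchAxis   = CNTK::Axis::BatchAxis();
    (void)dynamicAxis; (void)batchAxis; // the returned references remain valid for the program's lifetime
}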

View file

@ -17,40 +17,98 @@ bool g_shareNodeValueMatrices = true;
namespace CNTK
{
_Internal::_SimpleVector<Variable> Function::_Inputs() const
std::shared_ptr<std::vector<Variable>> Function::InputsImpl() const
{
const CompositeFunction* compositeFunction = dynamic_cast<const CompositeFunction*>(this);
std::vector<Variable> inputs;
if (compositeFunction == nullptr)
return m_inputs;
inputs = m_inputs;
else
return _Internal::_SimpleVector<Variable>::CreateSimpleVector(compositeFunction->DetermineInputs());
inputs = compositeFunction->DetermineInputs();
return std::shared_ptr<std::vector<Variable>>(new std::vector<Variable>(std::move(inputs)), [](std::vector<Variable>* ptr) { delete ptr; });
}
/*virtual*/ void Function::_ReplacePlaceholders(const _Internal::_SimpleMap<Placeholder, Variable>& placeholderReplacements, _Internal::_SimpleSet<const Function*>& visitedFunctions, _Internal::_SimpleSet<Placeholder>& replacedPlaceholders)
FunctionPtr Function::ReplacePlaceholders(const std::unordered_map<Placeholder, Variable>& placeholderReplacements)
{
visitedFunctions.Insert(this);
// Cannot be called on primitive functions
if (RootFunction() == nullptr)
InvalidArgument("ReplacePlaceholders should never be called on primitive functions");
for (auto iter = m_inputs.m_vector->begin(); iter != m_inputs.m_vector->end(); ++iter)
std::unordered_set<const Function*> visitedFunctions;
std::unordered_set<Placeholder> replacedPlaceholders;
ReplacePlaceholders(placeholderReplacements, visitedFunctions, replacedPlaceholders);
for (auto replacementPair : placeholderReplacements)
{
if (iter->IsPlaceholder())
if (replacedPlaceholders.find(replacementPair.first) == replacedPlaceholders.end())
InvalidArgument("At least one of the placeholders specified for replacement was not found in the function");
}
return this->shared_from_this();
}
// Placeholders can be replaced incrementally - i.e. not all placeholders need to be replaced in one go.
// The only requirement is that they must all be replaced before making any 'Forward' calls on the Function instance.
/*virtual*/ void Function::ReplacePlaceholders(const std::unordered_map<Placeholder, Variable>& placeholderReplacements,
std::unordered_set<const Function*>& visitedFunctions,
std::unordered_set<Placeholder>& replacedPlaceholders)
{
visitedFunctions.insert(this);
for (auto& inputVar : m_inputs)
{
if (inputVar.IsPlaceholder())
{
Placeholder placeholder(*iter);
if (placeholderReplacements.Contains(placeholder))
Placeholder placeholder(inputVar);
if (placeholderReplacements.find(placeholder) != placeholderReplacements.end())
{
*iter = placeholderReplacements[placeholder];
replacedPlaceholders.Insert(placeholder);
inputVar = placeholderReplacements.at(placeholder);
replacedPlaceholders.insert(placeholder);
}
}
else if ((iter->Kind() == VariableKind::Output) && !visitedFunctions.Contains(iter->Owner()))
iter->Owner()->_ReplacePlaceholders(placeholderReplacements, visitedFunctions, replacedPlaceholders);
else if (inputVar.IsOutput() && (visitedFunctions.find(inputVar.Owner().get()) == visitedFunctions.end()))
inputVar.Owner()->ReplacePlaceholders(placeholderReplacements, visitedFunctions, replacedPlaceholders);
}
}
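// A hypothetical sketch of incremental placeholder binding (the Placeholder/NDShape constructors
// and the implicit Placeholder-to-Variable conversion are assumptions, not taken from this commit):
// placeholders may be bound one at a time, but all of them must be bound before the first Forward call.
static void BindPlaceholderSketch(const CNTK::Variable& weights, const CNTK::Variable& actualInput)
{
    CNTK::Placeholder p(CNTK::NDShape({ 512 }));              // assumed constructor taking a shape
    auto proj = CNTK::Times(weights, p);                      // graph built against the placeholder
    proj = proj->ReplacePlaceholders({ { p, actualInput } }); // bind 'p'; returns the composite itself
}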
template <typename ElementType>
/*static*/ ComputationNodeBasePtr CompositeFunction::GetNode(const Variable& variable, Microsoft::MSR::CNTK::ComputationNetworkPtr& network, ComputationNetworkBuilder<ElementType>& builder, std::unordered_map<Variable, ComputationNodeBasePtr>& variableToNodeMap, std::unordered_map<Variable, bool>& isVariableRootMap)
// Replace any PlaceHolder Variables in the graph of Functions underlying 'this' CompositeFunction. All PlaceHolder variables
// should have been replaced before performing any Forward compute of 'this' Function.
/*virtual*/ void CompositeFunction::ReplacePlaceholders(const std::unordered_map<Placeholder, Variable>& placeholderReplacements,
std::unordered_set<const Function*>& visitedFunctions,
std::unordered_set<Placeholder>& replacedPlaceholders)
{
if (variableToNodeMap.find(variable) != variableToNodeMap.end())
return variableToNodeMap[variable];
RootFunction()->ReplacePlaceholders(placeholderReplacements, visitedFunctions, replacedPlaceholders);
// If any of the placeholders were replaced with Output variables, let's add the graph of Functions underneath each of those to the 'm_allPrimitiveFunctions' set
for (auto replacedPlaceholder : replacedPlaceholders)
{
auto replacingVariable = placeholderReplacements.at(replacedPlaceholder);
if (replacingVariable.IsOutput())
{
auto ownerFunc = replacingVariable.Owner();
std::unordered_set<FunctionPtr> visitedFunctions;
DetermineInputs(ownerFunc, visitedFunctions);
// Add the newly visited functions to 'm_allPrimitiveFunctions' set
m_allPrimitiveFunctions.insert(visitedFunctions.begin(), visitedFunctions.end());
}
}
}
// Recursively create a sub-network of ComputationNode instances corresponding to the graph of Functions
// underlying the specified 'variable' and return the ComputationNode instance that corresponds to the
// top level 'variable'
template <typename ElementType>
/*static*/ ComputationNodeBasePtr CompositeFunction::GetNode(const Variable& variable,
Microsoft::MSR::CNTK::ComputationNetworkPtr& network,
ComputationNetworkBuilder<ElementType>& builder,
std::unordered_map<Variable, ComputationNodeBasePtr>& variableToNodeMap,
std::unordered_map<Variable, bool>& isVariableRootMap)
{
auto iter = variableToNodeMap.find(variable);
if (iter != variableToNodeMap.end())
return iter->second;
// Let's add a null entry in the map for this variable, to break infinite recursion when processing recurrent graphs
variableToNodeMap[variable] = nullptr;
@ -66,10 +124,10 @@ namespace CNTK
auto matrix = variable.IsConstant() ? value->GetMatrix<ElementType>()->AsReference() : value->GetWritableMatrix<ElementType>()->AsReference();
computationNodePtr->Value() = std::move(matrix);
}
else if (variable.Kind() == VariableKind::Input)
else if (variable.IsInput())
{
// TODO: Specify dynamic axis
if (variable.IsSparseInput())
if (IsSparseInput(variable))
computationNodePtr = builder.CreateSparseInputNode(variable.Name(), AsTensorShape(variable.Shape()));
else
computationNodePtr = builder.CreateInputNode(variable.Name(), AsTensorShape(variable.Shape()));
@ -83,23 +141,27 @@ namespace CNTK
}
else
{
assert(variable.Kind() == VariableKind::Output);
assert(variable.IsOutput());
computationNodePtr = GetOutputVariableNode(variable, network, builder, variableToNodeMap, isVariableRootMap)->template As<ComputationNode<ElementType>>()->shared_from_this();
}
variableToNodeMap[variable] = computationNodePtr;
isVariableRootMap[variable] = (variable.Kind() == VariableKind::Output);
isVariableRootMap[variable] = variable.IsOutput();
return computationNodePtr;
}
template <typename ElementType>
/*static*/ ComputationNodeBasePtr CompositeFunction::GetOutputVariableNode(const Variable& variable, Microsoft::MSR::CNTK::ComputationNetworkPtr& network, ComputationNetworkBuilder<ElementType>& builder, std::unordered_map<Variable, ComputationNodeBasePtr>& variableToNodeMap, std::unordered_map<Variable, bool>& isVariableRootMap)
/*static*/ ComputationNodeBasePtr CompositeFunction::GetOutputVariableNode(const Variable& variable,
Microsoft::MSR::CNTK::ComputationNetworkPtr& network,
ComputationNetworkBuilder<ElementType>& builder,
std::unordered_map<Variable, ComputationNodeBasePtr>& variableToNodeMap,
std::unordered_map<Variable, bool>& isVariableRootMap)
{
assert(variable.Kind() == VariableKind::Output);
assert(variable.IsOutput());
Function* function = variable.Owner();
Function* function = variable.Owner().get();
ComputationNodeBasePtr computationNodePtr;
if (dynamic_cast<PrimitiveFunction*>(function) != nullptr)
if (dynamic_cast<PrimitiveFunction*>(function))
{
PrimitiveFunction* primitiveFunction = dynamic_cast<PrimitiveFunction*>(function);
@ -134,7 +196,7 @@ namespace CNTK
case PrimitiveOpType::CrossEntropyWithSoftmax:
computationNodePtr = builder.CrossEntropyWithSoftmax(input1Node, input0Node, function->Name());
break;
case PrimitiveOpType::PredictionError:
case PrimitiveOpType::ClassificationError:
computationNodePtr = builder.ErrorPrediction(input1Node, input0Node, function->Name());
break;
case PrimitiveOpType::Exp:
@ -180,8 +242,10 @@ namespace CNTK
break;
}
case PrimitiveOpType::Combine:
for (size_t i = 0; i < functionInputs.size(); ++i)
GetNode(functionInputs[i], network, builder, variableToNodeMap, isVariableRootMap);
// This operation is just a no-op and is a means to combine multiple functions to create a single Function
// whose outputs are a union of the outputs of the Functions being combined.
for (auto inputVar : functionInputs)
GetNode(inputVar, network, builder, variableToNodeMap, isVariableRootMap);
computationNodePtr = variableToNodeMap[variable];
@ -193,8 +257,8 @@ namespace CNTK
if (op != PrimitiveOpType::Combine)
{
for (size_t i = 0; i < functionInputs.size(); ++i)
isVariableRootMap[functionInputs[i]] = false;
for (auto inputVar : functionInputs)
isVariableRootMap[inputVar] = false;
}
}
else
@ -206,14 +270,14 @@ namespace CNTK
}
template <typename ElementType>
ComputationNetworkPtr CompositeFunction::GetComputationNetwork(const DeviceDescriptor& device, const _Internal::_SimpleSet<Variable>& backpropRoots)
ComputationNetworkPtr CompositeFunction::GetComputationNetwork(const DeviceDescriptor& device, const std::unordered_set<Variable>& backpropRoots)
{
if (m_computationNetwork != nullptr)
{
// TODO: We should either invalidate and re-adapt the network if the backpropRoots change compared to what was specified when the network
// was last constructed, or just recreate a new network.
// For now just disallow changing the backpropRoots after the network is created
if (m_currentBackpropRoots != *backpropRoots.m_set)
if (m_currentBackpropRoots != backpropRoots)
LogicError("Changing backprop roots across different Forward calls on a CNTK composite Function is currently unsupported");
// TODO: Support changing the device across different invocations of the forward method on a Function instance
@ -228,7 +292,7 @@ namespace CNTK
ComputationNetworkBuilder<ElementType> builder(*m_computationNetwork);
// TODO: We currently only support one backprop root
if (backpropRoots.Size() > 1)
if (backpropRoots.size() > 1)
LogicError("More than one backprop roots is currently unsupported");
ComputationNodeBasePtr backpropRootNode;
@ -237,52 +301,52 @@ namespace CNTK
auto rootFunction = RootFunction();
auto rootFunctionOutputs = rootFunction->Outputs();
std::vector<ComputationNodeBasePtr> forwardRootNodes;
for (size_t i = 0; i < rootFunctionOutputs.size(); ++i)
for (auto rootOutput : rootFunctionOutputs)
{
auto currentRootNode = GetNode(rootFunctionOutputs[i], m_computationNetwork, builder, m_variableToNodeMap, m_isVariableRootMap);
auto currentRootNode = GetNode(rootOutput, m_computationNetwork, builder, m_variableToNodeMap, m_isVariableRootMap);
forwardRootNodes.push_back(currentRootNode);
if (backpropRoots.Contains(rootFunctionOutputs[i]))
backpropRootNode = m_variableToNodeMap[rootFunctionOutputs[i]];
if (backpropRoots.find(rootOutput) != backpropRoots.end())
backpropRootNode = m_variableToNodeMap[rootOutput];
}
// If any of the function outputs is not a root node, we need to explicitly add it to the 'output' group of the ComputationNetwork
for (size_t i = 0; i < rootFunctionOutputs.size(); ++i)
for (auto rootOutput : rootFunctionOutputs)
{
if (!m_isVariableRootMap[rootFunctionOutputs[i]])
m_computationNetwork->AddToNodeGroup(L"output", m_variableToNodeMap[rootFunctionOutputs[i]]);
if (!m_isVariableRootMap[rootOutput])
m_computationNetwork->AddToNodeGroup(L"output", m_variableToNodeMap[rootOutput]);
}
m_currentBackpropRoots = backpropRoots;
// In case of recurrence, the inputs of some of the ComputationNodes are not attached due to cycles.
// Now attach those after we have created all ComputationNodes in the network
for (auto iter = m_variableToNodeMap.begin(); iter != m_variableToNodeMap.end(); ++iter)
for (auto varNodePair : m_variableToNodeMap)
{
auto currentComputationNodeInputs = iter->second->GetInputs();
auto currentComputationNodeInputs = varNodePair.second->GetInputs();
// TODO: Can any node other than a PastValue/FutureValue Function have a null input attached after the first pass is finished?
if (std::find(currentComputationNodeInputs.begin(), currentComputationNodeInputs.end(), nullptr) != currentComputationNodeInputs.end())
{
// We found a null input; this variable must correspond to a PastValue or FutureValue function
const PrimitiveFunction* primitiveFunc = dynamic_cast<const PrimitiveFunction*>(iter->first.Owner().GetPtr());
const PrimitiveFunction* primitiveFunc = dynamic_cast<const PrimitiveFunction*>(varNodePair.first.Owner().get());
if ((primitiveFunc == nullptr) || ((primitiveFunc->OpType() != PrimitiveOpType::PastValue) && (primitiveFunc->OpType() != PrimitiveOpType::FutureValue)))
InvalidArgument("Invalid Function graph detected; recurrence found at a Function that is not a PastValue/FutureValue function");
// The 2nd input of the PastValue/FutureValue function denotes the recurrent input
auto actualInput = m_variableToNodeMap[primitiveFunc->Inputs()[1]];
iter->second->AttachInputs({ actualInput });
varNodePair.second->AttachInputs({ actualInput });
}
}
m_computationNetwork->CompileNetwork();
// Verify that the shapes of the output Variables that we computed match the corresponding nodes in the ComputationNetwork
for (auto iter = m_variableToNodeMap.begin(); iter != m_variableToNodeMap.end(); ++iter)
for (auto varNodePair : m_variableToNodeMap)
{
if (iter->first.Kind() == VariableKind::Output)
if (varNodePair.first.IsOutput())
{
auto outputVar = iter->first;
auto outputVar = varNodePair.first;
auto computationNodePtr = m_variableToNodeMap[outputVar];
auto outputShape = outputVar.Shape();
auto computationNodeSampleLayout = computationNodePtr->GetSampleLayout();
@ -310,10 +374,10 @@ namespace CNTK
LogicError("The specified ElementType %s does not match the DataType %s", typeid(ElementType).name(), DataTypeName(value->Data()->GetDataType()));
// TODO: Is supplying dense data for an Input variable tagged as sparse a fatal error?
if (var.IsSparseInput() && !value->Data()->IsSparse())
if (IsSparseInput(var) && !value->Data()->IsSparse())
InvalidArgument("Dense input data supplied for a sparse input Variable");
if (var.IsSparseInput() && (value->Data()->GetStorageFormat() != StorageFormat::SparseCSC))
if (IsSparseInput(var) && (value->Data()->GetStorageFormat() != StorageFormat::SparseCSC))
InvalidArgument("Sparse Input data must be in SparseCSC format");
if (value->Data()->Shape().NumAxes() == var.Shape().NumAxes())
@ -397,7 +461,7 @@ namespace CNTK
layout->GetNumCols(),
AsCNTKImplDeviceId(value->Data()->Device()),
value->Data()->IsSparse() ? MatrixType::SPARSE : MatrixType::DENSE,
AsCNTKMatrixFormat(value->Data()->GetStorageFormat()));
AsCNTKImplMatrixFormat(value->Data()->GetStorageFormat()));
std::vector<size_t> sequencesShorterThanLongestSequence;
for (size_t i = 0; i < numSequences; ++i)
@ -442,8 +506,8 @@ namespace CNTK
{
// Just create a view over the existing matrix itself
auto tensorView = new TensorView<ElementType>(std::make_shared<Matrix<ElementType>>(matrix.AsReference()), AsTensorShape(valueDataShape));
auto data = NDArrayViewPtr(new NDArrayView(AsDataType<ElementType>(), AsDeviceDescriptor(matrix.GetDeviceId()), AsStorageFormat(matrix.GetFormat()), valueDataShape, true, tensorView), [](_ReferenceCounter* ptr) { delete ptr; });
return ValuePtr(new Value(data), [](_ReferenceCounter* ptr) { delete ptr; });
auto data = MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), AsDeviceDescriptor(matrix.GetDeviceId()), AsStorageFormat(matrix.GetFormat()), valueDataShape, true, tensorView);
return MakeSharedObject<Value>(data);
}
if (layout->GetNumCols() != matrix.GetNumCols())
@ -454,10 +518,10 @@ namespace CNTK
std::vector<size_t> sequenceLengths;
auto& layoutSequences = layout->GetAllSequences();
for (auto iter = layoutSequences.begin(); iter != layoutSequences.end(); ++iter)
for (auto sequenceInfo : layoutSequences)
{
if (iter->seqId != GAP_SEQUENCE_ID)
sequenceLengths.push_back(iter->GetNumTimeSteps());
if (sequenceInfo.seqId != GAP_SEQUENCE_ID)
sequenceLengths.push_back(sequenceInfo.GetNumTimeSteps());
}
// Reshuffle the data to unpack and uninterleave the CNTK-form data
@ -473,13 +537,13 @@ namespace CNTK
size_t targetColIdxForInvalidColumns = sequencesShorterThanLongestSequence.empty() ? 0 : (((sequencesShorterThanLongestSequence[0] + 1) * maxNumTimeSteps) - 1);
std::vector<ElementType> scatterIndicesVector(layout->GetNumCols(), (ElementType)targetColIdxForInvalidColumns);
size_t i = 0;
for (auto iter = layoutSequences.begin(); iter != layoutSequences.end(); ++iter)
for (auto sequenceInfo : layoutSequences)
{
if (iter->seqId != GAP_SEQUENCE_ID)
if (sequenceInfo.seqId != GAP_SEQUENCE_ID)
{
size_t targetParallelStreamIdx = iter->s;
size_t targetStartIdxInParallelStream = iter->tBegin;
for (size_t j = 0; j < iter->GetNumTimeSteps(); ++j)
size_t targetParallelStreamIdx = sequenceInfo.s;
size_t targetStartIdxInParallelStream = sequenceInfo.tBegin;
for (size_t j = 0; j < sequenceInfo.GetNumTimeSteps(); ++j)
scatterIndicesVector[((targetStartIdxInParallelStream + j) * layout->GetNumParallelSequences()) + targetParallelStreamIdx] = (ElementType)((i * maxNumTimeSteps) + j);
i++;
@ -493,106 +557,97 @@ namespace CNTK
NDMaskPtr mask;
if (!sequencesShorterThanLongestSequence.empty())
{
mask = NDMaskPtr(new NDMask({ maxNumTimeSteps, numSequences }, AsDeviceDescriptor(matrix.GetDeviceId())), [](_ReferenceCounter* ptr) { delete ptr; });
for (size_t i = 0; i < sequencesShorterThanLongestSequence.size(); ++i)
mask = MakeSharedObject<NDMask>(NDShape({ maxNumTimeSteps, numSequences }), AsDeviceDescriptor(matrix.GetDeviceId()));
for (auto shortSequenceIdx : sequencesShorterThanLongestSequence)
{
size_t shorterSequenceIdx = sequencesShorterThanLongestSequence[i];
mask->MaskSection({ sequenceLengths[shorterSequenceIdx], shorterSequenceIdx }, { NDShape::InferredDimension, 1 });
mask->MaskSection({ sequenceLengths[shortSequenceIdx], shortSequenceIdx }, { NDShape::InferredDimension, 1 });
}
}
auto tensorView = new TensorView<ElementType>(shuffledMatrixData, AsTensorShape(valueDataShape));
auto data = NDArrayViewPtr(new NDArrayView(AsDataType<ElementType>(), AsDeviceDescriptor(matrix.GetDeviceId()), StorageFormat::Dense, valueDataShape, true, tensorView), [](_ReferenceCounter* ptr) { delete ptr; });
return ValuePtr(new Value(data, mask), [](_ReferenceCounter* ptr) { delete ptr; });
auto data = MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), AsDeviceDescriptor(matrix.GetDeviceId()), StorageFormat::Dense, valueDataShape, true, tensorView);
return MakeSharedObject<Value>(data, mask);
}
void CompositeFunction::PopulateNetworkInputs(const _Internal::_SimpleMap<Variable, const ValuePtr>& arguments)
template <typename ElementType>
/*static*/ void CompositeFunction::PopulateComputationNodeValue(const std::pair<Variable, ValuePtr>& variableValue, ComputationNodeBasePtr& computationNode)
{
auto CNTKMatrixAndMBLayout = GetCNTKImplMatrixAndMBLayoutFromValueObject<ElementType>(variableValue.first, variableValue.second);
MBLayoutPtr layout = CNTKMatrixAndMBLayout.second;
auto& nodeData = computationNode->As<ComputationNode<ElementType>>()->Value();
// Switch the node matrix to the right matrix type
nodeData.SwitchToMatrixType(CNTKMatrixAndMBLayout.first->GetMatrixType(), CNTKMatrixAndMBLayout.first->GetFormat(), false);
nodeData.AssignValuesOf(*CNTKMatrixAndMBLayout.first);
computationNode->GetMBLayout()->CopyFrom(layout);
}
void CompositeFunction::PopulateNetworkInputs(const std::unordered_map<Variable, const ValuePtr>& arguments)
{
auto functionArguments = this->Arguments();
std::vector<ComputationNodeBasePtr> inputNodes;
for (auto iter = functionArguments.begin(); iter != functionArguments.end(); ++iter)
for (auto argument : functionArguments)
{
// Ensure we have values for all arguments of the function
if (!arguments.Contains(*iter))
if (arguments.find(argument) == arguments.end())
InvalidArgument("Value not specified for required Function Argument");
auto argumentComputationNode = m_variableToNodeMap[*iter];
auto argumentComputationNode = m_variableToNodeMap[argument];
inputNodes.push_back(argumentComputationNode);
ValuePtr argumentValue = arguments[*iter];
ValuePtr argumentValue = arguments.at(argument);
MBLayoutPtr layout;
switch (argumentValue->Data()->GetDataType())
{
case DataType::Float:
{
auto CNTKMatrixAndMBLayout = GetCNTKImplMatrixAndMBLayoutFromValueObject<float>(*iter, argumentValue);
layout = CNTKMatrixAndMBLayout.second;
auto& nodeData = argumentComputationNode->As<ComputationNode<float>>()->Value();
// Switch the node matrix to the right matrix type
nodeData.SwitchToMatrixType(CNTKMatrixAndMBLayout.first->GetMatrixType(), CNTKMatrixAndMBLayout.first->GetFormat(), false);
nodeData.AssignValuesOf(*CNTKMatrixAndMBLayout.first);
PopulateComputationNodeValue<float>({ argument, argumentValue }, argumentComputationNode);
break;
}
case DataType::Double:
{
auto CNTKMatrixAndMBLayout = GetCNTKImplMatrixAndMBLayoutFromValueObject<double>(*iter, argumentValue);
layout = CNTKMatrixAndMBLayout.second;
auto& nodeData = argumentComputationNode->As<ComputationNode<double>>()->Value();
// Switch the node matrix to the right matrix type
nodeData.SwitchToMatrixType(CNTKMatrixAndMBLayout.first->GetMatrixType(), CNTKMatrixAndMBLayout.first->GetFormat(), false);
nodeData.AssignValuesOf(*CNTKMatrixAndMBLayout.first);
PopulateComputationNodeValue<double>({ argument, argumentValue }, argumentComputationNode);
break;
}
default:
LogicError("Unsupported DataType %s", DataTypeName(argumentValue->Data()->GetDataType()));
break;
}
argumentComputationNode->GetMBLayout()->CopyFrom(layout);
}
m_computationNetwork->BumpEvalTimeStamp(inputNodes);
}
void CompositeFunction::PopulateNetworkGradients(const _Internal::_SimpleMap<Variable, const ValuePtr>& gradients)
template <typename ElementType>
/*static*/ void CompositeFunction::PopulateComputationNodeGradient(const std::pair<Variable, ValuePtr>& variableGradient, Microsoft::MSR::CNTK::ComputationNodeBasePtr& computationNode)
{
auto CNTKMatrixAndMBLayout = GetCNTKImplMatrixAndMBLayoutFromValueObject<ElementType>(variableGradient.first, variableGradient.second);
MBLayoutPtr layout = CNTKMatrixAndMBLayout.second;
auto nodeLayout = computationNode->GetMBLayout();
if (((layout == nullptr) != (nodeLayout == nullptr)) || ((layout != nullptr) && (*layout != *nodeLayout)))
InvalidArgument("The layout of the specified gradient Value in incompatible with the layout of the corresponding Variable computed during Forward call");
computationNode->As<ComputationNode<ElementType>>()->AssignGradient(*CNTKMatrixAndMBLayout.first);
}
// Assign the supplied gradients corresponding to the root(s) of the network to be backpropagated through the graph
void CompositeFunction::PopulateNetworkGradients(const std::unordered_map<Variable, const ValuePtr>& gradients)
{
auto functionOutputs = this->Outputs();
std::unordered_map<Variable, const ValuePtr>& gradientsValueMap = *gradients.m_map;
for (auto iter = gradientsValueMap.begin(); iter != gradientsValueMap.end(); ++iter)
for (auto gradientVarValuePair : gradients)
{
// Only gradients for roots of the function can be specified
if (std::find(functionOutputs.begin(), functionOutputs.end(), iter->first) == functionOutputs.end())
if (std::find(functionOutputs.begin(), functionOutputs.end(), gradientVarValuePair.first) == functionOutputs.end())
InvalidArgument("Gradients cannot be specified for a Variable that is not an Output of the Function");
auto outputComputationNode = m_variableToNodeMap[iter->first];
auto nodeLayout = outputComputationNode->GetMBLayout();
auto outputComputationNode = m_variableToNodeMap[gradientVarValuePair.first];
ValuePtr gradientValue = gradientVarValuePair.second;
ValuePtr gradientValue = iter->second;
MBLayoutPtr layout;
switch (gradientValue->Data()->GetDataType())
{
case DataType::Float:
{
auto CNTKMatrixAndMBLayout = GetCNTKImplMatrixAndMBLayoutFromValueObject<float>(iter->first, gradientValue);
layout = CNTKMatrixAndMBLayout.second;
if (((layout == nullptr) != (nodeLayout == nullptr)) || ((layout != nullptr) && (*layout != *nodeLayout)))
InvalidArgument("The layout of the specified gradient Value in incompatible with the layout of the corresponding Variable computed during Forward call");
outputComputationNode->As<ComputationNode<float>>()->ResetGradient(*CNTKMatrixAndMBLayout.first);
PopulateComputationNodeGradient<float>(gradientVarValuePair, outputComputationNode);
break;
}
case DataType::Double:
{
auto CNTKMatrixAndMBLayout = GetCNTKImplMatrixAndMBLayoutFromValueObject<double>(iter->first, gradientValue);
layout = CNTKMatrixAndMBLayout.second;
if (((layout == nullptr) != (nodeLayout == nullptr)) || ((layout != nullptr) && (*layout != *nodeLayout)))
InvalidArgument("The layout of the specified gradient Value in incompatible with the layout of the corresponding Variable computed during Forward call");
outputComputationNode->As<ComputationNode<double>>()->ResetGradient(*CNTKMatrixAndMBLayout.first);
PopulateComputationNodeGradient<double>(gradientVarValuePair, outputComputationNode);
break;
}
default:
LogicError("Unsupported DataType %s", DataTypeName(gradientValue->Data()->GetDataType()));
break;
@ -603,6 +658,8 @@ namespace CNTK
static NDShape GetValueShape(const Variable& var, const ComputationNodeBasePtr& computationNodePtr)
{
size_t outputValueNumAxes = var.Shape().NumAxes();
// Add the batch and dynamic axes if needed
if (computationNodePtr->GetMBLayout() != nullptr)
outputValueNumAxes += 2;
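// (For example: a Variable with sample shape [256] whose node carries an MBLayout of
//  20 time steps x 8 parallel sequences gets two extra axes, i.e. a Value shape of [256 x 20 x 8];
//  without an MBLayout the sample shape is used as-is.)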
@ -622,12 +679,12 @@ namespace CNTK
void CompositeFunction::GetNetworkOutputs(std::unordered_map<Variable, ValuePtr>& outputs)
{
// Now copy the Forward values of output nodes from the network to outputs' Value objects
for (auto iter = outputs.begin(); iter != outputs.end(); ++iter)
for (auto outputVarValuePair : outputs)
{
auto computationNodePtr = m_variableToNodeMap[iter->first];
auto outputValuePtr = iter->second;
auto computationNodePtr = m_variableToNodeMap[outputVarValuePair.first];
auto outputValuePtr = outputVarValuePair.second;
auto outputShape = GetValueShape(iter->first, computationNodePtr);
auto outputShape = GetValueShape(outputVarValuePair.first, computationNodePtr);
if (outputValuePtr != nullptr)
{
// TODO: The shape of the specified output Value object must match the actual output shape
@ -635,38 +692,28 @@ namespace CNTK
InvalidArgument("The shape %s of the specified Value object for output does not match the actual output shape %s", AsString(outputValuePtr->Data()->Shape()).c_str(), AsString(outputShape).c_str());
}
switch (iter->first.GetDataType())
ValuePtr nodeValue;
switch (outputVarValuePair.first.GetDataType())
{
case DataType::Float:
{
auto nodeValue = GetValueObjectFromCNTKImplMatrixAndMBLayout<float>(iter->first, computationNodePtr->As<ComputationNode<float>>()->Value(), computationNodePtr->GetMBLayout());
if (outputValuePtr == nullptr)
{
auto data = NDArrayViewPtr(new NDArrayView(iter->first.GetDataType(), outputShape, AsDeviceDescriptor(computationNodePtr->ValuePtr()->GetDeviceId())), [](_ReferenceCounter* ptr) { delete ptr; });
auto mask = (nodeValue->Mask() != nullptr) ? NDMaskPtr(new NDMask(nodeValue->Mask()->Shape(), nodeValue->Mask()->Device()), [](_ReferenceCounter* ptr) { delete ptr; }) : nullptr;
outputValuePtr = ValuePtr(new Value(data, mask), [](_ReferenceCounter* ptr) { delete ptr; });
}
outputValuePtr->CopyFrom(*nodeValue);
nodeValue = GetValueObjectFromCNTKImplMatrixAndMBLayout<float>(outputVarValuePair.first, computationNodePtr->As<ComputationNode<float>>()->Value(), computationNodePtr->GetMBLayout());
break;
}
case DataType::Double:
{
auto nodeValue = GetValueObjectFromCNTKImplMatrixAndMBLayout<double>(iter->first, computationNodePtr->As<ComputationNode<double>>()->Value(), computationNodePtr->GetMBLayout());
if (outputValuePtr == nullptr)
{
auto data = NDArrayViewPtr(new NDArrayView(iter->first.GetDataType(), outputShape, AsDeviceDescriptor(computationNodePtr->ValuePtr()->GetDeviceId())), [](_ReferenceCounter* ptr) { delete ptr; });
auto mask = (nodeValue->Mask() != nullptr) ? NDMaskPtr(new NDMask(nodeValue->Mask()->Shape(), nodeValue->Mask()->Device()), [](_ReferenceCounter* ptr) { delete ptr; }) : nullptr;
outputValuePtr = ValuePtr(new Value(data, mask), [](_ReferenceCounter* ptr) { delete ptr; });
}
outputValuePtr->CopyFrom(*nodeValue);
nodeValue = GetValueObjectFromCNTKImplMatrixAndMBLayout<double>(outputVarValuePair.first, computationNodePtr->As<ComputationNode<double>>()->Value(), computationNodePtr->GetMBLayout());
break;
}
default:
LogicError("Unsupported DataType %s", DataTypeName(iter->first.GetDataType()));
LogicError("Unsupported DataType %s", DataTypeName(outputVarValuePair.first.GetDataType()));
break;
}
outputs[iter->first] = outputValuePtr;
if (outputValuePtr == nullptr)
{
auto data = MakeSharedObject<NDArrayView>(outputVarValuePair.first.GetDataType(), outputShape, AsDeviceDescriptor(computationNodePtr->ValuePtr()->GetDeviceId()));
auto mask = (nodeValue->Mask() != nullptr) ? MakeSharedObject<NDMask>(nodeValue->Mask()->Shape(), nodeValue->Mask()->Device()) : nullptr;
outputValuePtr = MakeSharedObject<Value>(data, mask);
}
outputValuePtr->CopyFrom(*nodeValue);
outputs[outputVarValuePair.first] = outputValuePtr;
}
}
@ -674,20 +721,20 @@ namespace CNTK
{
auto networkInputs = this->Inputs();
// Now copy the gradient values of input nodes of the network to gradients' Value objects
for (auto iter = gradients.begin(); iter != gradients.end(); ++iter)
for (auto gradientVarValuePair : gradients)
{
// Only gradients corresponding to inputs of the network can be obtained
if (std::find(networkInputs.begin(), networkInputs.end(), iter->first) == networkInputs.end())
if (std::find(networkInputs.begin(), networkInputs.end(), gradientVarValuePair.first) == networkInputs.end())
InvalidArgument("Backpropagated gradient values can only be obtained for inputs of a Function");
// Gradients can only be obtained for parameter variables or input variables that NeedsGradient
if (!iter->first.NeedsGradient())
if (!gradientVarValuePair.first.NeedsGradient())
InvalidArgument("Gradient value incorrectly requested for an Output or Constant Variable, or an Input Variable with NeedsGradient setting of false");
auto computationNodePtr = m_variableToNodeMap[iter->first];
auto gradientValuePtr = iter->second;
auto computationNodePtr = m_variableToNodeMap[gradientVarValuePair.first];
auto gradientValuePtr = gradientVarValuePair.second;
auto gradientShape = GetValueShape(iter->first, computationNodePtr);
auto gradientShape = GetValueShape(gradientVarValuePair.first, computationNodePtr);
if (gradientValuePtr != nullptr)
{
// TODO: The shape of the specified output Value object must match the actual output shape
@ -698,50 +745,40 @@ namespace CNTK
if (!computationNodePtr->NeedsGradient())
LogicError("Backpropagated gradient value cannot be read from a ComputationNode that has NeedsGradient set to false");
switch (iter->first.GetDataType())
ValuePtr nodeValue;
switch (gradientVarValuePair.first.GetDataType())
{
case DataType::Float:
{
auto nodeValue = GetValueObjectFromCNTKImplMatrixAndMBLayout<float>(iter->first, computationNodePtr->As<ComputationNode<float>>()->Gradient(), computationNodePtr->GetMBLayout());
if (gradientValuePtr == nullptr)
{
auto data = NDArrayViewPtr(new NDArrayView(iter->first.GetDataType(), gradientShape, AsDeviceDescriptor(computationNodePtr->ValuePtr()->GetDeviceId())), [](_ReferenceCounter* ptr) { delete ptr; });
auto mask = NDMaskPtr((nodeValue->Mask() != nullptr) ? new NDMask(nodeValue->Mask()->Shape(), nodeValue->Mask()->Device()) : nullptr, [](_ReferenceCounter* ptr) { delete ptr; });
gradientValuePtr = ValuePtr(new Value(data, mask), [](_ReferenceCounter* ptr) { delete ptr; });
}
gradientValuePtr->CopyFrom(*nodeValue);
nodeValue = GetValueObjectFromCNTKImplMatrixAndMBLayout<float>(gradientVarValuePair.first, computationNodePtr->As<ComputationNode<float>>()->Gradient(), computationNodePtr->GetMBLayout());
break;
}
case DataType::Double:
{
auto nodeValue = GetValueObjectFromCNTKImplMatrixAndMBLayout<double>(iter->first, computationNodePtr->As<ComputationNode<double>>()->Gradient(), computationNodePtr->GetMBLayout());
if (gradientValuePtr == nullptr)
{
auto data = NDArrayViewPtr(new NDArrayView(iter->first.GetDataType(), gradientShape, AsDeviceDescriptor(computationNodePtr->ValuePtr()->GetDeviceId())), [](_ReferenceCounter* ptr) { delete ptr; });
auto mask = NDMaskPtr((nodeValue->Mask() != nullptr) ? new NDMask(nodeValue->Mask()->Shape(), nodeValue->Mask()->Device()) : nullptr, [](_ReferenceCounter* ptr) { delete ptr; });
gradientValuePtr = ValuePtr(new Value(data, mask), [](_ReferenceCounter* ptr) { delete ptr; });
}
gradientValuePtr->CopyFrom(*nodeValue);
nodeValue = GetValueObjectFromCNTKImplMatrixAndMBLayout<double>(gradientVarValuePair.first, computationNodePtr->As<ComputationNode<double>>()->Gradient(), computationNodePtr->GetMBLayout());
break;
}
default:
LogicError("Unsupported DataType %s", DataTypeName(iter->first.GetDataType()));
LogicError("Unsupported DataType %s", DataTypeName(gradientVarValuePair.first.GetDataType()));
break;
}
gradients[iter->first] = gradientValuePtr;
if (gradientValuePtr == nullptr)
{
auto data = MakeSharedObject<NDArrayView>(gradientVarValuePair.first.GetDataType(), gradientShape, AsDeviceDescriptor(computationNodePtr->ValuePtr()->GetDeviceId()));
auto mask = (nodeValue->Mask() != nullptr) ? MakeSharedObject<NDMask>(nodeValue->Mask()->Shape(), nodeValue->Mask()->Device()) : nullptr;
gradientValuePtr = MakeSharedObject<Value>(data, mask);
}
gradientValuePtr->CopyFrom(*nodeValue);
gradients[gradientVarValuePair.first] = gradientValuePtr;
}
}
/*virtual*/ BackPropStatePtr CompositeFunction::Forward(const _Internal::_SimpleMap<Variable, const ValuePtr>& arguments,
_Internal::_SimpleMap<Variable, ValuePtr>& outputs,
const _Internal::_SimpleSet<Variable>& outputsToRetainBackwardStateFor,
const DeviceDescriptor& computeDevice)
/*virtual*/ BackPropStatePtr CompositeFunction::Forward(const std::unordered_map<Variable, const ValuePtr>& arguments,
std::unordered_map<Variable, ValuePtr>& outputs,
const DeviceDescriptor& computeDevice,
const std::unordered_set<Variable>& outputsToRetainBackwardStateFor)
{
// TODO: How about zero argument functions?
// TODO: We need a better way to determine the ElementType for the network
auto dataType = arguments.m_map->begin()->second->Data()->GetDataType();
auto dataType = arguments.begin()->second->Data()->GetDataType();
if (dataType == DataType::Float)
GetComputationNetwork<float>(computeDevice, outputsToRetainBackwardStateFor);
else
@ -752,140 +789,119 @@ namespace CNTK
// Feed data into the arguments of the network
PopulateNetworkInputs(arguments);
std::unordered_set<Variable> functionOutputs = _Internal::_SimpleVector<Variable>::CreateSimpleVector(this->Outputs()).GetAsUnorderedSet();
std::unordered_set<Variable> functionOutputs(this->Outputs().begin(), this->Outputs().end());
std::vector<ComputationNodeBasePtr> outputsToEvaluate;
for (auto iter = outputs.m_map->begin(); iter != outputs.m_map->end(); ++iter)
for (auto outputVarValuePair : outputs)
{
// Ensure that only a subset of this function's outputs are being asked to be evaluated
if (functionOutputs.find(iter->first) == functionOutputs.end())
if (functionOutputs.find(outputVarValuePair.first) == functionOutputs.end())
InvalidArgument("Requested output is not an Ouptut of the Function");
auto outputComputationNode = m_variableToNodeMap[iter->first];
auto outputComputationNode = m_variableToNodeMap[outputVarValuePair.first];
outputsToEvaluate.push_back(outputComputationNode);
}
// The 'outputsToRetainBackwardStateFor' nodes also need to be evaluated if not already specified in 'outputs'
for (auto iter = outputsToRetainBackwardStateFor.m_set->begin(); iter != outputsToRetainBackwardStateFor.m_set->end(); ++iter)
for (auto rootVarForBackprop : outputsToRetainBackwardStateFor)
{
if (outputs.m_map->find(*iter) == outputs.m_map->end())
outputsToEvaluate.push_back(m_variableToNodeMap[*iter]);
if (outputs.find(rootVarForBackprop) == outputs.end())
outputsToEvaluate.push_back(m_variableToNodeMap[rootVarForBackprop]);
}
m_computationNetwork->ForwardProp(outputsToEvaluate);
GetNetworkOutputs(*(outputs.m_map));
GetNetworkOutputs(outputs);
// TODO: How to deal with the specified 'computeDevice'
return (outputsToRetainBackwardStateFor.Size() > 0) ? BackPropStatePtr(new CNTKBackPropState(this, { arguments.m_map->begin()->first, m_variableToNodeMap[arguments.m_map->begin()->first]->GetEvalTimeStamp() }), [](_ReferenceCounter* ptr) { delete ptr; }) : nullptr;
return (outputsToRetainBackwardStateFor.size() > 0) ? MakeSharedObject<CNTKBackPropState>(this->shared_from_this(), std::make_pair(arguments.begin()->first, m_variableToNodeMap[arguments.begin()->first]->GetEvalTimeStamp())) : nullptr;
}
/*virtual*/ void CompositeFunction::Backward(const BackPropStatePtr& state,
const _Internal::_SimpleMap<Variable, const ValuePtr>& rootGradientValues,
_Internal::_SimpleMap<Variable, ValuePtr>& backPropagatedGradientValuesForInputs)
const std::unordered_map<Variable, const ValuePtr>& rootGradientValues,
std::unordered_map<Variable, ValuePtr>& backPropagatedGradientValuesForInputs)
{
if ((state == nullptr) || (dynamic_cast<const CNTKBackPropState*>(state.GetPtr()) == nullptr))
auto backpropState = dynamic_cast<const CNTKBackPropState*>(state.get());
if (backpropState == nullptr)
InvalidArgument("Invalid backprop state specified");
// TODO: Support multiple concurrent backprop states
auto backpropState = dynamic_cast<const CNTKBackPropState*>(state.GetPtr());
if (backpropState->EvalTimeStamp().second != m_variableToNodeMap[backpropState->EvalTimeStamp().first]->GetEvalTimeStamp())
LogicError("The specified backprop state specified cannot be used for backpropagation as the Function's internal state was modified by subsequent Forward calls to the function."
"This is not a user error but a shortcoming of the current implementation where multiple independent backprop states are not simultaneously supported");
if (rootGradientValues.Size() > 1)
if (rootGradientValues.size() > 1)
LogicError("Currently gradient backprop from only one of the Function Outputs is supported");
// TODO: Avoid copying the data when possible
// Zero all gradients of nodes below the root nodes
for (auto iter = rootGradientValues.m_map->begin(); iter != rootGradientValues.m_map->end(); ++iter)
m_computationNetwork->ZeroInputGradients(m_variableToNodeMap[iter->first]);
for (auto rootGradientVarValuePair : rootGradientValues)
m_computationNetwork->ZeroInputGradients(m_variableToNodeMap[rootGradientVarValuePair.first]);
// Feed data into the arguments of the network
PopulateNetworkGradients(rootGradientValues);
// Backpropagate through the network
auto rootComputationNodePtr = m_variableToNodeMap[rootGradientValues.m_map->begin()->first];
auto rootComputationNodePtr = m_variableToNodeMap[rootGradientValues.begin()->first];
m_computationNetwork->GetNestedNetwork(rootComputationNodePtr)->Backprop(FrameRange(nullptr), true, true);
GetNetworkGradients(*(backPropagatedGradientValuesForInputs.m_map));
GetNetworkGradients(backPropagatedGradientValuesForInputs);
// TODO: How to deal with the specified 'computeDevice'
}
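// A hypothetical sketch of driving the Forward/Backward pair above from a caller. All names
// ('net', 'input', 'labels', 'loss', 'weights', the Value objects) are illustrative, and
// DeviceDescriptor::CPUDevice() is assumed to exist; this is not taken from this commit.
static void TrainStepSketch(const CNTK::FunctionPtr& net,
                            const CNTK::Variable& input,  const CNTK::ValuePtr& inputValue,
                            const CNTK::Variable& labels, const CNTK::ValuePtr& labelValue,
                            const CNTK::Variable& loss,   const CNTK::ValuePtr& rootGradient,
                            const CNTK::Variable& weights)
{
    std::unordered_map<CNTK::Variable, const CNTK::ValuePtr> args = { { input, inputValue }, { labels, labelValue } };
    std::unordered_map<CNTK::Variable, CNTK::ValuePtr> outputs = { { loss, nullptr } };      // nullptr lets CNTK allocate the output Value
    auto state = net->Forward(args, outputs, CNTK::DeviceDescriptor::CPUDevice(), { loss }); // retain state for backprop

    std::unordered_map<CNTK::Variable, const CNTK::ValuePtr> rootGradients = { { loss, rootGradient } }; // typically a Value of ones
    std::unordered_map<CNTK::Variable, CNTK::ValuePtr> inputGradients = { { weights, nullptr } };
    net->Backward(state, rootGradients, inputGradients);                                     // gradients land in 'inputGradients'
}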
/*virtual*/ void CompositeFunction::_ReplacePlaceholders(const _Internal::_SimpleMap<Placeholder, Variable>& placeholderReplacements, _Internal::_SimpleSet<const Function*>& visitedFunctions, _Internal::_SimpleSet<Placeholder>& replacedPlaceholders)
{
RootFunction()->_ReplacePlaceholders(placeholderReplacements, visitedFunctions, replacedPlaceholders);
// If any of the placeholders were replaced with Output variables, let's add the graph of function underneath each of those to 'm_allPrimitiveFunctions' set
for (auto iter = replacedPlaceholders.m_set->begin(); iter != replacedPlaceholders.m_set->end(); ++iter)
{
auto replacingVariable = placeholderReplacements[*iter];
if (replacingVariable.Kind() == VariableKind::Output)
{
auto ownerFunc = replacingVariable.Owner();
_Internal::_SimpleSet<FunctionPtr> visitedFunctions;
_DetermineInputs(ownerFunc, visitedFunctions);
// Add the newly visited functions to 'm_allPrimitiveFunctions' set
m_allPrimitiveFunctions.m_set->insert(visitedFunctions.m_set->begin(), visitedFunctions.m_set->end());
}
}
}
FunctionPtr Times(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name/* = L""*/)
{
return CompositeFunction::Create(new PrimitiveFunction(PrimitiveOpType::Times, { leftOperand, rightOperand }, Dictionary(), name), name);
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::Times, std::vector<Variable>({ leftOperand, rightOperand }), Dictionary(), name), name);
}
FunctionPtr Plus(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name/* = L""*/)
{
return CompositeFunction::Create(new PrimitiveFunction(PrimitiveOpType::Plus, { leftOperand, rightOperand }, Dictionary(), name), name);
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::Plus, std::vector<Variable>({ leftOperand, rightOperand }), Dictionary(), name), name);
}
FunctionPtr Sigmoid(const Variable& operand, const std::wstring& name/* = L""*/)
{
return CompositeFunction::Create(new PrimitiveFunction(PrimitiveOpType::Sigmoid, { operand }, Dictionary(), name), name);
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::Sigmoid, std::vector<Variable>({ operand }), Dictionary(), name), name);
}
FunctionPtr Tanh(const Variable& operand, const std::wstring& name/* = L""*/)
{
return CompositeFunction::Create(new PrimitiveFunction(PrimitiveOpType::Tanh, { operand }, Dictionary(), name), name);
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::Tanh, std::vector<Variable>({ operand }), Dictionary(), name), name);
}
FunctionPtr _Combine(const _Internal::_SimpleVector<FunctionPtr>& operands, const std::wstring& name/* = L""*/)
FunctionPtr Combine(const std::initializer_list<FunctionPtr>& operands, const std::wstring& name/* = L""*/)
{
_Internal::_SimpleSet<FunctionPtr> uniqueOperands;
std::unordered_set<FunctionPtr> uniqueOperands;
std::vector<Variable> inputs;
for (size_t i = 0; i < operands.Size(); ++i)
for (auto operand : operands)
{
if (uniqueOperands.Contains(operands[i]))
if (uniqueOperands.find(operand) != uniqueOperands.end())
LogicError("All function operands specified to Combine must be unique");
uniqueOperands.Insert(operands[i]);
auto currentFunctionOutputs = operands[i]->Outputs();
uniqueOperands.insert(operand);
auto currentFunctionOutputs = operand->Outputs();
std::copy(currentFunctionOutputs.begin(), currentFunctionOutputs.end(), std::back_inserter(inputs));
}
return CompositeFunction::Create(new PrimitiveFunction(PrimitiveOpType::Combine, inputs, Dictionary(), name), name);
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::Combine, inputs, Dictionary(), name), name);
}
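// Illustrative usage only (not part of this change): 'lossFunc' and 'errFunc' are assumed existing,
// distinct FunctionPtr objects; Combine yields a single Function exposing the outputs of both.
static FunctionPtr CombineUsageSketch(const FunctionPtr& lossFunc, const FunctionPtr& errFunc)
{
    // Duplicate operands would trigger the LogicError checked above
    return Combine({ lossFunc, errFunc }, L"lossAndError");
}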
FunctionPtr CrossEntropyWithSoftmax(const Variable& output, const Variable& labels, const std::wstring& name/* = L""*/)
{
return CompositeFunction::Create(new PrimitiveFunction(PrimitiveOpType::CrossEntropyWithSoftmax, { output, labels }, Dictionary(), name), name);
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::CrossEntropyWithSoftmax, std::vector<Variable>({ output, labels }), Dictionary(), name), name);
}
FunctionPtr PredictionError(const Variable& prediction, const Variable& labels, const std::wstring& name/* = L""*/)
FunctionPtr ClassificationError(const Variable& prediction, const Variable& labels, const std::wstring& name/* = L""*/)
{
return CompositeFunction::Create(new PrimitiveFunction(PrimitiveOpType::PredictionError, { prediction, labels }, Dictionary(), name), name);
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::ClassificationError, std::vector<Variable>({ prediction, labels }), Dictionary(), name), name);
}
FunctionPtr Exp(const Variable& operand, const std::wstring& name/* = L""*/)
{
return CompositeFunction::Create(new PrimitiveFunction(PrimitiveOpType::Exp, { operand }, Dictionary(), name), name);
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::Exp, std::vector<Variable>({ operand }), Dictionary(), name), name);
}
FunctionPtr PastValue(const Variable& initialState, const Variable& operand, size_t stepSize, const std::wstring& name/* = L""*/)
@ -895,7 +911,7 @@ namespace CNTK
auto additionalProperties = Dictionary();
additionalProperties[L"stepSize"] = DictionaryValue(stepSize);
return CompositeFunction::Create(new PrimitiveFunction(PrimitiveOpType::PastValue, { initialState, operand }, std::move(additionalProperties), name), name);
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::PastValue, std::vector<Variable>({ initialState, operand }), std::move(additionalProperties), name), name);
}
FunctionPtr FutureValue(const Variable& initialState, const Variable& operand, size_t stepSize, const std::wstring& name/* = L""*/)
@ -905,16 +921,16 @@ namespace CNTK
auto additionalProperties = Dictionary();
additionalProperties[L"stepSize"] = DictionaryValue(stepSize);
return CompositeFunction::Create(new PrimitiveFunction(PrimitiveOpType::FutureValue, { initialState, operand }, std::move(additionalProperties), name), name);
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::FutureValue, std::vector<Variable>({ initialState, operand }), std::move(additionalProperties), name), name);
}
FunctionPtr ElementTimes(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name/* = L""*/)
{
return CompositeFunction::Create(new PrimitiveFunction(PrimitiveOpType::ElementTimes, { leftOperand, rightOperand }, Dictionary(), name), name);
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::ElementTimes, std::vector<Variable>({ leftOperand, rightOperand }), Dictionary(), name), name);
}
FunctionPtr ReduceSum(const Variable& operand, const std::wstring& name/* = L""*/)
{
return CompositeFunction::Create(new PrimitiveFunction(PrimitiveOpType::ReduceSum, { operand }, Dictionary(), name), name);
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::ReduceSum, std::vector<Variable>({ operand }), Dictionary(), name), name);
}
}

Просмотреть файл

@ -3,6 +3,8 @@
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#include "stdafx.h"
#include "CNTKLibrary.h"
#include <iterator>
@ -19,7 +21,7 @@ namespace CNTK
Tanh,
Combine,
CrossEntropyWithSoftmax,
PredictionError,
ClassificationError,
Exp,
PastValue,
FutureValue,
@ -29,6 +31,7 @@ namespace CNTK
inline const char* PrimitiveOpTypeName(PrimitiveOpType opType)
{
// TODO: Put these in table form
if (opType == PrimitiveOpType::Plus)
return "Plus";
else if (opType == PrimitiveOpType::Times)
@ -41,8 +44,8 @@ namespace CNTK
return "Combine";
else if (opType == PrimitiveOpType::CrossEntropyWithSoftmax)
return "CrossEntropyWithSoftmax";
else if (opType == PrimitiveOpType::PredictionError)
return "PredictionError";
else if (opType == PrimitiveOpType::ClassificationError)
return "ClassificationError";
else if (opType == PrimitiveOpType::Exp)
return "Exp";
else if (opType == PrimitiveOpType::PastValue)
@ -65,17 +68,17 @@ namespace CNTK
{
}
virtual BackPropStatePtr Forward(const _Internal::_SimpleMap<Variable, const ValuePtr>& /*arguments*/,
_Internal::_SimpleMap<Variable, ValuePtr>& /*outputs*/,
const _Internal::_SimpleSet<Variable>& /*outputsToRetainBackwardStateFor*/,
const DeviceDescriptor& /*computeDevice*/) override
virtual BackPropStatePtr Forward(const std::unordered_map<Variable, const ValuePtr>& /*arguments*/,
std::unordered_map<Variable, ValuePtr>& /*outputs*/,
const DeviceDescriptor& /*computeDevice*/,
const std::unordered_set<Variable>& /*outputsToRetainBackwardStateFor*/) override
{
NOT_IMPLEMENTED;
}
virtual void Backward(const BackPropStatePtr& /*state*/,
const _Internal::_SimpleMap<Variable, const ValuePtr>& /*rootGradientValues*/,
_Internal::_SimpleMap<Variable, ValuePtr>& /*backPropagatedGradientValuesForInputs*/) override
const std::unordered_map<Variable, const ValuePtr>& /*rootGradientValues*/,
std::unordered_map<Variable, ValuePtr>& /*backPropagatedGradientValuesForInputs*/) override
{
NOT_IMPLEMENTED;
}
@ -91,6 +94,8 @@ namespace CNTK
}
private:
// The following helper functions are used to determine the output shape for different
// types of primitive operations accounting for broadcasting and reductions where applicable.
static NDShape UnaryElementwiseOpOutputShape(const NDShape& operandShape)
{
return operandShape;
@ -98,17 +103,17 @@ namespace CNTK
static NDShape BinaryElementwiseOpOutputShape(PrimitiveOpType op, const NDShape& leftOperandShape, const NDShape& rightOperandShape, bool broadcastAllowed = true)
{
auto& shapeWithSmallerNumAxes = (leftOperandShape.NumAxes() > rightOperandShape.NumAxes()) ? rightOperandShape : leftOperandShape;
auto& shapeWithLargerNumAxes = (leftOperandShape.NumAxes() > rightOperandShape.NumAxes()) ? leftOperandShape : rightOperandShape;
const auto& shapeWithSmallerNumAxes = (leftOperandShape.NumAxes() > rightOperandShape.NumAxes()) ? rightOperandShape : leftOperandShape;
const auto& shapeWithLargerNumAxes = (leftOperandShape.NumAxes() > rightOperandShape.NumAxes()) ? leftOperandShape : rightOperandShape;
size_t numOutputAxes = shapeWithLargerNumAxes.NumAxes();
std::vector<size_t> outputDims(numOutputAxes);
for (size_t i = 0; i < shapeWithSmallerNumAxes.NumAxes(); ++i)
{
if ((leftOperandShape[i] == NDShape::InferredDimension) && (rightOperandShape[i] == NDShape::InferredDimension))
outputDims[i] = NDShape::InferredDimension;
else if ((leftOperandShape[i] == NDShape::InferredDimension) && (rightOperandShape[i] != NDShape::InferredDimension))
else if (leftOperandShape[i] == NDShape::InferredDimension)
outputDims[i] = rightOperandShape[i];
else if ((leftOperandShape[i] != NDShape::InferredDimension) && (rightOperandShape[i] == NDShape::InferredDimension))
else if (rightOperandShape[i] == NDShape::InferredDimension)
outputDims[i] = leftOperandShape[i];
else
{
@ -126,7 +131,7 @@ namespace CNTK
return NDShape(std::move(outputDims));
}
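// Worked example (illustration only): with leftOperandShape = {3, 4} and rightOperandShape = {3},
// the axes of the lower-rank operand align with the leading axes of the higher-rank one, giving an
// output shape of {3, 4}; an InferredDimension on one side is filled in from the other side.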
static NDShape TimesOpOutputShape(const NDShape& leftOperandShape, const NDShape& rightOperandShape, bool broadcastAllowed = true)
static NDShape TimesOpOutputShape(const NDShape& leftOperandShape, const NDShape& rightOperandShape)
{
if (rightOperandShape.NumAxes() > 2)
RuntimeError("The right operand of a times operation can have at most 2 axes");
@ -166,6 +171,7 @@ namespace CNTK
return NDShape(std::move(outputDims));
}
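// Worked example (illustration only): a left operand of shape {outputDim, inputDim} times a right
// operand of shape {inputDim} yields {outputDim}, and a right operand of shape {inputDim, numCols}
// yields {outputDim, numCols}; the right operand may have at most 2 axes, as checked above.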
// TODO: Reconcile this with the ComputationNode::Validate functionality in core CNTK to avoid duplication of inference logic
static std::vector<Variable> GetOutputVariables(PrimitiveOpType op, const std::vector<Variable>& inputs, Function* owner)
{
std::vector<Variable> outputs;
@ -175,9 +181,9 @@ namespace CNTK
// We currently require that the inputs' dynamic axes if any match
std::vector<Axis> outputDynamicAxes = inputs[0].DynamicAxes();
for (size_t i = 1; i < inputs.size(); ++i)
for (auto inputVar : inputs)
{
auto currentInputDynamicAxes = inputs[i].DynamicAxes();
auto currentInputDynamicAxes = inputVar.DynamicAxes();
if (outputDynamicAxes.empty())
outputDynamicAxes = currentInputDynamicAxes;
else
@ -210,7 +216,7 @@ namespace CNTK
outputs.push_back(Variable(TimesOpOutputShape(inputs[0].Shape(), inputs[1].Shape()), outputDataType, owner, outputDynamicAxes));
break;
case PrimitiveOpType::CrossEntropyWithSoftmax:
case PrimitiveOpType::PredictionError:
case PrimitiveOpType::ClassificationError:
{
assert(inputs.size() == 2);
@ -274,65 +280,69 @@ namespace CNTK
private:
std::pair<Variable, int64_t> m_evalTimeStamp;
};
typedef _Internal::_ReferenceCounterSharedPtr<CNTKBackPropState> CNTKBackPropStatePtr;
typedef std::shared_ptr<CNTKBackPropState> CNTKBackPropStatePtr;
class CompositeFunction;
typedef _Internal::_ReferenceCounterSharedPtr<CompositeFunction> CompositeFunctionPtr;
typedef std::shared_ptr<CompositeFunction> CompositeFunctionPtr;
class CompositeFunction final : public Function
{
friend class Function;
template <typename T, typename ...CtorArgTypes>
friend inline std::shared_ptr<T> MakeSharedObject(CtorArgTypes&& ...ctorArgs);
public:
static CompositeFunctionPtr Create(const FunctionPtr& rootFunction, const std::wstring& name = L"")
{
_Internal::_SimpleSet<FunctionPtr> visitedFunctions;
std::unordered_set<FunctionPtr> visitedFunctions;
// Call _DetermineInputs to get the set of all functions in the graph
_DetermineInputs(rootFunction, visitedFunctions);
// Call DetermineInputs to get the set of all functions in the graph
DetermineInputs(rootFunction, visitedFunctions);
auto func = new CompositeFunction(rootFunction, std::move(visitedFunctions), name);
return CompositeFunctionPtr(func, [](_ReferenceCounter* ptr) { delete ptr; });
return MakeSharedObject<CompositeFunction>(rootFunction, std::move(visitedFunctions), name);
}
virtual BackPropStatePtr Forward(const _Internal::_SimpleMap<Variable, const ValuePtr>& arguments,
_Internal::_SimpleMap<Variable, ValuePtr>& outputs,
const _Internal::_SimpleSet<Variable>& outputsToRetainBackwardStateFor,
const DeviceDescriptor& computeDevice) override;
virtual BackPropStatePtr Forward(const std::unordered_map<Variable, const ValuePtr>& arguments,
std::unordered_map<Variable, ValuePtr>& outputs,
const DeviceDescriptor& computeDevice,
const std::unordered_set<Variable>& outputsToRetainBackwardStateFor) override;
virtual void Backward(const BackPropStatePtr& state,
const _Internal::_SimpleMap<Variable, const ValuePtr>& rootGradientValues,
_Internal::_SimpleMap<Variable, ValuePtr>& backPropagatedGradientValuesForInputs) override;
const std::unordered_map<Variable, const ValuePtr>& rootGradientValues,
std::unordered_map<Variable, ValuePtr>& backPropagatedGradientValuesForInputs) override;
private:
virtual void _ReplacePlaceholders(const _Internal::_SimpleMap<Placeholder, Variable>& placeholderReplacements, _Internal::_SimpleSet<const Function*>& visitedFunctions, _Internal::_SimpleSet<Placeholder>& replacedPlaceholders) override;
virtual void ReplacePlaceholders(const std::unordered_map<Placeholder, Variable>& placeholderReplacements,
std::unordered_set<const Function*>& visitedFunctions,
std::unordered_set<Placeholder>& replacedPlaceholders) override;
CompositeFunction(const FunctionPtr& rootFunction, _Internal::_SimpleSet<FunctionPtr>&& allPrimitiveFunctions, const std::wstring& name)
CompositeFunction(const FunctionPtr& rootFunction, std::unordered_set<FunctionPtr>&& allPrimitiveFunctions, const std::wstring& name)
: Function({}, rootFunction->Outputs(), rootFunction, name), m_allPrimitiveFunctions(std::move(allPrimitiveFunctions))
{
}
std::vector<Variable> DetermineInputs() const
{
_Internal::_SimpleSet<FunctionPtr> visitedFunctions;
return _DetermineInputs(RootFunction(), visitedFunctions);
std::unordered_set<FunctionPtr> visitedFunctions;
return DetermineInputs(RootFunction(), visitedFunctions);
}
static std::vector<Variable> _DetermineInputs(const FunctionPtr& rootFunction, _Internal::_SimpleSet<FunctionPtr>& visitedFunctions)
// Recursively traverses the Function graph underlying the 'rootFunction' to determine all the leaves (aka inputs) of the graph
static std::vector<Variable> DetermineInputs(const FunctionPtr& rootFunction, std::unordered_set<FunctionPtr>& visitedFunctions)
{
visitedFunctions.Insert(rootFunction);
visitedFunctions.insert(rootFunction);
std::vector<Variable> inputs;
std::vector<Variable> rootFunctionInputs = rootFunction->Inputs();
for (size_t i = 0; i < rootFunctionInputs.size(); ++i)
for (auto rootInput : rootFunctionInputs)
{
Variable currentInput = rootFunctionInputs[i];
if (currentInput.Kind() != VariableKind::Output)
inputs.push_back(currentInput);
else if (!visitedFunctions.Contains(currentInput.Owner()))
if (!rootInput.IsOutput())
inputs.push_back(rootInput);
else if (visitedFunctions.find(rootInput.Owner()) == visitedFunctions.end())
{
FunctionPtr function = currentInput.Owner();
std::vector<Variable> functionInputs = _DetermineInputs(function, visitedFunctions);
FunctionPtr function = rootInput.Owner();
std::vector<Variable> functionInputs = DetermineInputs(function, visitedFunctions);
std::copy(functionInputs.begin(), functionInputs.end(), std::back_inserter(inputs));
}
}
@ -341,7 +351,7 @@ namespace CNTK
}
template <typename ElementType>
Microsoft::MSR::CNTK::ComputationNetworkPtr GetComputationNetwork(const DeviceDescriptor& device, const _Internal::_SimpleSet<Variable>& backpropRoots);
Microsoft::MSR::CNTK::ComputationNetworkPtr GetComputationNetwork(const DeviceDescriptor& device, const std::unordered_set<Variable>& backpropRoots);
template <typename ElementType>
static Microsoft::MSR::CNTK::ComputationNodeBasePtr GetOutputVariableNode(const Variable& variable, Microsoft::MSR::CNTK::ComputationNetworkPtr& network, Microsoft::MSR::CNTK::ComputationNetworkBuilder<ElementType>& builder, std::unordered_map<Variable, Microsoft::MSR::CNTK::ComputationNodeBasePtr>& variableToNodeMap, std::unordered_map<Variable, bool>& isVariableRootMap);
@ -349,8 +359,13 @@ namespace CNTK
template <typename ElementType>
static Microsoft::MSR::CNTK::ComputationNodeBasePtr GetNode(const Variable& variable, Microsoft::MSR::CNTK::ComputationNetworkPtr& network, Microsoft::MSR::CNTK::ComputationNetworkBuilder<ElementType>& builder, std::unordered_map<Variable, Microsoft::MSR::CNTK::ComputationNodeBasePtr>& variableToNodeMap, std::unordered_map<Variable, bool>& isVariableRootMap);
void PopulateNetworkInputs(const _Internal::_SimpleMap<Variable, const ValuePtr>& arguments);
void PopulateNetworkGradients(const _Internal::_SimpleMap<Variable, const ValuePtr>& gradients);
template <typename ElementType>
static void PopulateComputationNodeValue(const std::pair<Variable, ValuePtr>& variableValue, Microsoft::MSR::CNTK::ComputationNodeBasePtr& computationNode);
void PopulateNetworkInputs(const std::unordered_map<Variable, const ValuePtr>& arguments);
template <typename ElementType>
static void PopulateComputationNodeGradient(const std::pair<Variable, ValuePtr>& variableGradient, Microsoft::MSR::CNTK::ComputationNodeBasePtr& computationNode);
void PopulateNetworkGradients(const std::unordered_map<Variable, const ValuePtr>& gradients);
void GetNetworkOutputs(std::unordered_map<Variable, ValuePtr>& outputs);
void GetNetworkGradients(std::unordered_map<Variable, ValuePtr>& gradients);
@ -362,10 +377,23 @@ namespace CNTK
static ValuePtr GetValueObjectFromCNTKImplMatrixAndMBLayout(Variable var, const Microsoft::MSR::CNTK::Matrix<ElementType>& matrix, const Microsoft::MSR::CNTK::MBLayoutPtr& layout);
private:
_Internal::_SimpleSet<FunctionPtr> m_allPrimitiveFunctions;
// Set of all primitive functions in the graph underlying 'this' Function. Also keeps the primitive Function objects alive
// by holding strong references to them
std::unordered_set<FunctionPtr> m_allPrimitiveFunctions;
// A map from Variable objects to ComputationNode objects in the ComputationNetwork instance that implements 'this' Composite Function
std::unordered_map<Variable, Microsoft::MSR::CNTK::ComputationNodeBasePtr> m_variableToNodeMap;
// A map that tells whether a Variable in the graph underlying 'this' Function is a root of the graph
std::unordered_map<Variable, bool> m_isVariableRootMap;
Microsoft::MSR::CNTK::ComputationNetworkPtr m_computationNetwork;
// The backpropRoots specified in the most recent 'Forward' call on 'this' Function.
// This indicates for which of its roots 'this' Function has retained the required intermediate
// state from the previous 'Forward' call, so that gradients can be backpropagated from them
// in the next 'Backward' call.
std::unordered_set<Variable> m_currentBackpropRoots;
};
}

Просмотреть файл

@ -0,0 +1,464 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#include "Learner.h"
#include "TensorView.h"
#include "Utils.h"
#define UPDATE_FUNCTION \
switch (smoothedGradientValue->Data()->GetDataType()) \
{ \
case DataType::Float: \
Update<float>(parameter, smoothedGradientValue, gradientValue, parameterValue, trainingSampleCount); \
break; \
case DataType::Double: \
Update<double>(parameter, smoothedGradientValue, gradientValue, parameterValue, trainingSampleCount); \
break; \
default: \
NOT_IMPLEMENTED; \
}
using namespace Microsoft::MSR::CNTK;
using namespace std;
namespace CNTK
{
template <typename ElementType>
/*static*/ shared_ptr<const Matrix<ElementType>> LearnerBase::GetMatrix(const NDArrayViewPtr arrayView)
{
return arrayView->GetMatrix<ElementType>();
}
template <typename ElementType>
/*static*/ shared_ptr<Matrix<ElementType>> LearnerBase::GetWritableMatrix(NDArrayViewPtr arrayView)
{
return arrayView->GetWritableMatrix<ElementType>();
}
template <typename ElementType>
/*static*/ const TensorView<ElementType>* LearnerBase::GetTensorView(const NDArrayViewPtr arrayView)
{
return arrayView->GetTensorView<ElementType>();
}
/*static*/ bool LearnerBase::HasNan(const ValuePtr& value, const char* name)
{
const auto& data = value->Data();
switch (data->GetDataType())
{
case DataType::Float:
return data->GetMatrix<float>()->HasNan(name);
case DataType::Double:
return data->GetMatrix<double>()->HasNan(name);
default:
LogicError("Unsupported DataType %s", DataTypeName(data->GetDataType()));
}
}
/*static*/ void LearnerBase::Print(const ValuePtr& value, const char* msg)
{
const auto& data = value->Data();
switch (data->GetDataType())
{
case DataType::Float:
data->GetMatrix<float>()->Print(msg);
break;
case DataType::Double:
data->GetMatrix<double>()->Print(msg);
break;
default:
LogicError("Unsupported DataType %s", DataTypeName(data->GetDataType()));
}
}
// Clips gradients to prevent outliers.
template <typename ElementType>
void LearnerBase::ClipGradient(Matrix<ElementType>& gradient, size_t actualMBSize) const
{
if (m_additionalOptions.gradientClippingThresholdPerSample != numeric_limits<double>::infinity())
{
double maxGradientPerMB = m_additionalOptions.gradientClippingThresholdPerSample * actualMBSize;
if (m_additionalOptions.gradientClippingWithTruncation)
gradient.InplaceTruncate(ElementType(maxGradientPerMB));
else
{
// norm2 normalized
double gradientNorm = gradient.FrobeniusNorm();
if (gradientNorm > maxGradientPerMB)
{
double normFactor = maxGradientPerMB / gradientNorm;
gradient *= ElementType(normFactor);
}
}
}
}
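// Standalone sketch (illustration only, plain C++): the scale factor applied by the norm-based
// branch above; when the gradient's Frobenius norm exceeds thresholdPerSample * minibatch size,
// the gradient is rescaled so that its norm equals that limit, otherwise it is left unchanged.
static double ClippedGradientScaleSketch(double gradientNorm, double thresholdPerSample, size_t actualMBSize)
{
    double maxGradientPerMB = thresholdPerSample * actualMBSize;
    return (gradientNorm > maxGradientPerMB) ? (maxGradientPerMB / gradientNorm) : 1.0;
}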
// Performs additional preprocessing before calling the update method
// (gradient clipping and L2 regularization depending on the additional learning parameters).
template <typename ElementType>
void LearnerBase::PreProcess(const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t actualMBSize) const
{
const auto& gradientMatrix = gradientValue->Data()->GetWritableMatrix<ElementType>();
// clipping gradients to prevent outliers
ClipGradient<ElementType>(*gradientMatrix, actualMBSize);
// L2 regularizer
if (m_additionalOptions.l2RegularizationWeight > 0)
{
// multiply by actualMBSize so that it's invariant to minibatch size since learning rate is per sample
auto weight = ElementType(m_additionalOptions.l2RegularizationWeight * actualMBSize);
const auto& parameterMatrix = parameterValue->Data()->GetWritableMatrix<ElementType>();
Matrix<ElementType>::ScaleAndAdd(weight, *parameterMatrix, *gradientMatrix);
}
}
// Performs additional postprocessing after the update method has been executed
// (noise injection and L1 regularization specified by the additional learning parameters).
template <typename ElementType>
void LearnerBase::PostProcess(const Variable& parameter, const ValuePtr& gradientValue,
const ValuePtr& parameterValue, size_t actualMBSize) const
{
const auto& parameterMatrix = parameterValue->Data()->GetWritableMatrix<ElementType>();
if (m_additionalOptions.gaussianNoiseInjectionStdDev > 0)
{
const auto& gradientMatrix = gradientValue->Data()->GetWritableMatrix<ElementType>();
Matrix<ElementType> sgdUpdateNoise((DEVICEID_TYPE)parameterMatrix->GetDeviceId());
// get the gradient structure since gradient is sparse
sgdUpdateNoise.SetValue(*gradientMatrix);
auto noiseStdDev = ElementType(m_additionalOptions.gaussianNoiseInjectionStdDev);
// reset its value to random
sgdUpdateNoise.SetGaussianRandomValue(ElementType(0.0), noiseStdDev);
Matrix<ElementType>::ScaleAndAdd(ElementType(1.0), sgdUpdateNoise, *parameterMatrix);
}
// L1 regularizer with proximal gradient descent method
if (m_additionalOptions.l1RegularizationWeight > 0)
{
auto learningRate = ElementType(ParameterDependentLearningRate(parameter));
// multiply by actualMBSize so that it's invariant to minibatch size since learning rate is per sample
auto weight = ElementType(learningRate * m_additionalOptions.l1RegularizationWeight * actualMBSize);
parameterValue->Data()->GetWritableMatrix<ElementType>()->InplaceSoftThreshold(weight);
}
}
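// Standalone sketch (illustration only) of the proximal step behind InplaceSoftThreshold above:
// each parameter is shrunk towards zero by 'weight' and set to zero once its magnitude falls below it.
static double SoftThresholdSketch(double parameter, double weight)
{
    if (parameter > weight)
        return parameter - weight;
    if (parameter < -weight)
        return parameter + weight;
    return 0.0;
}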
template <typename ElementType>
/*static*/ TensorView<ElementType>* LearnerBase::GetWritableTensorView(NDArrayViewPtr arrayView)
{
return arrayView->GetWritableTensorView<ElementType>();
}
LearnerBase::LearnerBase(const unordered_set<Variable>& parameters, const DeviceDescriptor& device)
: Learner(parameters),
m_learningRatePerSample(0.0),
m_sampleCount(0)
{
const unordered_set<Variable>& parameterSet = parameters;
for (const auto& parameter : parameterSet)
{
// TODO: using the same device to allocate data for all smoothed gradients. Is this correct?
// Should the device be specified on a per-parameter basis?
NDArrayViewPtr view;
if (parameter.GetDataType() == DataType::Float)
{
view = MakeSharedObject<NDArrayView>(0.0f, parameter.Shape(), device);
}
else
{
view = MakeSharedObject<NDArrayView>(0.0, parameter.Shape(), device);
}
m_smoothedGradientValues.insert(make_pair(parameter, MakeSharedObject<Value>(view)));
m_additionalOptions.learningRateMultipliers.insert(make_pair(parameter, 1.0));
}
}
void LearnerBase::ResetSmoothedGradients()
{
for (const auto& parameter : Parameters())
{
const auto& smoothedGradientValue = m_smoothedGradientValues.at(parameter);
const auto& data = smoothedGradientValue->Data();
switch (data->GetDataType())
{
case DataType::Float:
data->SetValue(0.0f);
break;
case DataType::Double:
data->SetValue(0.0);
break;
default:
LogicError("Unsupported DataType %s", ::CNTK::DataTypeName(data->GetDataType()));
}
}
}
/*virtual*/ bool LearnerBase::Update(const unordered_map<Variable, ValuePtr>& parameterValues,
const unordered_map<Variable, const ValuePtr>& gradientValues,
size_t trainingSampleCount) /*override*/
{
// make sure trainingSampleCount is a valid value
assert(trainingSampleCount > 0);
for (const auto& parameter : Parameters())
{
const auto& smoothedGradientValue = m_smoothedGradientValues.at(parameter);
const auto& gradientValue = gradientValues.at(parameter);
const auto& parameterValue = parameterValues.at(parameter);
// TODO: make this a runtime parameter.
#if DUMPOUTPUT
LOGPRINTF(stderr, "Update_%ls\n", parameter.Name().c_str());
#endif
#ifdef _DEBUG
if (HasNan(smoothedGradientValue, "TrainOneEpoch/UpdateWeights/Learner::Update(): "))
LogicError("%ls has NaNs in smoothedGradient.", parameter.Name().c_str());
#endif
#if DUMPOUTPUT
LOGPRINTF(stderr, "learnRatePerSample=%0.8f, momentum=%0.8f, actualMBSize=%ld\n",
m_learningRatePerSample, m_momentumPerSample, trainingSampleCount);
LOGPRINTF(stderr, "GradUpdateType()=%s, GradientUpdateNoiseStd()=%0.8f\n",
LearnerType().c_str(), m_GaussianNoiseInjectStd);
Print(gradientValue, "Gradient Update");
Print(smoothedGradientValue, "Smoothed Gradient Input");
#endif
UPDATE_FUNCTION;
#if DUMPOUTPUT
Print(parameterValue, "Parameter Update");
#endif
#ifdef _DEBUG
if (HasNan(parameterValue, "TrainOneEpoch/UpdateWeights/Learner::Update(): "))
LogicError("%ls has NaNs in parameter values after parameter update.", parameter.Name().c_str());
#endif
}
m_sampleCount += trainingSampleCount;
return false;
}
template <typename ElementType>
void LearnerBase::Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const
{
PreProcess<ElementType>(gradientValue, parameterValue, trainingSampleCount);
Update(parameter, smoothedGradientValue, gradientValue, parameterValue, trainingSampleCount);
PostProcess<ElementType>(parameter, gradientValue, parameterValue, trainingSampleCount);
}
string LearnerBase::LearnerType() const
{
auto name = typeid(*this).name();
if (strncmp(name, "class ", 6) == 0)
{
// On Windows, the type name contains "class" prefix.
// Return the actual name, omitting the prefix.
return &name[6];
}
return name;
}
/*virtual*/ Dictionary LearnerBase::GetCheckpointState() const /*override*/
{
NOT_IMPLEMENTED; // Until the new checkpointing is fully fleshed out, nobody should be calling this.
Dictionary checkpoint;
for (const auto& parameter : Parameters())
{
// TODO: parameter name is not guaranteed to be unique. Instead, all serializable objects
// need to expose "UId" property -- a persistent unique internal name.
// Switch to UId as soon as it's available.
if (checkpoint.Contains(parameter.Name()))
{
LogicError("Parameter names must be unique");
}
const auto& smoothedGradientValue = m_smoothedGradientValues.at(parameter);
// Potentially, could store things like dimensions, element size, format, etc., but
// that seems to be redundant, since all of that is passed in the constructor.
checkpoint[parameter.Name()] = SerializeToVector(smoothedGradientValue->Data());
}
return checkpoint;
}
/*virtual*/ void LearnerBase::RestoreFromCheckpoint(const Dictionary& checkpoint) /*override*/
{
NOT_IMPLEMENTED; // Until the new checkpointing is fully fleshed out, nobody should be calling this.
for (const auto& parameter : Parameters())
{
if (!checkpoint.Contains(parameter.Name()))
{
LogicError("Checkpoint does not contain state for parameter %ls", parameter.Name().c_str());
}
const auto& smoothedGradientValue = m_smoothedGradientValues.at(parameter);
const DictionaryValue& state = checkpoint[parameter.Name()];
const auto& data = smoothedGradientValue->Data();
DeserializeFromVector(data, state.GetValue<vector<DictionaryValue>>());
}
}
/*virtual*/ void LearnerSGD::Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const /*override*/
{
UPDATE_FUNCTION;
}
template <typename ElementType>
void LearnerSGD::Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const
{
UNUSED(trainingSampleCount);
const auto& smoothedGradientMatrix = GetWritableMatrix<ElementType>(smoothedGradientValue->Data());
const auto& gradientMatrix = GetWritableMatrix<ElementType>(gradientValue->Data());
const auto& parameterMatrix = GetWritableMatrix<ElementType>(parameterValue->Data());
const auto& learningRate = ElementType(ParameterDependentLearningRate(parameter));
// TODO: break up the NormalGrad into 3 different functions, each with its own set of parameters
// (one for vanilla SGD, the other for momentum SGD, and the third one for NAG).
smoothedGradientMatrix->NormalGrad(*gradientMatrix, *parameterMatrix,
learningRate, ElementType(m_momentumPerSample), m_useNesterovAcceleration);
}
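// Standalone per-element sketch (illustration only) of a common momentum-SGD formulation; the exact
// scaling applied inside NormalGrad (and its Nesterov variant) may differ, so this is indicative only.
static void MomentumSgdStepSketch(double& smoothedGradient, double& parameter,
                                  double gradient, double learningRate, double momentum)
{
    // Accumulate a momentum-weighted history of per-sample steps, then move the parameter along it
    smoothedGradient = momentum * smoothedGradient + (1.0 - momentum) * learningRate * gradient;
    parameter -= smoothedGradient;
}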
LearnerAdaGrad::LearnerAdaGrad(const unordered_set<Variable>& parameters, bool needAveMultiplier, const DeviceDescriptor& device)
: LearnerBase(parameters, device),
m_needAveMultiplier(needAveMultiplier)
{
}
/*virtual*/ void LearnerAdaGrad::Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const /*override*/
{
UPDATE_FUNCTION;
}
template <typename ElementType>
void LearnerAdaGrad::Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const
{
UNUSED(trainingSampleCount);
const auto& smoothedGradientMatrix = GetWritableMatrix<ElementType>(smoothedGradientValue->Data());
const auto& gradientMatrix = GetWritableMatrix<ElementType>(gradientValue->Data());
const auto& parameterMatrix = GetWritableMatrix<ElementType>(parameterValue->Data());
auto learningRate = ElementType(ParameterDependentLearningRate(parameter));
auto aveMultiplier = smoothedGradientMatrix->Adagrad(*gradientMatrix, m_needAveMultiplier);
Matrix<ElementType>::ScaleAndAdd(ElementType(-learningRate / aveMultiplier), *gradientMatrix, *parameterMatrix);
}
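// Standalone per-element sketch (illustration only) of the classic AdaGrad rule that the Adagrad()
// call above implements in matrix form; the aveMultiplier handling is CNTK-specific and omitted here.
// Assumes <cmath> is available for std::sqrt.
static void AdaGradStepSketch(double& accumulatedSquaredGradient, double& parameter,
                              double gradient, double learningRate, double epsilon = 1e-8)
{
    accumulatedSquaredGradient += gradient * gradient;
    parameter -= learningRate * gradient / (std::sqrt(accumulatedSquaredGradient) + epsilon);
}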
LearnerFSAdaGrad::LearnerFSAdaGrad(const unordered_set<Variable>& parameters, const DeviceDescriptor& device)
: LearnerMomentumSGD(parameters, device)
{
}
/*virtual*/ void LearnerFSAdaGrad::Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const /*override*/
{
UPDATE_FUNCTION;
}
template <typename ElementType>
void LearnerFSAdaGrad::Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const
{
UNUSED(trainingSampleCount);
const auto& smoothedGradientMatrix = GetWritableMatrix<ElementType>(smoothedGradientValue->Data());
const auto& gradientMatrix = GetWritableMatrix<ElementType>(gradientValue->Data());
const auto& parameterMatrix = GetWritableMatrix<ElementType>(parameterValue->Data());
//const double momentum = MomentumPerMB(m_momentumPerSample, trainingSampleCount);
auto learningRate = ElementType(ParameterDependentLearningRate(parameter));
smoothedGradientMatrix->FSAdagrad(trainingSampleCount, *gradientMatrix, *parameterMatrix,
learningRate, ElementType(m_momentumPerSample));
}
LearnerRMSProp::LearnerRMSProp(const unordered_set<Variable>& parameters,
double gamma, double inc, double dec, double max, double min,
bool needAveMultiplier, const DeviceDescriptor& device)
: LearnerBase(parameters, device),
m_gamma(gamma), m_inc(inc), m_dec(dec), m_max(max), m_min(min),
m_needAveMultiplier(needAveMultiplier)
{
}
/*virtual*/ void LearnerRMSProp::Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const /*override*/
{
UPDATE_FUNCTION;
}
template <typename ElementType>
void LearnerRMSProp::Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const
{
UNUSED(trainingSampleCount);
const auto& smoothedGradientMatrix = GetWritableMatrix<ElementType>(smoothedGradientValue->Data());
const auto& gradientMatrix = GetWritableMatrix<ElementType>(gradientValue->Data());
const auto& parameterMatrix = GetWritableMatrix<ElementType>(parameterValue->Data());
auto learningRate = ElementType(ParameterDependentLearningRate(parameter));
auto aveMultiplier = smoothedGradientMatrix->RmsProp(*gradientMatrix,
ElementType(m_gamma), ElementType(m_inc),
ElementType(m_max), ElementType(m_dec),
ElementType(m_min), m_needAveMultiplier);
Matrix<ElementType>::ScaleAndAdd(ElementType(-learningRate / aveMultiplier), *gradientMatrix, *parameterMatrix);
}
// Explicit template instantiations
template shared_ptr<Matrix<float>> LearnerBase::GetWritableMatrix<float>(const NDArrayViewPtr arrayView);
template shared_ptr<Matrix<double>> LearnerBase::GetWritableMatrix<double>(const NDArrayViewPtr arrayView);
LearnerPtr SGDLearner(const unordered_set<Variable>& parameters, const DeviceDescriptor& device)
{
return MakeSharedObject<LearnerSGD>(parameters, device);
}
LearnerPtr MomentumSGDLearner(const unordered_set<Variable>& parameters, const DeviceDescriptor& device)
{
return MakeSharedObject<LearnerMomentumSGD>(parameters, device);
}
LearnerPtr NesterovLearner(const unordered_set<Variable>& parameters, const DeviceDescriptor& device)
{
return MakeSharedObject<LearnerNesterov>(parameters, device);
}
LearnerPtr AdaGradLearner(const unordered_set<Variable>& parameters, bool needAveMultiplier, const DeviceDescriptor& device)
{
return MakeSharedObject<LearnerAdaGrad>(parameters, needAveMultiplier, device);
}
LearnerPtr FSAdaGradLearner(const unordered_set<Variable>& parameters, const DeviceDescriptor& device)
{
return MakeSharedObject<LearnerFSAdaGrad>(parameters, device);
}
LearnerPtr RMSPropLearner(const unordered_set<Variable>& parameters,
double gamma, double inc, double dec, double max, double min, bool needAveMultiplier,
const DeviceDescriptor& device)
{
return MakeSharedObject<LearnerRMSProp>(parameters, gamma, inc, dec, max, min, needAveMultiplier, device);
}
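// Illustrative usage sketch (not part of this change): 'parameters', 'parameterValues' and
// 'gradientValues' are assumed to already exist and to be keyed by the same Variable objects.
static void LearnerUsageSketch(const std::unordered_set<Variable>& parameters,
                               const std::unordered_map<Variable, ValuePtr>& parameterValues,
                               const std::unordered_map<Variable, const ValuePtr>& gradientValues,
                               size_t minibatchSampleCount)
{
    auto learner = SGDLearner(parameters, DeviceDescriptor::DefaultDevice());
    learner->Update(parameterValues, gradientValues, minibatchSampleCount);
}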
}

Просмотреть файл

@ -0,0 +1,224 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#include "stdafx.h"
#include "CNTKLibrary.h"
namespace CNTK
{
// A collection of additional options that are applicable for all standard learners
// (after these options are set, they retain their value for the entire lifespan of a learner).
struct AdditionalLearningOptions
{
double l1RegularizationWeight = 0.0;
double l2RegularizationWeight = 0.0;
double gaussianNoiseInjectionStdDev = 0.0;
bool gradientClippingWithTruncation = false;
double gradientClippingThresholdPerSample = 0.0;
std::unordered_map<Variable, double> learningRateMultipliers;
};
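// Example (illustration only): a caller might set, e.g., l2RegularizationWeight = 0.0005 and
// gradientClippingThresholdPerSample = 1.0 (hypothetical values) on an AdditionalLearningOptions
// instance and hand it to a learner via SetAdditionalOptions below.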
// An abstract base class at the root of the standard learners hierarchy
// It implements most of the learner functionality, except for the actual update function,
// and adds a few pre-/postprocessing methods (which are invoked before and after the update).
class LearnerBase : public Learner
{
public:
CNTK_API virtual bool Update(const std::unordered_map<Variable, ValuePtr>& parameterValues,
const std::unordered_map<Variable, const ValuePtr>& gradientValues,
size_t trainingSampleCount) override final;
CNTK_API virtual Dictionary GetCheckpointState() const override;
CNTK_API virtual void RestoreFromCheckpoint(const Dictionary& checkpoint) override;
CNTK_API void SetAdditionalOptions(const AdditionalLearningOptions& additionalOptions)
{
m_additionalOptions = additionalOptions;
}
// TODO: should this be called ResetMomentum?
// needed for BlockMomentumSGD to reset SGD momentum after aggregation.
CNTK_API void ResetSmoothedGradients();
// TODO: move learning rate and momentum scheduling and adjustment functionality
// inside the learner and drop these setters.
void SetLearningRate(double value) { m_learningRatePerSample = value; }
protected:
LearnerBase(const std::unordered_set<Variable>& parameters,
const DeviceDescriptor& device = DeviceDescriptor::DefaultDevice());
virtual void Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const = 0;
double ParameterDependentLearningRate(const Variable& parameter) const
{
return m_learningRatePerSample * m_additionalOptions.learningRateMultipliers.at(parameter);
}
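// For example (illustration only): with m_learningRatePerSample = 0.01 and a multiplier of 0.1
// registered for a given parameter, that parameter is updated with an effective per-sample rate of
// 0.001, while parameters keeping the default multiplier of 1.0 use the full 0.01.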
std::string LearnerType() const;
double m_learningRatePerSample;
AdditionalLearningOptions m_additionalOptions;
std::unordered_map<Variable, ValuePtr> m_smoothedGradientValues;
// The following four static protected methods expose private methods of the NDArrayView class
// (which declares LearnerBase as a friend class), so that they are available to subclasses.
template <typename ElementType>
static std::shared_ptr<const Microsoft::MSR::CNTK::Matrix<ElementType>> GetMatrix(const NDArrayViewPtr arrayView);
template <typename ElementType>
static std::shared_ptr<Microsoft::MSR::CNTK::Matrix<ElementType>> GetWritableMatrix(NDArrayViewPtr arrayView);
template <typename ElementType>
static const Microsoft::MSR::CNTK::TensorView<ElementType>* GetTensorView(const NDArrayViewPtr arrayView);
template <typename ElementType>
static Microsoft::MSR::CNTK::TensorView<ElementType>* GetWritableTensorView(NDArrayViewPtr arrayView);
template <typename ElementType>
void ClipGradient(Microsoft::MSR::CNTK::Matrix<ElementType>& gradient, size_t actualMBSize) const;
// Performs additional preprocessing before calling the update method
// (gradient clipping and L2 regularization depending on the additional learning parameters).
template <typename ElementType>
void PreProcess(const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t actualMBSize) const;
// Performs additional postprocessing after the update method has been executed
// (noise injection and L1 regularization specified by the additional learning parameters).
template <typename ElementType>
void PostProcess(const Variable& parameter, const ValuePtr& gradientValue,
const ValuePtr& parameterValue, size_t actualMBSize) const;
private:
// Templatized update function; it invokes preprocess and postprocess using the provided
// template parameter and also invokes the virtual Update method implemented in one of the subclasses.
template <typename ElementType>
void Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const;
// TODO: make these functions friends of NDArrayView and move to Utils?
static bool HasNan(const ValuePtr& value, const char* name);
static void Print(const ValuePtr& value, const char* msg);
size_t m_sampleCount;
};
// Vanilla gradient descent optimization algorithm.
class LearnerSGD : public LearnerBase
{
public:
LearnerSGD(const std::unordered_set<Variable>& parameters,
const DeviceDescriptor& device = DeviceDescriptor::DefaultDevice())
: LearnerBase(parameters, device),
m_momentumPerSample(0.0),
m_useNesterovAcceleration(false)
{
}
protected:
virtual void Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const override;
template <typename ElementType>
void Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const;
double m_momentumPerSample;
bool m_useNesterovAcceleration;
};
// SGD optimization with momentum.
class LearnerMomentumSGD : public LearnerSGD
{
public:
LearnerMomentumSGD(const std::unordered_set<Variable>& parameters,
const DeviceDescriptor& device = DeviceDescriptor::DefaultDevice())
: LearnerSGD(parameters, device)
{
}
void SetMomentum(double value) { m_momentumPerSample = value; }
};
// Nesterov's accelerated gradient descent.
class LearnerNesterov : public LearnerSGD
{
public:
LearnerNesterov(const std::unordered_set<Variable>& parameters,
const DeviceDescriptor& device = DeviceDescriptor::DefaultDevice())
: LearnerSGD(parameters, device)
{
m_useNesterovAcceleration = true;
}
};
class LearnerAdaGrad : public LearnerBase
{
public:
LearnerAdaGrad(const std::unordered_set<Variable>& parameters, bool needAveMultiplier,
const DeviceDescriptor& device = DeviceDescriptor::DefaultDevice());
protected:
bool m_needAveMultiplier;
virtual void Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const override;
template <typename ElementType>
void Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const;
};
class LearnerFSAdaGrad : public LearnerMomentumSGD
{
public:
LearnerFSAdaGrad(const std::unordered_set<Variable>& parameters,
const DeviceDescriptor& device = DeviceDescriptor::DefaultDevice());
protected:
virtual void Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const override;
template <typename ElementType>
void Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const;
};
class LearnerRMSProp : public LearnerBase
{
public:
LearnerRMSProp(const std::unordered_set<Variable>& parameters,
double gamma, double inc, double dec, double max, double min, bool needAveMultiplier,
const DeviceDescriptor& device = DeviceDescriptor::DefaultDevice());
protected:
double m_gamma;
double m_inc;
double m_dec;
double m_max;
double m_min;
bool m_needAveMultiplier;
virtual void Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const override;
template <typename ElementType>
void Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const;
};
}

Просмотреть файл

@ -17,9 +17,9 @@ namespace CNTK
{
template <typename ElementType>
static TensorView<ElementType>* AllocateTensorView(const NDShape& viewShape,
const DeviceDescriptor& device,
void* dataBuffer,
size_t bufferSizeInBytes)
const DeviceDescriptor& device,
void* dataBuffer,
size_t bufferSizeInBytes)
{
if (dataBuffer == nullptr)
InvalidArgument("Cannot create a NDArrayView over a null data buffer");
@ -33,10 +33,10 @@ namespace CNTK
}
static void* AllocateTensorView(CNTK::DataType dataType,
const NDShape& viewShape,
const DeviceDescriptor& device,
void* dataBuffer,
size_t bufferSizeInBytes)
const NDShape& viewShape,
const DeviceDescriptor& device,
void* dataBuffer,
size_t bufferSizeInBytes)
{
switch (dataType)
{
@ -60,7 +60,7 @@ namespace CNTK
matrixDims.second,
AsCNTKImplDeviceId(device),
IsSparseStorageFormat(storageType) ? MatrixType::SPARSE : MatrixType::DENSE,
AsCNTKMatrixFormat(storageType));
AsCNTKImplMatrixFormat(storageType));
return new TensorView<ElementType>(matrix, AsTensorShape(viewShape));
}
@ -99,8 +99,22 @@ namespace CNTK
}
NDArrayView::NDArrayView(CNTK::DataType dataType, const DeviceDescriptor& device, CNTK::StorageFormat storageType, const NDShape& viewShape, bool readOnly, void* tensorView)
: m_dataType(dataType), m_device(device), m_storageFormat(storageType), m_viewShape(viewShape), m_isReadOnly(readOnly), m_tensorView(tensorView)
: m_dataType(dataType), m_device(device), m_storageFormat(storageType), m_viewShape(viewShape), m_isReadOnly(readOnly)
{
m_tensorView = std::shared_ptr<void>(tensorView, [this](void*) {
switch (m_dataType)
{
case DataType::Float:
delete GetTensorView<float>();
break;
case DataType::Double:
delete GetTensorView<double>();
break;
default:
LogicError("Unsupported DataType %s", DataTypeName(m_dataType));
break;
}
});
}
NDArrayView::NDArrayView(CNTK::DataType dataType, CNTK::StorageFormat storageType, const NDShape& viewShape, const DeviceDescriptor& device)
@ -108,6 +122,10 @@ namespace CNTK
{
}
NDArrayView::~NDArrayView()
{
}
void NDArrayView::SetValue(float value)
{
if (IsSparse())
@ -124,22 +142,6 @@ namespace CNTK
GetWritableMatrix<double>()->SetValue(value);
}
NDArrayView::~NDArrayView()
{
switch (m_dataType)
{
case DataType::Float:
delete GetTensorView<float>();
break;
case DataType::Double:
delete GetTensorView<double>();
break;
default:
LogicError("Unsupported DataType %s", DataTypeName(m_dataType));
break;
}
}
template <typename ElementType>
/*static*/ std::shared_ptr<Matrix<ElementType>> NDArrayView::GetMatrixImpl(const TensorView<ElementType>* tensorView, size_t rowColSplitPoint)
{
@ -150,7 +152,8 @@ namespace CNTK
size_t splitPoint = rowColSplitPoint;
if (splitPoint == NDArrayView::AutoSelectRowColSplitPoint)
{
// Determine the split point
// Determine the split point by checking which of the axes can be
// folded and selecting the non-foldable axis as the split point
std::vector<bool> dimsToDrop(tensorShape.GetRank(), false);
for (size_t k = 1; k < tensorShape.GetRank(); ++k)
if (tensorShape.CanFlatten(k))
@ -195,9 +198,9 @@ namespace CNTK
const TensorView<ElementType>* NDArrayView::GetTensorView() const
{
if (AsDataType<ElementType>() != m_dataType)
LogicError("NDArrayView::GetWritableTensorView: The specified ElementType %s does not match the DataType %s", typeid(ElementType).name(), DataTypeName(m_dataType));
LogicError("NDArrayView::GetTensorView: The specified ElementType %s does not match the DataType %s", typeid(ElementType).name(), DataTypeName(m_dataType));
return (const TensorView<ElementType>*)(m_tensorView);
return (const TensorView<ElementType>*)(m_tensorView.get());
}
template <typename ElementType>
@ -211,7 +214,7 @@ namespace CNTK
NDArrayViewPtr NDArrayView::DeepClone(bool readOnly/* = false*/) const
{
NDArrayViewPtr newView(new NDArrayView(this->GetDataType(), this->GetStorageFormat(), this->Shape(), this->Device()), [](_ReferenceCounter* ptr) { delete ptr; });
NDArrayViewPtr newView = MakeSharedObject<NDArrayView>(this->GetDataType(), this->GetStorageFormat(), this->Shape(), this->Device());
switch (m_dataType)
{
case DataType::Float:
@ -234,9 +237,7 @@ namespace CNTK
}
newView->m_isReadOnly = readOnly;
return NDArrayViewPtr(newView, [](_ReferenceCounter* ptr) {
delete ptr;
});
return newView;
}
void NDArrayView::CopyFrom(const NDArrayView& source)
@ -285,8 +286,7 @@ namespace CNTK
break;
}
auto aliasView = new NDArrayView(GetDataType(), Device(), GetStorageFormat(), Shape(), IsReadOnly() || readOnly, tensorView);;
return NDArrayViewPtr(aliasView, [](_ReferenceCounter* ptr) { delete ptr; });
return MakeSharedObject<NDArrayView>(GetDataType(), Device(), GetStorageFormat(), Shape(), IsReadOnly() || readOnly, tensorView);
}
// TODO: This could actually be strided?
@ -316,19 +316,18 @@ namespace CNTK
}
template <typename ElementType>
NDArrayViewPtr NDArrayView::RandomUniform(const NDShape& shape, double rangeStart, double rangeEnd, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::DefaultDevice()*/)
NDArrayViewPtr NDArrayView::RandomUniform(const NDShape& shape, double rangeBegin, double rangeEnd, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::DefaultDevice()*/)
{
auto matrixDims = GetMatrixDimensions(shape);
auto randomUniformMatrix = std::make_shared<Matrix<ElementType>>(Matrix<ElementType>::RandomUniform(matrixDims.first, matrixDims.second, AsCNTKImplDeviceId(device), (ElementType)rangeStart, (ElementType)rangeEnd, seed));
auto randomUniformMatrix = std::make_shared<Matrix<ElementType>>(Matrix<ElementType>::RandomUniform(matrixDims.first, matrixDims.second, AsCNTKImplDeviceId(device), (ElementType)rangeBegin, (ElementType)rangeEnd, seed));
auto tensorView = new TensorView<ElementType>(randomUniformMatrix, AsTensorShape(shape));
auto view = new NDArrayView(AsDataType<ElementType>(), device, StorageFormat::Dense, shape, false, tensorView);
return NDArrayViewPtr(view, [](_ReferenceCounter* ptr) { delete ptr; });
return MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), device, StorageFormat::Dense, shape, false, tensorView);
}
// Explicit template instantiations
template CNTK_API NDArrayViewPtr NDArrayView::RandomUniform<float>(const NDShape& shape, double rangeStart, double rangeEnd, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::DefaultDevice()*/);
template CNTK_API NDArrayViewPtr NDArrayView::RandomUniform<double>(const NDShape& shape, double rangeStart, double rangeEnd, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::DefaultDevice()*/);
template CNTK_API NDArrayViewPtr NDArrayView::RandomUniform<float>(const NDShape& shape, double rangeBegin, double rangeEnd, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::DefaultDevice()*/);
template CNTK_API NDArrayViewPtr NDArrayView::RandomUniform<double>(const NDShape& shape, double rangeBegin, double rangeEnd, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::DefaultDevice()*/);
template CNTK_API const float* NDArrayView::DataBuffer<float>() const;
template CNTK_API const double* NDArrayView::DataBuffer<double>() const;
@ -339,8 +338,10 @@ namespace CNTK
template std::shared_ptr<const Matrix<float>> NDArrayView::GetMatrix(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/) const;
template std::shared_ptr<const Matrix<double>> NDArrayView::GetMatrix(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/) const;
template std::shared_ptr<Matrix<float>> NDArrayView::GetWritableMatrix(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/);
template std::shared_ptr<Matrix<double>> NDArrayView::GetWritableMatrix(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/);
template std::shared_ptr<Matrix<float>> NDArrayView::GetWritableMatrix<float>(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/);
template std::shared_ptr<Matrix<double>> NDArrayView::GetWritableMatrix<double>(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/);
template TensorView<float>* NDArrayView::GetWritableTensorView<float>();
template TensorView<double>* NDArrayView::GetWritableTensorView<double>();
template CNTK_API NDArrayView::NDArrayView(const NDShape& viewShape, const SparseIndexType* colStarts, const SparseIndexType* rowIndices, const float* nonZeroValues, size_t numNonZeroValues, const DeviceDescriptor& device, bool readOnly/* = false*/);
template CNTK_API NDArrayView::NDArrayView(const NDShape& viewShape, const SparseIndexType* colStarts, const SparseIndexType* rowIndices, const double* nonZeroValues, size_t numNonZeroValues, const DeviceDescriptor& device, bool readOnly/* = false*/);

Просмотреть файл

@ -17,15 +17,13 @@ namespace CNTK
static Matrix<char>* AllocateMatrix(const NDShape& viewShape, const DeviceDescriptor& device)
{
auto matrixDims = GetMatrixDimensions(viewShape);
auto maskMatrix = new Matrix<char>(matrixDims.first, matrixDims.second, AsCNTKImplDeviceId(device));
maskMatrix->SetValue(1);
return maskMatrix;
return new Matrix<char>(matrixDims.first, matrixDims.second, AsCNTKImplDeviceId(device));
}
NDMask::NDMask(const NDShape& shape, Matrix<char>* matrix)
: m_device(AsDeviceDescriptor(matrix->GetDeviceId())), m_maskShape(shape), m_matrixView(matrix)
: m_device(AsDeviceDescriptor(matrix->GetDeviceId())), m_maskShape(shape)
{
m_matrixView = std::shared_ptr<Matrix<char>>(matrix, [](Matrix<char>* ptr) { delete ptr; });
}
NDMask::NDMask(const NDShape& shape, const DeviceDescriptor& device/* = DeviceDescriptor::DefaultDevice()*/)
@ -33,16 +31,17 @@ namespace CNTK
{
if (shape.NumAxes() > 2)
LogicError("NDMask instances with more than 2 axes are currently unsupported");
Clear();
}
NDMask::~NDMask()
{
delete m_matrixView;
}
void NDMask::MaskSection(const std::vector<size_t>& sectionOffset, const NDShape& sectionShape)
{
// TODO: Implement batching of masking operation for masks residing on GPUs to avoid making
// TODO: Implement batching of masking operation for masks residing on GPUs to avoid making
// GPU invocations for each MaskSection call.
if (sectionOffset.size() > m_maskShape.NumAxes())
@ -78,12 +77,13 @@ namespace CNTK
void NDMask::Clear()
{
// Clear the mask by marking all samples as Valid; i.e. a value of 1
GetMatrix()->SetValue(1);
}
Matrix<char>* NDMask::GetMatrix() const
{
return m_matrixView;
return m_matrixView.get();
}
void NDMask::CopyFrom(const NDMask& source)
@ -96,14 +96,14 @@ namespace CNTK
NDMaskPtr NDMask::DeepClone() const
{
NDMaskPtr newMask = new NDMask(this->Shape(), this->Device());
NDMaskPtr newMask = MakeSharedObject<NDMask>(this->Shape(), this->Device());
newMask->CopyFrom(*this);
return NDMaskPtr(newMask, [](_ReferenceCounter* ptr) { delete ptr; });
return newMask;
}
NDMaskPtr NDMask::Alias() const
{
return NDMaskPtr(new NDMask(this->Shape(), new Matrix<char>(GetMatrix()->AsReference())), [](_ReferenceCounter* ptr) { delete ptr; });
return MakeSharedObject<NDMask>(this->Shape(), new Matrix<char>(GetMatrix()->AsReference()));
}
}

Просмотреть файл

@ -6,329 +6,138 @@
#include "stdafx.h"
#include "CNTKLibrary.h"
#include "Utils.h"
#include "File.h"
using namespace std;
namespace CNTK
{
namespace _Internal
template <typename T>
void DictionaryValue::AllocateDataPtr(const T& value)
{
#pragma region _SimpleVector
static_assert(is_same<T, NDShape>::value || is_same<T, vector<DictionaryValue>>::value, "AllocateDataPtr called with invalid type");
m_data.m_ptr = new T(value);
}
template <typename T>
_SimpleVector<T>::_SimpleVector()
: m_vector(new std::vector<T>())
{
}
template <typename T>
void DictionaryValue::FreePtrAsType()
{
T* typedPtr = reinterpret_cast<T*>(m_data.m_ptr);
delete typedPtr;
template <typename T>
_SimpleVector<T>::_SimpleVector(size_t numElements, const T& initVal/* = T()*/)
: m_vector(new std::vector<T>(numElements, initVal))
{
}
m_data.m_ptr = nullptr;
}
template <typename T>
_SimpleVector<T>::~_SimpleVector()
{
delete m_vector;
}
void DictionaryValue::FreeDataPtr()
{
if (m_valueType == Type::NDShape)
FreePtrAsType<NDShape>();
else if (m_valueType == Type::Vector)
FreePtrAsType<vector<DictionaryValue>>();
}
template <typename T>
_SimpleVector<T>::_SimpleVector(const _SimpleVector<T>& other)
: m_vector(new std::vector<T>(*other.m_vector))
{
}
Microsoft::MSR::CNTK::File& operator>>(Microsoft::MSR::CNTK::File& stream, DictionaryValue& us)
{
size_t version;
stream >> version;
template <typename T>
_SimpleVector<T>& _SimpleVector<T>::operator=(const _SimpleVector<T>& other)
stream >> us.m_valueType;
switch (us.ValueType())
{
if (this != &other)
case DictionaryValue::Type::Bool:
stream >> us.m_data.m_boolean;
break;
case DictionaryValue::Type::SizeT:
stream >> us.m_data.m_sizeT;
break;
case DictionaryValue::Type::Float:
stream >> us.m_data.m_float;
break;
case DictionaryValue::Type::Double:
stream >> us.m_data.m_double;
break;
case DictionaryValue::Type::NDShape:
{
size_t size;
stream >> size;
vector<size_t> dims(size);
for (auto i = 0; i < size; i++)
{
delete m_vector;
m_vector = new std::vector<T>(*other.m_vector);
stream >> dims[i];
}
return *this;
us.AllocateDataPtr(NDShape(dims));
break;
}
template <typename T>
_SimpleVector<T>::_SimpleVector(_SimpleVector<T>&& other)
: m_vector(nullptr)
case DictionaryValue::Type::Vector:
{
*this = std::move(other);
}
template <typename T>
_SimpleVector<T>& _SimpleVector<T>::operator=(_SimpleVector<T>&& other)
{
assert(this != &other);
delete m_vector;
m_vector = other.m_vector;
other.m_vector = nullptr;
return *this;
}
template <typename T>
T& _SimpleVector<T>::operator[](size_t idx)
{
assert(idx < Size());
return (*m_vector)[idx];
}
template <typename T>
const T& _SimpleVector<T>::operator[](size_t idx) const
{
assert(idx < Size());
return (*m_vector)[idx];
}
template <typename T>
size_t _SimpleVector<T>::Size() const
{
return m_vector->size();
}
template <typename T>
T* _SimpleVector<T>::Data()
{
return m_vector->data();
}
template <typename T>
const T* _SimpleVector<T>::Data() const
{
return m_vector->data();
}
template <typename T>
void _SimpleVector<T>::PushBack(const T& value)
{
m_vector->push_back(value);
}
template <typename T>
void _SimpleVector<T>::PushBack(T&& value)
{
m_vector->push_back(std::move(value));
}
template <typename ValueType>
bool operator==(const _SimpleVector<ValueType>& first, const _SimpleVector<ValueType>& second)
{
return *first.m_vector == *second.m_vector;
}
// Explicit template instantiations
template class _SimpleVector<Variable>;
template class _SimpleVector<size_t>;
template class _SimpleVector<Axis>;
template class _SimpleVector<FunctionPtr>;
template bool operator==(const _SimpleVector<size_t>& first, const _SimpleVector<size_t>& second);
#pragma endregion _SimpleVector
#pragma region _SimpleSet
template <typename KeyType>
_SimpleSet<KeyType>::_SimpleSet()
: m_set(new std::unordered_set<KeyType>())
{
}
template <typename KeyType>
_SimpleSet<KeyType>::~_SimpleSet()
{
delete m_set;
}
template <typename KeyType>
_SimpleSet<KeyType>::_SimpleSet(const _SimpleSet& other)
: m_set(nullptr)
{
*this = other;
}
template <typename KeyType>
_SimpleSet<KeyType>& _SimpleSet<KeyType>::operator=(const _SimpleSet& other)
{
if (this != &other)
size_t size;
stream >> size;
vector<DictionaryValue> values(size);
for (auto i = 0; i < size; i++)
{
delete m_set;
m_set = new std::unordered_set<KeyType>(*(other.m_set));
stream >> values[i];
}
return *this;
us.AllocateDataPtr(values);
break;
}
template <typename KeyType>
_SimpleSet<KeyType>::_SimpleSet(_SimpleSet&& other)
: m_set(nullptr)
{
*this = std::move(other);
default:
NOT_IMPLEMENTED;
}
return stream;
}
template <typename KeyType>
_SimpleSet<KeyType>& _SimpleSet<KeyType>::operator=(_SimpleSet&& other)
Microsoft::MSR::CNTK::File& operator<<(Microsoft::MSR::CNTK::File& stream, const DictionaryValue& us)
{
stream << us.version;
stream << us.ValueType();
switch (us.ValueType())
{
assert(this != &other);
delete m_set;
m_set = other.m_set;
other.m_set = nullptr;
return *this;
}
template <typename KeyType>
bool _SimpleSet<KeyType>::Insert(const KeyType& key)
case DictionaryValue::Type::Bool:
stream << us.m_data.m_boolean;
break;
case DictionaryValue::Type::SizeT:
stream << us.m_data.m_sizeT;
break;
case DictionaryValue::Type::Float:
stream << us.m_data.m_float;
break;
case DictionaryValue::Type::Double:
stream << us.m_data.m_double;
break;
case DictionaryValue::Type::NDShape:
{
return m_set->insert(key).second;
}
template <typename KeyType>
bool _SimpleSet<KeyType>::Contains(const KeyType& key) const
{
return (m_set->find(key) != m_set->end());
}
template <typename KeyType>
size_t _SimpleSet<KeyType>::Size() const
{
return m_set->size();
}
template <typename KeyType>
_SimpleSet<KeyType>::operator _SimpleVector<KeyType>() const
{
_SimpleVector<KeyType> retVector;
for (auto iter = m_set->begin(); iter != m_set->end(); ++iter)
retVector.PushBack(*iter);
return retVector;
}
template <typename KeyType>
bool operator==(const _SimpleSet<KeyType>& first, const _SimpleSet<KeyType>& second)
{
return *first.m_set == *second.m_set;
}
// Explicit template instantiations
template class _SimpleSet<FunctionPtr>;
template class _SimpleSet<Variable>;
template class _SimpleSet<Placeholder>;
template class _SimpleSet<const Function*>;
template bool operator==(const _SimpleSet<Variable>& first, const _SimpleSet<Variable>& second);
template bool operator==(const _SimpleSet<Placeholder>& first, const _SimpleSet<Placeholder>& second);
#pragma endregion _SimpleSet
#pragma region _SimpleMap
template <typename KeyType, typename ValueType>
_SimpleMap<KeyType, ValueType>::_SimpleMap()
: m_map(new std::unordered_map<KeyType, ValueType>())
{
}
template <typename KeyType, typename ValueType>
_SimpleMap<KeyType, ValueType>::~_SimpleMap()
{
delete m_map;
}
template <typename KeyType, typename ValueType>
_SimpleMap<KeyType, ValueType>::_SimpleMap(const _SimpleMap& other)
: m_map(nullptr)
{
*this = other;
}
template <typename KeyType, typename ValueType>
_SimpleMap<KeyType, ValueType>& _SimpleMap<KeyType, ValueType>::operator=(const _SimpleMap& other)
{
if (this != &other)
NDShape* shapePtr = reinterpret_cast<NDShape*>(us.m_data.m_ptr);
auto size = shapePtr->NumAxes();
stream << size;
for (auto i = 0; i < size; i++)
{
delete m_map;
m_map = new std::unordered_map<KeyType, ValueType>(*(other.m_map));
stream << shapePtr->operator[](i);
}
return *this;
break;
}
template <typename KeyType, typename ValueType>
_SimpleMap<KeyType, ValueType>::_SimpleMap(_SimpleMap&& other)
: m_map(nullptr)
case DictionaryValue::Type::Vector:
{
*this = std::move(other);
vector<DictionaryValue>* vectorPtr =
reinterpret_cast<vector<DictionaryValue>*>(us.m_data.m_ptr);
auto size = vectorPtr->size();
stream << size;
for (auto i = 0; i < size; i++)
{
stream << vectorPtr->operator[](i);
}
break;
}
template <typename KeyType, typename ValueType>
_SimpleMap<KeyType, ValueType>& _SimpleMap<KeyType, ValueType>::operator=(_SimpleMap&& other)
{
assert(this != &other);
delete m_map;
m_map = other.m_map;
other.m_map = nullptr;
return *this;
default:
NOT_IMPLEMENTED;
}
template <typename KeyType, typename ValueType>
ValueType& _SimpleMap<KeyType, ValueType>::operator[](const KeyType& key)
{
return (*m_map)[key];
}
template <typename KeyType, typename ValueType>
const ValueType& _SimpleMap<KeyType, ValueType>::operator[](const KeyType& key) const
{
return (*m_map)[key];
}
template <typename KeyType, typename ValueType>
bool _SimpleMap<KeyType, ValueType>::Insert(const KeyType& key, const ValueType& value)
{
return m_map->insert({ key, value }).second;
}
template <typename KeyType, typename ValueType>
bool _SimpleMap<KeyType, ValueType>::Contains(const KeyType& key) const
{
return (m_map->find(key) != m_map->end());
}
template <typename KeyType, typename ValueType>
size_t _SimpleMap<KeyType, ValueType>::Size() const
{
return m_map->size();
}
template <typename KeyType, typename ValueType>
_SimpleSet<KeyType> _SimpleMap<KeyType, ValueType>::Keys() const
{
_SimpleSet<KeyType> keys;
for (auto iter = m_map->begin(); iter != m_map->end(); ++iter)
keys.Insert(iter->first);
return keys;
}
// Explicit template instantiations
template class _SimpleMap<Variable, ValuePtr>;
template class _SimpleMap<Variable, const ValuePtr>;
template class _SimpleMap<Placeholder, Variable>;
#pragma endregion _SimpleMap
return stream;
}
Dictionary::Dictionary()
: m_dictionaryData(new std::unordered_map < std::wstring, DictionaryValue>)
: m_dictionaryData(new unordered_map <wstring, DictionaryValue>)
{
}
@ -340,7 +149,7 @@ namespace CNTK
Dictionary::Dictionary(Dictionary&& other)
: m_dictionaryData(nullptr)
{
*this = std::move(other);
*this = move(other);
}
Dictionary& Dictionary::operator=(Dictionary&& other)
@ -369,4 +178,130 @@ namespace CNTK
{
return (m_dictionaryData->find(key) != m_dictionaryData->end());
}
Microsoft::MSR::CNTK::File& operator<<(Microsoft::MSR::CNTK::File& stream, const Dictionary& us)
{
stream << us.version;
stream << us.m_dictionaryData->size();
for (auto it = us.m_dictionaryData->begin(); it != us.m_dictionaryData->end(); ++it)
{
stream << it->first;
stream << it->second;
}
return stream;
}
Microsoft::MSR::CNTK::File& operator>>(Microsoft::MSR::CNTK::File& stream, Dictionary& us)
{
size_t version;
stream >> version;
size_t size;
stream >> size;
us.m_dictionaryData->reserve(size);
for (auto i = 0; i < size; i++)
{
wstring key;
stream >> key;
DictionaryValue value;
stream >> value;
us.m_dictionaryData->insert(make_pair(key, value));
}
return stream;
}
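A sketch of how the Dictionary stream operators defined above would typically be used; the File constructor and FileOptions flags are assumed from the CNTK File/fileutil headers and are not part of this excerpt:
// Illustrative sketch only; not part of this commit.
void SaveAndReloadDictionary(const std::wstring& path)
{
    using namespace CNTK;
    using Microsoft::MSR::CNTK::File;
    using Microsoft::MSR::CNTK::FileOptions;   // assumed flag enum; see File.h/fileutil.h

    Dictionary config;
    config[L"learningRate"] = DictionaryValue(0.005);
    config[L"epochSize"] = DictionaryValue((size_t)1000);

    {
        File out(path, FileOptions::fileOptionsBinary | FileOptions::fileOptionsWrite);
        out << config;                         // writes version, size, then key/value pairs
    }

    Dictionary reloaded;
    File in(path, FileOptions::fileOptionsBinary | FileOptions::fileOptionsRead);
    in >> reloaded;                            // restores them via the operator>> above
}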
template <typename T>
vector<DictionaryValue> SerializeToVector(const NDArrayViewPtr& viewPtr)
{
if (viewPtr->IsSparse())
{
LogicError("Sparse NDArrayView cannot be serialized into a vector.");
}
auto numElements = viewPtr->Shape().TotalSize();
vector<DictionaryValue> values(numElements);
NDArrayViewPtr cpuDataViewPtr = viewPtr;
if ((viewPtr->Device().Type() != DeviceKind::CPU))
{
cpuDataViewPtr = MakeSharedObject<NDArrayView>(viewPtr->GetDataType(), viewPtr->Shape(), DeviceDescriptor::CPUDevice());
cpuDataViewPtr->CopyFrom(*viewPtr);
}
const T* buffer = cpuDataViewPtr->DataBuffer<T>();
for (auto i = 0; i < numElements; ++i)
{
T v = buffer[i];
values[i] = DictionaryValue(v);
}
return values;
}
template <typename T>
void DeserializeFromVector(const NDArrayViewPtr& viewPtr, const vector<DictionaryValue>& values)
{
if (viewPtr->IsSparse())
{
LogicError("Sparse NDArrayView cannot be deserialized from a vector.");
}
auto numElements = viewPtr->Shape().TotalSize();
if (values.size() != numElements)
{
LogicError("Number of elements (%lu) in the deserialized representation does not match the expected value (%lu)",
values.size(), numElements);
}
NDArrayViewPtr cpuDataViewPtr = viewPtr;
if ((viewPtr->Device().Type() != DeviceKind::CPU))
{
cpuDataViewPtr = MakeSharedObject<NDArrayView>(viewPtr->GetDataType(), viewPtr->Shape(), DeviceDescriptor::CPUDevice());
}
T* buffer = cpuDataViewPtr->WritableDataBuffer<T>();
for (auto i = 0; i < numElements; ++i)
{
buffer[i] = values[i].GetValue<T>();
}
if ((viewPtr->Device().Type() != DeviceKind::CPU))
{
viewPtr->CopyFrom(*cpuDataViewPtr);
}
}
// TODO: we store the type info for every element in the vector, which is extremely redundant.
// Instead, it'd be nice to introduce some sort of DictionaryValueVector.
vector<DictionaryValue> SerializeToVector(const NDArrayViewPtr& viewPtr)
{
switch (viewPtr->GetDataType())
{
case DataType::Float:
return SerializeToVector<float>(viewPtr);
case DataType::Double:
return SerializeToVector<double>(viewPtr);
default:
LogicError("Unsupported DataType %s", DataTypeName(viewPtr->GetDataType()));
}
}
void DeserializeFromVector(const NDArrayViewPtr& viewPtr, const vector<DictionaryValue>& values)
{
switch (viewPtr->GetDataType())
{
case DataType::Float:
DeserializeFromVector<float>(viewPtr, values);
break;
case DataType::Double:
DeserializeFromVector<double>(viewPtr, values);
break;
default:
LogicError("Unsupported DataType %s", DataTypeName(viewPtr->GetDataType()));
}
}
template void DictionaryValue::AllocateDataPtr<NDShape>(const NDShape& value);
template void DictionaryValue::AllocateDataPtr<vector<DictionaryValue>>(const vector<DictionaryValue>& value);
}
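The helpers above are the basis for checkpointing NDArrayView contents as plain DictionaryValues. A sketch of the intended round trip, reusing only names visible in this commit:
// Illustrative sketch only; not part of this commit.
void RoundTripParameter(const CNTK::NDArrayViewPtr& parameterValue)
{
    using namespace CNTK;
    // Flatten the (dense) view into one DictionaryValue per element.
    std::vector<DictionaryValue> serialized = SerializeToVector(parameterValue);

    // Later: restore into a freshly allocated view of the same shape and element type.
    NDArrayViewPtr restored = MakeSharedObject<NDArrayView>(parameterValue->GetDataType(),
                                                            parameterValue->Shape(),
                                                            DeviceDescriptor::CPUDevice());
    DeserializeFromVector(restored, serialized);
}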

Просмотреть файл

@ -15,244 +15,6 @@ namespace CNTK
// Forward declarations
class Dictionary;
class DictionaryValue
{
public:
enum class Type : unsigned int
{
None,
Bool,
SizeT,
Double,
NDShape,
Vector
};
static const char* TypeName(Type type)
{
if (type == Type::None)
return "None";
else if (type == Type::Bool)
return "Bool";
else if (type == Type::SizeT)
return "SizeT";
else if (type == Type::Double)
return "Double";
else if (type == Type::NDShape)
return "NDShape";
else if (type == Type::Vector)
return "Vector";
else
LogicError("Unknown DictionaryValue::Type");
}
public:
DictionaryValue()
: m_valueType(Type::None)
{
}
DictionaryValue(bool value)
: m_valueType(GetValueType<bool>())
{
m_data.m_boolean = value;
}
DictionaryValue(size_t value)
: m_valueType(GetValueType<size_t>())
{
m_data.m_sizeT = value;
}
DictionaryValue(double value)
: m_valueType(GetValueType<double>())
{
m_data.m_double = value;
}
template <typename T>
DictionaryValue(const T& value)
: m_valueType(GetValueType<T>())
{
static_assert(std::is_same<T, NDShape>::value ||
std::is_same<T, _Internal::_SimpleVector<DictionaryValue>>::value,
"Unsupported ValueType");
AllocateDataPtr(value);
}
DictionaryValue(const DictionaryValue& other)
: m_valueType(Type::Bool)
{
*this = other;
}
DictionaryValue& operator=(const DictionaryValue& other)
{
if (this != &other)
{
FreeDataPtr();
m_valueType = other.m_valueType;
m_data = other.m_data;
if (other.m_valueType == Type::NDShape)
AllocateDataPtr(other.GetValue<NDShape>());
else if (other.m_valueType == Type::Vector)
AllocateDataPtr(other.GetValue<_Internal::_SimpleVector<DictionaryValue>>());
}
return *this;
}
~DictionaryValue()
{
FreeDataPtr();
}
template <typename T, typename std::enable_if<std::is_same<T, bool>::value>::type* = nullptr>
const T& GetValue() const
{
VerifyType<T>();
return m_data.m_boolean;
}
template <typename T, typename std::enable_if<std::is_same<T, size_t>::value>::type* = nullptr>
const T& GetValue() const
{
VerifyType<T>();
return m_data.m_sizeT;
}
template <typename T, typename std::enable_if<std::is_same<T, double>::value>::type* = nullptr>
const T& GetValue() const
{
VerifyType<T>();
return m_data.m_double;
}
template <typename T, typename std::enable_if<std::is_same<T, NDShape>::value || std::is_same<T, _Internal::_SimpleVector<DictionaryValue>>::value>::type* = nullptr>
const T& GetValue() const
{
VerifyType<T>();
return *(reinterpret_cast<T*>(m_data.m_ptr));
}
bool HasValue() const
{
return m_valueType != Type::None;
}
Type ValueType() const
{
return m_valueType;
}
private:
template <typename T>
static Type GetValueType()
{
static_assert(std::is_same<T, bool>::value ||
std::is_same<T, size_t>::value ||
std::is_same<T, double>::value ||
std::is_same<T, NDShape>::value ||
std::is_same<T, _Internal::_SimpleVector<DictionaryValue>>::value ||
std::is_same<T, CNTK::Dictionary>::value,
"Unsupported ValueType");
if (std::is_same<T, bool>::value)
return Type::Bool;
else if (std::is_same<T, size_t>::value)
return Type::SizeT;
else if (std::is_same<T, double>::value)
return Type::Double;
else if (std::is_same<T, NDShape>::value)
return Type::NDShape;
else if (std::is_same<T, _Internal::_SimpleVector<DictionaryValue>>::value)
return Type::Vector;
}
template <typename T>
void VerifyType() const
{
if (GetValueType<T>() != m_valueType)
RuntimeError("Reading a DictionaryValue as the wrong type; Reading as type %s when actual type is %s", typeid(T).name(), DictionaryValue::TypeName(m_valueType));
}
template <typename T>
void AllocateDataPtr(const T& value)
{
static_assert(std::is_same<T, NDShape>::value || std::is_same<T, _Internal::_SimpleVector<DictionaryValue>>::value, "AllocateDataPtr called with invalid type");
m_data.m_ptr = new T(value);
}
template <typename T>
void FreePtrAsType()
{
T* typedPtr = reinterpret_cast<T*>(m_data.m_ptr);
delete typedPtr;
m_data.m_ptr = nullptr;
}
void FreeDataPtr()
{
if (m_valueType == Type::NDShape)
FreePtrAsType<NDShape>();
else if (m_valueType == Type::Vector)
FreePtrAsType<_Internal::_SimpleVector<DictionaryValue>>();
}
private:
Type m_valueType;
union ValueData
{
bool m_boolean;
size_t m_sizeT;
double m_double;
void* m_ptr;
} m_data;
};
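The class above stores small values directly in a union and heap-allocates only the non-trivial payloads (NDShape, the value vector) through AllocateDataPtr/FreePtrAsType. A stripped-down, standalone illustration of that pattern (std::string stands in for the heap-allocated payload types; this is not CNTK code):
#include <string>

class TaggedValue
{
public:
    enum class Type { None, Double, String };

    TaggedValue() : m_type(Type::None) {}
    TaggedValue(double d) : m_type(Type::Double) { m_data.m_double = d; }
    TaggedValue(const std::string& s) : m_type(Type::String) { m_data.m_ptr = new std::string(s); }
    ~TaggedValue() { Free(); }

    // Copying is omitted for brevity; DictionaryValue implements it by re-running AllocateDataPtr.
    TaggedValue(const TaggedValue&) = delete;
    TaggedValue& operator=(const TaggedValue&) = delete;

    double AsDouble() const { return m_data.m_double; }
    const std::string& AsString() const { return *static_cast<std::string*>(m_data.m_ptr); }

private:
    void Free()                                   // mirrors FreeDataPtr/FreePtrAsType<T>
    {
        if (m_type == Type::String)
            delete static_cast<std::string*>(m_data.m_ptr);
        m_data.m_ptr = nullptr;
        m_type = Type::None;
    }

    Type m_type;
    union { double m_double; void* m_ptr; } m_data;
};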
class Dictionary
{
public:
Dictionary();
~Dictionary();
// Disallow copy contruction and assignment
Dictionary(const Dictionary&) = delete;
Dictionary& operator=(const Dictionary&) = delete;
Dictionary(Dictionary&& other);
Dictionary& operator=(Dictionary&& other);
DictionaryValue& operator[](const std::wstring& key)
{
return operator[](key.c_str());
}
DictionaryValue& operator[](const wchar_t* key);
DictionaryValue operator[](const std::wstring& key) const
{
return operator[](key.c_str());
}
DictionaryValue operator[](const wchar_t* key) const;
bool Contains(const std::wstring& key) const
{
return Contains(key.c_str());
}
bool Contains(const wchar_t* key) const;
private:
std::unordered_map<std::wstring, DictionaryValue>* m_dictionaryData;
};
// Helper to get the size of an element of the specified DataType
inline size_t ElementSize(DataType dataType)
{
@ -266,15 +28,15 @@ namespace CNTK
inline DEVICEID_TYPE AsCNTKImplDeviceId(const DeviceDescriptor& device)
{
if (device.Type() == DeviceType::CPU)
if (device.Type() == DeviceKind::CPU)
return -1;
else if (device.Type() == DeviceType::GPU)
else if (device.Type() == DeviceKind::GPU)
return device.Id();
else
NOT_IMPLEMENTED;
}
inline Microsoft::MSR::CNTK::MatrixFormat AsCNTKMatrixFormat(StorageFormat storageFormat)
inline Microsoft::MSR::CNTK::MatrixFormat AsCNTKImplMatrixFormat(StorageFormat storageFormat)
{
if (storageFormat == StorageFormat::Dense)
return Microsoft::MSR::CNTK::MatrixFormat::matrixFormatDense;
@ -357,4 +119,13 @@ namespace CNTK
return{ matrixRowSize, matrixColSize };
}
inline bool IsSparseInput(const Variable& var)
{
return var.IsInput() && var.IsSparse();
}
std::vector<DictionaryValue> SerializeToVector(const NDArrayViewPtr& viewPtr);
void DeserializeFromVector(const NDArrayViewPtr& viewPtr, const std::vector<DictionaryValue>& values);
}

Просмотреть файл

@ -15,20 +15,21 @@ namespace CNTK
Value::Value(const NDArrayViewPtr& data, const NDMaskPtr& mask)
: m_data(data), m_mask(mask)
{
if ((mask != nullptr) && (mask->Shape().NumAxes() > data->Shape().NumAxes()))
InvalidArgument("The number of axes of the mask of a Value object cannot exceed the number of axes of the data NDArrayView object");
if (mask != nullptr)
{
auto dataShape = data->Shape();
auto maskShape = mask->Shape();
if (maskShape.NumAxes() > dataShape.NumAxes())
InvalidArgument("The number of axes (%d) of the mask of a Value object cannot exceed the number of axes (%d) of the data NDArrayView object", (int)maskShape.NumAxes(), (int)dataShape.NumAxes());
if (dataShape.SubShape(dataShape.NumAxes() - maskShape.NumAxes()) != maskShape)
InvalidArgument("Invalid Value object; the data and mask are incompatible. The trailing dimensions of the data do not match the dimensions of the mask");
InvalidArgument("Invalid Value object; the data and mask are incompatible. The trailing dimensions of the data (%S) do not match the dimensions of the mask (%S)", dataShape.AsString().c_str(), maskShape.AsString().c_str());
}
}
template <typename T>
static NDMaskPtr CreateMask(size_t sampleSize, const std::vector<std::vector<T>>& sequences, const DeviceDescriptor& device)
static NDMaskPtr CreateMask(size_t numElementsPerSample, const std::vector<std::vector<T>>& sequences, const DeviceDescriptor& device)
{
size_t numSequences = sequences.size();
std::vector<size_t> sequenceLengths(numSequences);
@ -36,7 +37,7 @@ namespace CNTK
bool needsMask = false;
for (size_t i = 0; i < numSequences; ++i)
{
sequenceLengths[i] = sequences[i].size() / sampleSize;
sequenceLengths[i] = sequences[i].size() / numElementsPerSample;
if (maxSequenceLength < sequenceLengths[i])
maxSequenceLength = sequenceLengths[i];
@ -45,11 +46,12 @@ namespace CNTK
needsMask = true;
}
// If needed, create a mask to account for variability in lengths of specified sequences
NDMaskPtr deviceValueMask;
if (needsMask)
{
NDShape valueMaskShape = { maxSequenceLength, numSequences };
deviceValueMask = NDMaskPtr(new NDMask(valueMaskShape, device), [](_Internal::_ReferenceCounter* ptr) {delete ptr; });
deviceValueMask = MakeSharedObject<NDMask>(valueMaskShape, device);
for (size_t i = 0; i < numSequences; ++i)
deviceValueMask->MaskSection({ sequenceLengths[i], i }, { NDShape::InferredDimension, 1 });
}
@ -86,23 +88,23 @@ namespace CNTK
}
colStarts[numSequences * maxSequenceLength] = (SparseIndexType)(nonZeroValues.size());
NDArrayViewPtr deviceValueData(new NDArrayView(valueDataShape, colStarts.data(), rowIndices.data(), nonZeroValues.data(), nonZeroValues.size(), device, readOnly), [](_ReferenceCounter* ptr) { delete ptr; });
return ValuePtr(new Value(deviceValueData, deviceValueMask), [](_ReferenceCounter* ptr) { delete ptr; });
NDArrayViewPtr deviceValueData = MakeSharedObject<NDArrayView>(valueDataShape, colStarts.data(), rowIndices.data(), nonZeroValues.data(), nonZeroValues.size(), device, readOnly);
return MakeSharedObject<Value>(deviceValueData, deviceValueMask);
}
template <typename ElementType>
/*static*/ ValuePtr Value::Create(const NDShape& sampleShape, const std::vector<std::vector<ElementType>>& sequences, const DeviceDescriptor& device, bool readOnly/* = false*/)
{
size_t sampleSize = sampleShape.TotalSize();
NDMaskPtr deviceValueMask = CreateMask(sampleSize, sequences, device);
size_t numElementsPerSample = sampleShape.TotalSize();
NDMaskPtr deviceValueMask = CreateMask(numElementsPerSample, sequences, device);
size_t maxSequenceLength = (deviceValueMask == nullptr) ? sequences[0].size() : deviceValueMask->Shape()[0];
size_t numSequences = sequences.size();
NDShape valueDataShape = sampleShape.AppendShape({ maxSequenceLength, numSequences });
NDArrayViewPtr valueData(new NDArrayView(AsDataType<ElementType>(), valueDataShape, DeviceDescriptor::CPUDevice()), [](_ReferenceCounter* ptr) { delete ptr; });
NDArrayViewPtr valueData = MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), valueDataShape, DeviceDescriptor::CPUDevice());
ElementType* dataBuffer = valueData->WritableDataBuffer<ElementType>();
for (size_t i = 0; i < numSequences; ++i)
std::copy(sequences[i].data(), sequences[i].data() + sequences[i].size(), dataBuffer + (maxSequenceLength * i * sampleSize));
std::copy(sequences[i].data(), sequences[i].data() + sequences[i].size(), dataBuffer + (maxSequenceLength * i * numElementsPerSample));
NDArrayViewPtr deviceValueData;
if (device == DeviceDescriptor::CPUDevice())
@ -114,13 +116,13 @@ namespace CNTK
}
else
{
deviceValueData = NDArrayViewPtr(new NDArrayView(AsDataType<ElementType>(), valueDataShape, device), [](_ReferenceCounter* ptr) { delete ptr; });
deviceValueData = MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), valueDataShape, device);
deviceValueData->CopyFrom(*valueData);
if (readOnly)
deviceValueData = deviceValueData->Alias(true);
}
return ValuePtr(new Value(deviceValueData, deviceValueMask), [](_ReferenceCounter* ptr) { delete ptr; });
return MakeSharedObject<Value>(deviceValueData, deviceValueMask);
}
/*virtual*/ Value::~Value()
@ -142,13 +144,13 @@ namespace CNTK
/*virtual*/ ValuePtr Value::DeepClone(bool readOnly/* = false*/) const
{
// TODO: Check if this is a derived type and throw an exception in that case
return ValuePtr(new Value(Data()->DeepClone(readOnly), (Mask() != nullptr) ? Mask()->DeepClone() : nullptr), [](_ReferenceCounter* ptr) { delete ptr; });
return MakeSharedObject<Value>(Data()->DeepClone(readOnly), (Mask() != nullptr) ? Mask()->DeepClone() : nullptr);
}
/*virtual*/ ValuePtr Value::Alias(bool readOnly/* = false*/) const
{
// TODO: Check if this is a derived type and throw an exception in that case
return ValuePtr(new Value(Data()->Alias(readOnly), (Mask() != nullptr) ? Mask()->Alias() : nullptr), [](_ReferenceCounter* ptr) { delete ptr; });
return MakeSharedObject<Value>(Data()->Alias(readOnly), (Mask() != nullptr) ? Mask()->Alias() : nullptr);
}
/*virtual*/ void Value::CopyFrom(const Value& source)
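Taken together, the Value.cpp changes above cover the dense Create path: each sequence is copied into a [sampleShape x maxSequenceLength x numSequences] NDArrayView, and shorter sequences are padded and masked. A sketch of a call site, using only the overload shown in this diff:
// Illustrative sketch only; not part of this commit.
CNTK::ValuePtr BuildBatchedValue(const CNTK::DeviceDescriptor& device)
{
    using namespace CNTK;
    NDShape sampleShape = { 2 };                   // two elements per sample
    std::vector<std::vector<float>> sequences = {
        { 1, 2, 3, 4, 5, 6 },                      // 3 samples
        { 7, 8 }                                   // 1 sample; the 2 trailing steps get masked
    };
    // Resulting data shape: [2 x 3 x 2]; an NDMask of shape [3 x 2] marks the padding as invalid.
    return Value::Create(sampleShape, sequences, device, /*readOnly =*/ true);
}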

Просмотреть файл

@ -11,4 +11,9 @@ namespace CNTK
: Variable(function->Output())
{
}
FunctionPtr Variable::Owner() const
{
return m_dataFields->m_ownerFunction->shared_from_this();
}
}

Просмотреть файл

@ -106,22 +106,18 @@ public:
~BestGpu();
void Init();
void SetAllowedDevices(const std::vector<int>& devices); // only allow certain GPUs
bool DeviceAllowed(int device);
void DisallowDevice(int device)
{
assert((device >= -1) && (device <= 31));
if (device < 0)
m_disallowCPUDevice = true;
else
m_allowedDevices &= ~(1 << device);
}
bool DeviceAllowed(int deviceId);
void DisallowUnsupportedDevices();
void DisallowDevice(int deviceId);
void AllowAll(); // reset to allow all GPUs (no allowed list)
bool UseMultiple(); // using multiple GPUs?
int GetDevice(BestGpuFlags flags = bestGpuNormal); // get a single device
static const int AllDevices = -1; // can be used to specify all GPUs in GetDevices() call
static const int RequeryDevices = -2; // Requery refreshing statistics and picking the same number as last query
static const int MininumCCMajorForGpu = 3; // CNTK supports GPUs with Compute Capability >= 3.0
std::vector<int> GetDevices(int number = AllDevices, BestGpuFlags flags = bestGpuNormal); // get multiple devices
std::vector<ProcessorData *> GetProcessorData();
private:
bool LockDevice(int deviceId, bool trial = true);
};
@ -156,6 +152,8 @@ static DEVICEID_TYPE SelectDevice(DEVICEID_TYPE deviceId, bool bLockGPU, const i
{
g_bestGpu->DisallowDevice(excludedDevices[i]);
}
g_bestGpu->DisallowUnsupportedDevices();
}
bestDeviceId = (DEVICEID_TYPE)g_bestGpu->GetDevice(BestGpuFlags(bLockGPU ? (bestGpuAvoidSharing | bestGpuExclusiveLock) : bestGpuAvoidSharing));
@ -345,22 +343,32 @@ int BestGpu::GetDevice(BestGpuFlags bestFlags)
void BestGpu::SetAllowedDevices(const std::vector<int>& devices)
{
m_allowedDevices = 0;
for (int device : devices)
for (int deviceId : devices)
{
m_allowedDevices |= (1 << device);
m_allowedDevices |= (1 << deviceId);
}
}
// DeviceAllowed - is a particular device allowed?
// returns: true if the device is allowed, otherwise false
bool BestGpu::DeviceAllowed(int device)
bool BestGpu::DeviceAllowed(int deviceId)
{
assert((device >= -1) && (device <= 31));
assert((deviceId >= -1) && (deviceId <= 31));
if (device < 0)
if (deviceId < 0)
return !m_disallowCPUDevice;
else
return !!(m_allowedDevices & (1 << device));
return !!(m_allowedDevices & (1 << deviceId));
}
void BestGpu::DisallowDevice(int deviceId)
{
assert((deviceId >= -1) && (deviceId <= 31));
if (deviceId < 0)
m_disallowCPUDevice = true;
else
m_allowedDevices &= ~(1 << deviceId);
}
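DeviceAllowed/DisallowDevice above keep the per-GPU allow list as one bit per device id (0..31) plus a separate flag for the CPU (-1). A standalone illustration of that bookkeeping (not CNTK code):
#include <cassert>
#include <cstdint>

struct DeviceFilter
{
    uint32_t m_allowed = ~0u;      // bit i set => GPU id i allowed; all allowed initially
    bool m_disallowCPU = false;    // the CPU is tracked separately under id -1

    void Disallow(int deviceId)
    {
        assert((deviceId >= -1) && (deviceId <= 31));
        if (deviceId < 0)
            m_disallowCPU = true;
        else
            m_allowed &= ~(1u << deviceId);            // clear that device's bit
    }

    bool Allowed(int deviceId) const
    {
        assert((deviceId >= -1) && (deviceId <= 31));
        return (deviceId < 0) ? !m_disallowCPU
                              : ((m_allowed & (1u << deviceId)) != 0);
    }
};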
// AllowAll - Reset the allowed filter to allow all GPUs
@ -527,6 +535,68 @@ std::vector<int> BestGpu::GetDevices(int number, BestGpuFlags p_bestFlags)
return best; // return the array of the best GPUs
}
// Disallow devices which don't comply with the compute capability restriction when CNTK runs with deviceId = 'auto'
void BestGpu::DisallowUnsupportedDevices()
{
for (auto pd : m_procData)
{
if (pd->deviceProp.major < BestGpu::MininumCCMajorForGpu)
{
DisallowDevice(pd->deviceId);
}
}
}
GpuData GetGpuData(DEVICEID_TYPE deviceId)
{
std::vector<GpuData> gpusData = GetAllGpusData();
auto it = std::find_if(gpusData.begin(), gpusData.end(), [&deviceId](const GpuData& gpu){return gpu.deviceId == deviceId;});
if (it != gpusData.end())
{
return *it;
}
return GpuData(0, 0, deviceId, 0, GpuValidity::UnknownDevice, "", 0);
}
// populate a vector with data (id, major/minor version, cuda cores, name and memory) for each gpu device in the machine
std::vector<GpuData> GetAllGpusData()
{
std::vector<GpuData> data;
auto bestGpu = make_unique<BestGpu>();
std::vector<ProcessorData*> processorData = bestGpu->GetProcessorData();
for (ProcessorData* pd : processorData)
{
GpuValidity validity = GpuValidity::UnknownDevice;
if (pd->deviceProp.major < BestGpu::MininumCCMajorForGpu)
{
validity = GpuValidity::ComputeCapabilityNotSupported;
}
else
{
validity = GpuValidity::Valid;
}
size_t totalMemory = pd->deviceProp.totalGlobalMem/(1024*1024); //From bytes to MBytes
GpuData gpuData = GpuData(pd->deviceProp.major, pd->deviceProp.minor, pd->deviceId, pd->cores, validity, string(pd->deviceProp.name), totalMemory);
data.push_back(gpuData);
}
return data;
}
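A sketch of how the new query functions might be consumed; the GpuData fields come from the header change later in this commit, and <cstdio> is assumed to be available:
// Illustrative sketch only; not part of this commit.
void PrintGpuSummary(DEVICEID_TYPE deviceId)
{
    GpuData gpu = GetGpuData(deviceId);
    if (gpu.validity == GpuValidity::Valid)
        fprintf(stderr, "GPU[%d] %s: compute capability %d.%d, %d CUDA cores, %lu MB\n",
                gpu.deviceId, gpu.name.c_str(), gpu.versionMajor, gpu.versionMinor,
                gpu.cudaCores, (unsigned long)gpu.totalMemory);
    else if (gpu.validity == GpuValidity::ComputeCapabilityNotSupported)
        fprintf(stderr, "GPU[%d] has a compute capability below the supported minimum\n", gpu.deviceId);
    else
        fprintf(stderr, "Device id %d is not a known GPU\n", deviceId);
}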
std::vector<ProcessorData*> BestGpu::GetProcessorData()
{
return m_procData;
}
// QueryNvmlData - Query data from the NVIDIA Management Library and accumulate counters.
// In case of failure, this function simply backs out without filling in the data structure and without setting m_nvmlData.
void BestGpu::QueryNvmlData()

Просмотреть файл

@ -70,14 +70,14 @@ void Eval<ElemType>::GetEvalClass(const std::string& config)
}
// create a variable of each type just to call the proper templated version
ElemType elemType = ElemType();
GetEvalProc getEvalProc = (GetEvalProc) Plugin::Load(module, GetEvalName(elemType));
GetEvalProc getEvalProc = (GetEvalProc) m_plugin->Load(module, GetEvalName(elemType));
getEvalProc(&m_eval);
}
// Eval Constructor
// options - [in] string of options (i.e. "-windowsize:11 -addenergy") data reader specific
template <class ElemType>
Eval<ElemType>::Eval(const std::string& config)
Eval<ElemType>::Eval(const std::string& config) : m_plugin(make_shared<Plugin>())
{
GetEvalClass(config);
m_eval->Init(config);

Просмотреть файл

@ -8,15 +8,46 @@
// #define CPUONLY // #define this to build without GPU support nor needing the SDK installed
#include "CommonMatrix.h"
#include <vector>
// define IConfigRecord and ConfigParameters as incomplete types, in order to avoid having to include "ScriptableObjects.h" and "Config.h", as that confuses some .CU code
namespace Microsoft { namespace MSR { namespace ScriptableObjects { struct IConfigRecord; }}}
namespace Microsoft { namespace MSR { namespace CNTK {
using namespace std;
#ifndef CPUONLY
enum class GpuValidity
{
Valid,
UnknownDevice,
ComputeCapabilityNotSupported
};
struct GpuData
{
int versionMajor;
int versionMinor;
int deviceId;
int cudaCores;
GpuValidity validity;
string name;
size_t totalMemory;
GpuData(int versionMajor, int versionMinor, int deviceId, int cudaCores, GpuValidity validity, const string& name, size_t totalMemory)
:versionMajor(versionMajor), versionMinor(versionMinor), deviceId(deviceId), cudaCores(cudaCores), validity(validity), name(name), totalMemory(totalMemory)
{
}
};
std::vector<GpuData> GetAllGpusData();
GpuData GetGpuData(DEVICEID_TYPE deviceId);
class ConfigParameters;
DEVICEID_TYPE DeviceFromConfig(const ConfigParameters& config);
DEVICEID_TYPE DeviceFromConfig(const ScriptableObjects::IConfigRecord& config);
#else
template <class ConfigRecordType>
static inline DEVICEID_TYPE DeviceFromConfig(const ConfigRecordType& /*config*/)

Просмотреть файл

@ -25,8 +25,7 @@
#include <map>
#include <vector>
#include <string>
#include "Basics.h"
#include <memory>
namespace Microsoft { namespace MSR { namespace CNTK {
@ -110,12 +109,14 @@ void EVAL_API GetEval(IEvaluateModel<ElemType>** peval);
extern "C" EVAL_API void GetEvalF(IEvaluateModel<float>** peval);
extern "C" EVAL_API void GetEvalD(IEvaluateModel<double>** peval);
class Plugin;
template <typename ElemType>
class Eval : public IEvaluateModel<ElemType>, protected Plugin
class Eval : public IEvaluateModel<ElemType>
{
private:
IEvaluateModel<ElemType>* m_eval; // evaluation class pointer
std::shared_ptr<Plugin> m_plugin;
void GetEvalClass(const std::string& config);
@ -225,7 +226,8 @@ struct VectorRef
size_t m_size; // ElemTypes used.
VectorRef() : m_vector(nullptr), m_capacity(0), m_size(0) {}
void InitFrom(std::vector<ElemType>& src) { m_vector = src.data(); m_capacity = src.capacity(); m_size = src.size(); }
void InitFrom(std::vector<ElemType>& src) { InitFrom(src.data(), src.capacity(), src.size()); }
void InitFrom(ElemType* data, size_t capacity, size_t size) { m_vector = data; m_capacity = capacity; m_size = size; }
size_t size() const { return m_size; }
size_t capacity() const { return m_capacity; }
ElemType* data() { return m_vector; }
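The second InitFrom overload added above lets callers wrap an externally owned buffer (for example memory pinned by the managed wrapper) without going through a std::vector. A small usage sketch; the buffer arguments are hypothetical and VectorRef's ElemType template parameter is assumed from the surrounding declarations:
// Illustrative sketch only; not part of this commit.
void WrapBuffers(std::vector<float>& owned, float* external, size_t externalSize)
{
    VectorRef<float> fromVector;
    fromVector.InitFrom(owned);                                     // borrows data/capacity/size from the vector

    VectorRef<float> fromRawBuffer;
    fromRawBuffer.InitFrom(external, externalSize, externalSize);   // new overload: wrap a raw buffer directly
}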
@ -280,7 +282,7 @@ class VariableSchema : public std::vector<VariableLayout>
Values<ElemType> CreateBuffers(const std::vector<size_t>& maxLengths)
{
if (maxLengths.size() != size())
throw std::exception("Expected max lengths for all variables.");
throw std::runtime_error("Expected max lengths for all variables.");
Values<ElemType> buffers(size());
for (size_t i = 0; i < size(); ++i)

Просмотреть файл

@ -128,5 +128,11 @@ public:
{
return currentseed;
}
bool IsRandomizationDisabled() const
{
return randomizationrange == randomizeDisable;
}
};
} } }
}}}

Просмотреть файл

@ -29,7 +29,8 @@ public:
runtime_error(msg)
{
}
virtual void PrintError(const std::wstring& linePrefix) const = 0;
virtual std::wstring GetError(const std::wstring& /*linePrefix*/) const = 0;
virtual void PrintError(const std::wstring& /*linePrefix*/) const = 0;
};
// -----------------------------------------------------------------------

Просмотреть файл

@ -17,6 +17,11 @@ inline bool AreEqualIgnoreCase(
const std::basic_string<TElement, char_traits<TElement>, allocator<TElement>>& s1,
const std::basic_string<TElement, char_traits<TElement>, allocator<TElement> >& s2)
{
if (s1.size() != s2.size())
{
return false;
}
return std::equal(s1.begin(), s1.end(), s2.begin(), [](const TElement& a, const TElement& b)
{
return std::tolower(a) == std::tolower(b);

Просмотреть файл

@ -665,7 +665,8 @@ public:
std::swap(m_strides[i], m_strides[j]);
}
// Flatten the shape in place to a 2D tensor.
// Flatten a tensor shape into a 2D tensor, where splitPoint is the first index to go into the second dimension
// The tensor shape must be flattenable this way, i.e. each of the two index ranges must be dense.
void FlattenTo2DInPlace(size_t splitPoint, const char* errorPrefix/* = nullptr*/)
{
// check & print meaningful error message

Просмотреть файл

@ -411,7 +411,7 @@ static inline void byteswap(V &v) throw()
// execute a block with retry
// Block must be restartable.
// Use this when writing small files to those unreliable Windows servers.
// Use this when writing/reading small files to those unreliable Windows servers.
// TODO: This will fail to compile under VS 2008--we need an #ifdef around this
template <typename FUNCTION>
static void attempt(int retries, const FUNCTION &body)

Просмотреть файл

@ -30,6 +30,7 @@
#include <assert.h>
#include <string.h> // for strerror()
#include <stdexcept> // for exception
#include <fcntl.h>
// ----------------------------------------------------------------------------
// fopenOrDie(): like fopen() but terminate with err msg in case of error.
@ -591,7 +592,8 @@ void fgetfile(const std::wstring& pathname, std::vector<char>& buffer);
void fgetfile(FILE* f, std::vector<char>& buffer);
namespace msra { namespace files {
void fgetfilelines(const std::wstring& pathname, std::vector<char>& readbuffer, std::vector<std::string>& lines);
void fgetfilelines(const std::wstring& pathname, std::vector<char>& readbuffer, std::vector<std::string>& lines, int numberOfTries = 1);
static inline std::vector<std::string> fgetfilelines(const std::wstring& pathname)
{
std::vector<char> buffer;
@ -599,7 +601,7 @@ static inline std::vector<std::string> fgetfilelines(const std::wstring& pathnam
fgetfilelines(pathname, buffer, lines);
return lines;
}
std::vector<char*> fgetfilelines(const std::wstring& pathname, std::vector<char>& readbuffer);
std::vector<char*> fgetfilelines(const std::wstring& pathname, std::vector<char>& readbuffer, int numberOfTries = 1);
}}
@ -698,8 +700,18 @@ class auto_file_ptr
{
if (f && f != stdin && f != stdout && f != stderr)
{
bool readMode = false;
#ifdef _WIN32
if ((f->_flag&_IOREAD) == _IOREAD)
readMode = true;
#else
int mode = fcntl(fileno(f), F_GETFL);
if ((mode & O_ACCMODE) == O_RDONLY)
readMode = true;
#endif
int rc = ::fclose(f);
if ((rc != 0) && !std::uncaught_exception())
if (!readMode && (rc != 0) && !std::uncaught_exception())
RuntimeError("auto_file_ptr: failed to close file: %s", strerror(errno));
f = NULL;
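The motivation for the check above: a failed fclose() on a stream that was only read loses nothing, while on a written stream it may mean buffered data never reached disk. A standalone version of the read-only test, mirroring the Windows CRT flag and POSIX fcntl calls used in the diff:
#include <cstdio>
#ifndef _WIN32
#include <fcntl.h>     // fcntl, F_GETFL, O_ACCMODE
#endif

static bool IsReadOnlyStream(FILE* f)
{
#ifdef _WIN32
    return (f->_flag & _IOREAD) == _IOREAD;        // MSVC CRT flag, as in the diff above
#else
    int mode = fcntl(fileno(f), F_GETFL);
    return (mode & O_ACCMODE) == O_RDONLY;
#endif
}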

Просмотреть файл

@ -1251,7 +1251,7 @@ public:
// BUGBUG: we only really support one archive file at this point
// read the TOC in one swoop
std::vector<char> textbuffer;
auto toclines = msra::files::fgetfilelines(tocpath, textbuffer);
auto toclines = msra::files::fgetfilelines(tocpath, textbuffer, 3);
// parse it one by one
size_t archiveindex = SIZE_MAX; // its index

Просмотреть файл

@ -16,6 +16,7 @@
#endif
#define _CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES 1
#include "Basics.h"
#include "basetypes.h" //for attemp()
#include "fileutil.h"
#include "ProgressTracing.h"
@ -1632,6 +1633,11 @@ static size_t fgetfilechars(const std::wstring& path, vector<char>& buffer)
return len;
}
static void fgetfilechars(const std::wstring& path, vector<char>& buffer, size_t& len)
{
len = fgetfilechars(path, buffer);
}
template <class LINES>
static void strtoklines(char* s, LINES& lines)
{
@ -1639,10 +1645,14 @@ static void strtoklines(char* s, LINES& lines)
lines.push_back(p);
}
void msra::files::fgetfilelines(const std::wstring& path, vector<char>& buffer, std::vector<std::string>& lines)
void msra::files::fgetfilelines(const std::wstring& path, vector<char>& buffer, std::vector<std::string>& lines, int numberOfTries)
{
// load it into RAM in one huge chunk
const size_t len = fgetfilechars(path, buffer);
size_t len = 0;
msra::util::attempt(numberOfTries, [&]() // (can be reading from network)
{
// load it into RAM in one huge chunk
fgetfilechars(path, buffer, len);
});
// parse into lines
lines.resize(0);
@ -1651,11 +1661,15 @@ void msra::files::fgetfilelines(const std::wstring& path, vector<char>& buffer,
}
// same as above but returning const char* (avoiding the memory allocation)
vector<char*> msra::files::fgetfilelines(const wstring& path, vector<char>& buffer)
vector<char*> msra::files::fgetfilelines(const wstring& path, vector<char>& buffer, int numberOfTries)
{
// load it into RAM in one huge chunk
const size_t len = fgetfilechars(path, buffer);
size_t len = 0;
msra::util::attempt(numberOfTries, [&]() // (can be reading from network)
{
// load it into RAM in one huge chunk
fgetfilechars(path, buffer, len);
});
// parse into lines
vector<char*> lines;
lines.reserve(len / 20);
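A sketch of the new retry-aware overload in use, matching the latticearchive.h call above that passes 3 tries; the TOC path is hypothetical:
// Illustrative sketch only; not part of this commit.
static std::vector<std::string> ReadTocWithRetries(const std::wstring& tocPath)
{
    std::vector<char> buffer;
    std::vector<std::string> lines;
    // Each try re-reads the whole file; msra::util::attempt() retries the lambda on failure.
    msra::files::fgetfilelines(tocPath, buffer, lines, /*numberOfTries =*/ 3);
    return lines;
}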

Просмотреть файл

@ -72,6 +72,7 @@ static shared_ptr<ComputationNode<ElemType>> CreateStandardNode(const std::wstri
else if (nodeType == OperationNameOf(InvStdDevNode)) return New<InvStdDevNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(KhatriRaoProductNode)) return New<KhatriRaoProductNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(LogNode)) return New<LogNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(LogPlusNode)) return New<LogPlusNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(LogSoftmaxNode)) return New<LogSoftmaxNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(LookupTableNode)) return New<LookupTableNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(MatrixL1RegNode)) return New<MatrixL1RegNode<ElemType>>(forward<_Types>(_Args)...);
@ -657,6 +658,12 @@ shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Plus(
return net.AddNodeToNetAndAttachInputs(New<PlusNode<ElemType>>(net.GetDeviceId(), nodeName), { a, b });
}
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::LogPlus(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<LogPlusNode<ElemType>>(net.GetDeviceId(), nodeName), { a, b });
}
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Less(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName)
{

Просмотреть файл

@ -134,6 +134,7 @@ public:
ComputationNodePtr InvStdDev(const ComputationNodePtr a, const std::wstring nodeName = L"");
ComputationNodePtr KhatriRaoProduct(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");
ComputationNodePtr Log(const ComputationNodePtr a, const std::wstring nodeName = L"");
ComputationNodePtr LogPlus(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");
ComputationNodePtr LogSoftmax(const ComputationNodePtr a, const std::wstring nodeName = L"");
ComputationNodePtr Logistic(const ComputationNodePtr a, const ComputationNodePtr b, const ComputationNodePtr c, const std::wstring nodeName = L"");
ComputationNodePtr Logistic(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");

Просмотреть файл

@ -1423,8 +1423,8 @@ public:
m_gradientInitialized = true;
}
// resize and reset this node's gradient to a given matrix's value
void ResetGradient(const Matrix<ElemType>& val)
// Assign the given matrix's value to this node's gradient. The matrix sizes must match.
void AssignGradient(const Matrix<ElemType>& val)
{
UpdateDataSize(Gradient());

Просмотреть файл

@ -67,6 +67,8 @@ template class PlusNode<double>;
// -----------------------------------------------------------------------
// LogPlusNode (summand1, summand2)
// Computes ln(exp(summand1) + exp(summand2)) in an overflow-safe way.
// Useful e.g. for computing a softmax over a sequence.
// -----------------------------------------------------------------------
template <class ElemType>
@ -105,8 +107,16 @@ public:
if (Input(inputIndex)->ReducesInTimeWrt(Input(1 - inputIndex)))
Input(1 - inputIndex)->MaskMissingValueColumnsToZero(fr);
// TODO: would be nice to state the derivative here in a comment
inputGradient.AddElementwiseProductWithLogSumDerivativeOf(gradient, input0, input1);
if (inputIndex == 0)
{
// d/dx ln(exp(x) + exp(y)) = exp(x) / (exp(x) + exp(y)) = 1 / (1 + exp(y-x)) = sigmoid(x-y)
inputGradient.AddElementwiseProductWithLogSumDerivativeOf(gradient, input1, input0);
}
else
{
// d/dy ln(exp(x) + exp(y)) = exp(y) / (exp(x) + exp(y)) = 1 / (1 + exp(x-y)) = sigmoid(y-x)
inputGradient.AddElementwiseProductWithLogSumDerivativeOf(gradient, input0, input1);
}
}
};
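For reference, a standalone sketch (not CNTK code) of the quantity LogPlusNode computes and the sigmoid form of its partial derivatives noted in the comments above:
#include <algorithm>
#include <cmath>

// logadd(x, y) = ln(exp(x) + exp(y)), evaluated without overflow:
// factor out the larger argument so the remaining exponent is <= 0.
static double LogAdd(double x, double y)
{
    double hi = std::max(x, y), lo = std::min(x, y);
    return hi + std::log1p(std::exp(lo - hi));
}

static double Sigmoid(double z) { return 1.0 / (1.0 + std::exp(-z)); }

// d/dx logadd(x, y) = sigmoid(x - y); a central difference can be used to spot-check it.
static double NumericalGradientX(double x, double y, double eps = 1e-6)
{
    return (LogAdd(x + eps, y) - LogAdd(x - eps, y)) / (2 * eps);
}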

Просмотреть файл

@ -8,7 +8,8 @@
#define __STDC_FORMAT_MACROS
#include <inttypes.h>
#include "stdafx.h"
#include <stdio.h>
#include <math.h>
#define EVAL_EXPORTS // creating the exports here
#include "Eval.h"
#include "Actions.h"
@ -26,6 +27,7 @@
#include "NoRandomizer.h"
#include "HeapMemoryProvider.h"
#include "InputAndParamNodes.h"
#include "latticearchive.h"
// TODO: Temporary mechanism to enable memory sharing for
// node output value matrices. This will go away when the
@ -99,6 +101,8 @@ extern "C" EVAL_API void GetEvalD(IEvaluateModel<double>** peval)
template <typename ElemType>
void CNTKEval<ElemType>::GetNodeDimensions(std::map<std::wstring, size_t>& dimensions, NodeGroup nodeGroup)
{
// On Linux with gcc 4.8.4, "this->" must be added when referencing m_net, a protected member inherited from a templated (dependent) base class,
// so that two-phase name lookup resolves the name correctly.
if (this->m_net == NULL)
{
for (auto iter = dimensions.begin(); iter != dimensions.end(); iter++)
@ -317,15 +321,17 @@ void CNTKEvalExtended<ElemType>::ForwardPassT(const std::vector<ValueBuffer<Elem
RuntimeError("Expected %d outputs, but got %d.", (int)m_outputNodes.size(), (int)outputs.size());
size_t i = 0;
for (auto& input : m_inputMatrices)
for (auto& inputNode : m_inputNodes)
{
// const cast: The matrix class takes this over without copying and could theoretically change the contents,
// though it doesn't in this case.
auto& buffer = const_cast<ValueBuffer<ElemType, ValueContainer>&>(inputs[i]);
shared_ptr<Matrix<ElemType>> matrix = dynamic_pointer_cast<Matrix<ElemType>>(input.second.matrix);
auto matrix = dynamic_pointer_cast<Matrix<ElemType>>(inputNode->ValuePtr());
auto type = matrix->GetMatrixType();
size_t numRows = input.second.sampleLayout.GetNumElements();
size_t numRows = inputNode->GetSampleLayout().GetNumElements();
if (buffer.m_buffer.data() == nullptr)
RuntimeError("Input %ls: Buffer is not allocated.", m_inputNodes[i]->GetName().c_str());
if (type == MatrixType::DENSE)
{
if (buffer.m_buffer.size() % numRows != 0)
@ -336,8 +342,12 @@ void CNTKEvalExtended<ElemType>::ForwardPassT(const std::vector<ValueBuffer<Elem
}
else if (type == MatrixType::SPARSE)
{
if (buffer.m_colIndices.data() == nullptr)
RuntimeError("Input %ls: Due to sparse input format, expected colIndices array, but was nullptr.", m_inputNodes[i]->GetName().c_str());
if (buffer.m_indices.data() == nullptr)
RuntimeError("Input %ls: Due to sparse input format, expected Indices array, but was nullptr.", m_inputNodes[i]->GetName().c_str());
if (buffer.m_colIndices.size() < 2)
RuntimeError("Input %ls: Expected at least one element.", m_inputNodes[i]->GetName().c_str());
RuntimeError("Input %ls: Expected at least one element (2 entries in colIndices array).", m_inputNodes[i]->GetName().c_str());
if (buffer.m_colIndices[0] != 0)
RuntimeError("Input %ls: First element of column indices must be 0", m_inputNodes[i]->GetName().c_str());
if (buffer.m_colIndices[buffer.m_colIndices.size() - 1] != buffer.m_indices.size())
@ -348,8 +358,8 @@ void CNTKEvalExtended<ElemType>::ForwardPassT(const std::vector<ValueBuffer<Elem
int numCols = type == MatrixType::DENSE ? buffer.m_buffer.size() / numRows : buffer.m_colIndices.size() - 1;
assert(numCols >= 1);
input.second.pMBLayout->Init(1, numCols);
input.second.pMBLayout->AddSequence(0, 0, 0, numCols);
inputNode->GetMBLayout()->Init(1, numCols);
inputNode->GetMBLayout()->AddSequence(0, 0, 0, numCols);
if (type == MatrixType::DENSE)
matrix->SetValue(numRows, numCols, matrix->GetDeviceId(), buffer.m_buffer.data(), matrixFlagNormal);
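The sparse checks above assume the CSC layout documented in the Eval headers. A concrete conforming input for one sequence of three sparse columns with 2 / 4 / 2 non-zero entries (the same example reused by the managed wrapper further down); these vectors would populate m_buffer, m_indices and m_colIndices of a ValueBuffer:
// Illustrative data only; not part of this commit.
std::vector<float> buffer     = { 0, 1, 2, 3, 4, 5, 6, 7 };   // all non-zero values, columns concatenated
std::vector<int>   indices    = { 1, 3, 2, 3, 5, 6, 2, 7 };   // row index of each value, ascending per column
std::vector<int>   colIndices = { 0, 2, 6, 8 };               // numCols + 1 entries: starts at 0,
                                                              // ends at indices.size()
// numCols = colIndices.size() - 1 = 3, exactly as ForwardPassT computes above.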

Просмотреть файл

@ -134,8 +134,6 @@
<ClInclude Include="..\Common\Include\TimerUtility.h" />
<ClInclude Include="EvalReader.h" />
<ClInclude Include="EvalWriter.h" />
<ClInclude Include="stdafx.h" />
<ClInclude Include="targetver.h" />
<ClInclude Include="CNTKEval.h" />
</ItemGroup>
<ItemGroup>
@ -146,12 +144,9 @@
<PrecompiledHeader>
</PrecompiledHeader>
</ClCompile>
<ClCompile Include="stdafx.cpp">
<PrecompiledHeader>Create</PrecompiledHeader>
</ClCompile>
<ClCompile Include="CNTKEval.cpp" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
</ImportGroup>
</Project>
</Project>

Просмотреть файл

@ -5,9 +5,6 @@
<ClCompile Include="dllmain.cpp">
<Filter>Misc</Filter>
</ClCompile>
<ClCompile Include="stdafx.cpp">
<Filter>Misc</Filter>
</ClCompile>
<ClCompile Include="..\CNTK\BrainScript\BrainScriptEvaluator.cpp">
<Filter>BrainScript</Filter>
</ClCompile>
@ -31,12 +28,6 @@
<ClInclude Include="..\Common\Include\Basics.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="stdafx.h">
<Filter>Misc</Filter>
</ClInclude>
<ClInclude Include="targetver.h">
<Filter>Misc</Filter>
</ClInclude>
<ClInclude Include="..\Common\Include\Config.h">
<Filter>Common\Include</Filter>
</ClInclude>

Просмотреть файл

@ -4,7 +4,7 @@
//
// dllmain.cpp : Defines the entry point for the DLL application.
//
#include "stdafx.h"
#ifdef _WIN32
#define WIN32_LEAN_AND_MEAN
#define NOMINMAX

Просмотреть файл

@ -1,8 +0,0 @@
// stdafx.cpp : source file that includes just the standard includes
// ParseNumber.pch will be the pre-compiled header
// stdafx.obj will contain the pre-compiled type information
#include "stdafx.h"
// TODO: reference any additional headers you need in STDAFX.H
// and not in this file

Просмотреть файл

@ -1,17 +0,0 @@
// stdafx.h : include file for standard system include files,
// or project specific include files that are used frequently, but
// are changed infrequently
//
#pragma once
#ifndef _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
#endif
#ifdef _WIN32
#include "targetver.h"
#endif
#include <stdio.h>
#include <math.h>
// TODO: reference additional headers your program requires here

Просмотреть файл

@ -1,8 +0,0 @@
#pragma once
// Including SDKDDKVer.h defines the highest available Windows platform.
// If you wish to build your application for a previous Windows platform, include WinSDKVer.h and
// set the _WIN32_WINNT macro to the platform you wish to support before including SDKDDKVer.h.
#include <SDKDDKVer.h>

Просмотреть файл

@ -1,12 +0,0 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
// stdafx.cpp : source file that includes just the standard includes
// CPPEvalClient.pch will be the pre-compiled header
// stdafx.obj will contain the pre-compiled type information
#include "stdafx.h"
// TODO: reference any additional headers you need in STDAFX.H
// and not in this file

Просмотреть файл

@ -1,19 +0,0 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
// stdafx.h : include file for standard system include files,
// or project specific include files that are used frequently, but
// are changed infrequently
//
#pragma once
#include "targetver.h"
#include <stdio.h>
#include <tchar.h>
#include "targetver.h"
// This is a windows only application
#include "Windows.h"

Просмотреть файл

@ -1,13 +0,0 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
// Including SDKDDKVer.h defines the highest available Windows platform.
// If you wish to build your application for a previous Windows platform, include WinSDKVer.h and
// set the _WIN32_WINNT macro to the platform you wish to support before including SDKDDKVer.h.
#include <SDKDDKVer.h>

Просмотреть файл

@ -1,80 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="12.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<Import Project="$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props" Condition="Exists('$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props')" />
<PropertyGroup>
<Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
<Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
<ProjectGuid>{41E11A59-62B2-4927-A4F8-F40B1B612C6C}</ProjectGuid>
<OutputType>Exe</OutputType>
<AppDesignerFolder>Properties</AppDesignerFolder>
<RootNamespace>Microsoft.MSR.CNTK.Extensibility.Managed.CSEvalClient</RootNamespace>
<AssemblyName>CSEvalClient</AssemblyName>
<TargetFrameworkVersion>v4.5</TargetFrameworkVersion>
<FileAlignment>512</FileAlignment>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)' == 'Debug|x64'">
<DebugSymbols>true</DebugSymbols>
<OutputPath>..\..\..\x64\Debug\</OutputPath>
<DefineConstants>DEBUG;TRACE</DefineConstants>
<DebugType>full</DebugType>
<PlatformTarget>x64</PlatformTarget>
<ErrorReport>prompt</ErrorReport>
<CodeAnalysisRuleSet>MinimumRecommendedRules.ruleset</CodeAnalysisRuleSet>
<Prefer32Bit>true</Prefer32Bit>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)' == 'Debug_CpuOnly|x64'">
<DebugSymbols>true</DebugSymbols>
<OutputPath>..\..\..\x64\Debug_CpuOnly\</OutputPath>
<DefineConstants>DEBUG;TRACE</DefineConstants>
<DebugType>full</DebugType>
<PlatformTarget>x64</PlatformTarget>
<ErrorReport>prompt</ErrorReport>
<CodeAnalysisRuleSet>MinimumRecommendedRules.ruleset</CodeAnalysisRuleSet>
<Prefer32Bit>true</Prefer32Bit>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)' == 'Release|x64'">
<OutputPath>..\..\..\x64\Release\</OutputPath>
<DefineConstants>TRACE</DefineConstants>
<Optimize>true</Optimize>
<DebugType>pdbonly</DebugType>
<PlatformTarget>x64</PlatformTarget>
<ErrorReport>prompt</ErrorReport>
<CodeAnalysisRuleSet>MinimumRecommendedRules.ruleset</CodeAnalysisRuleSet>
<Prefer32Bit>true</Prefer32Bit>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)' == 'Release_CpuOnly|x64'">
<OutputPath>..\..\..\x64\Release_CpuOnly\</OutputPath>
<DefineConstants>TRACE</DefineConstants>
<Optimize>true</Optimize>
<DebugType>pdbonly</DebugType>
<PlatformTarget>x64</PlatformTarget>
<ErrorReport>prompt</ErrorReport>
<CodeAnalysisRuleSet>MinimumRecommendedRules.ruleset</CodeAnalysisRuleSet>
<Prefer32Bit>true</Prefer32Bit>
</PropertyGroup> <ItemGroup>
<Reference Include="System" />
<Reference Include="System.Core" />
<Reference Include="Microsoft.CSharp" />
</ItemGroup>
<ItemGroup>
<Compile Include="Program.cs" />
<Compile Include="Properties\AssemblyInfo.cs" />
</ItemGroup>
<ItemGroup>
<None Include="App.config" />
</ItemGroup>
<ItemGroup>
<ProjectReference Include="..\EvalWrapper\EvalWrapper.vcxproj">
<Project>{ef766cae-9cb1-494c-9153-0030631a6340}</Project>
<Name>EvalWrapper</Name>
</ProjectReference>
</ItemGroup>
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
<!-- To modify your build process, add your task inside one of the targets below and uncomment it.
Other similar extension points exist, see Microsoft.Common.targets.
<Target Name="BeforeBuild">
</Target>
<Target Name="AfterBuild">
</Target>
-->
</Project>

Просмотреть файл

@ -0,0 +1,105 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
// CNTKException.h -- Managed CNTK Exception wrappers
//
#include "ExceptionWithCallStack.h"
using namespace std;
using namespace System;
using namespace System::Collections::Generic;
using namespace System::Collections;
using namespace System::Runtime::Serialization;
using namespace Microsoft::MSR::CNTK;
namespace Microsoft { namespace MSR { namespace CNTK { namespace Extensibility { namespace Managed {
[Serializable]
public ref class CNTKException : Exception, ISerializable
{
public:
CNTKException() : Exception()
{}
CNTKException(String^ message) : Exception(message)
{}
CNTKException(String^ message, String^ callstack) : Exception(message), NativeCallStack(callstack)
{}
const String^ NativeCallStack;
protected:
CNTKException(SerializationInfo^ info, StreamingContext context) : Exception(info, context)
{}
};
[Serializable]
public ref class CNTKRuntimeException : CNTKException
{
public:
CNTKRuntimeException() : CNTKException()
{}
CNTKRuntimeException(String^ message, String^ callstack) : CNTKException(message, callstack)
{}
protected:
CNTKRuntimeException(SerializationInfo^ info, StreamingContext context) : CNTKException(info, context)
{}
};
[Serializable]
public ref class CNTKLogicErrorException : CNTKException
{
public:
CNTKLogicErrorException() : CNTKException()
{}
CNTKLogicErrorException(String^ message, String^ callstack) : CNTKException(message, callstack)
{}
protected:
CNTKLogicErrorException(SerializationInfo^ info, StreamingContext context) : CNTKException(info, context)
{}
};
[Serializable]
public ref class CNTKInvalidArgumentException : CNTKException
{
public:
CNTKInvalidArgumentException() : CNTKException()
{}
CNTKInvalidArgumentException(String^ message, String^ callstack) : CNTKException(message, callstack)
{}
protected:
CNTKInvalidArgumentException(SerializationInfo^ info, StreamingContext context) : CNTKException(info, context)
{}
};
[Serializable]
public ref class CNTKBadAllocException : CNTKException
{
public:
CNTKBadAllocException() : CNTKException()
{}
CNTKBadAllocException(String^ message) : CNTKException(message)
{}
protected:
CNTKBadAllocException(SerializationInfo^ info, StreamingContext context) : CNTKException(info, context)
{}
};
}}}}}
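A sketch of how the exception types above are meant to be used at the native/managed boundary; only the generic translation is shown, and the wrapper function name is hypothetical:
// Illustrative sketch only; not part of this commit.
void CallIntoNativeEval(/* native call captured by the real wrapper methods */)
{
    try
    {
        // ... invoke the native IEvaluateModelExtended implementation here ...
    }
    catch (const std::bad_alloc& ex)
    {
        throw gcnew CNTKBadAllocException(gcnew System::String(ex.what()));
    }
    catch (const std::exception& ex)
    {
        // The real wrapper also distinguishes runtime/logic/invalid-argument errors and
        // forwards the native call stack via the two-argument constructors above.
        throw gcnew CNTKException(gcnew System::String(ex.what()));
    }
}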

Просмотреть файл

@ -0,0 +1,31 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
// EvalCommon.h -- Common structures used by managed code wrapping the native EvaluateModel interface
//
namespace Microsoft { namespace MSR { namespace CNTK { namespace Extensibility { namespace Managed {
/// Enumeration for the types of nodes
public enum class NodeGroup
{
Input, // an input node
Output, // an output node
Specified
};
public enum class DataType
{
Float32,
Float64
};
public enum class StorageType
{
Unknown,
Dense,
Sparse,
};
}}}}}

Просмотреть файл

@ -0,0 +1,558 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
// EvalExtendedWrapper.cpp -- Managed code wrapping the native EvaluateExtendedModel interface
//
#include <windows.h>
#include <vcclr.h>
#include <string>
#include <utility>
#include <vector>
#include <memory>
#include <msclr\marshal_cppstd.h>
#include "CNTKException.h"
#pragma warning(push)
#pragma warning(disable : 4793) // Function compiled as native
#include "Basics.h"
#include "ScriptableObjects.h"
#pragma warning(pop)
#include "EvalCommon.h"
#include "Eval.h"
#using <System.dll>
#using <System.Collections.dll>
#using <System.IO.dll>
#using <System.Reflection.dll>
using namespace std;
using namespace System;
using namespace System::Collections::Generic;
using namespace System::Collections;
namespace Microsoft { namespace MSR { namespace CNTK { namespace Extensibility { namespace Managed {
namespace Native = Microsoft::MSR::CNTK;
// Used for retrieving the appropriate model for the element type (float / double)
template<typename ElemType>
using GetEvalProc = void(*)(IEvaluateModelExtended<ElemType>**);
//
// A buffer to keep data for all samples in a (variable length) sequence
// from a single input or output.
// This is used for both dense and sparse data.
//
generic<class ElemType>
public ref class ValueBuffer
{
public:
ValueBuffer()
{
Size = 0;
}
//
// Init for Dense
//
ValueBuffer(int bufferSize)
{
Buffer = gcnew array<ElemType>(bufferSize);
Size = bufferSize;
}
//
// Init for Sparse
//
ValueBuffer(int bufferSize, int colIndicesSize)
{
Buffer = gcnew array<ElemType>(bufferSize);
Indices = gcnew array<int>(bufferSize);
ColIndices = gcnew array<int>(colIndicesSize);
Size = colIndicesSize - 1;
}
//
    // For dense, this is the length of Buffer (in number of ElemTypes).
// For sparse, this is the length of ColIndices (i.e. the number of columns + 1).
// This allows Buffer / Indices / ColIndices to be larger than Size to avoid
// reallocation.
//
property int Size;
//
// All elements of a sequence, concatenated.
    // For dense inputs, the number of samples is given by the length of
// this vector / product of tensor dimensions. E.g. for a tensor of dimension
// [2,2] and 12 elements in the buffer, the number of samples is 3.
// For sparse inputs, the number of samples is indicated by the ColIndices field.
//
property array<ElemType>^ Buffer;
// In case of sparse data, the following is also used. Otherwise, the
// contents are ignored.
// E.g. a sequence of three sparse vectors with 2 / 4 / 2 non-zero values
// could be represented as the following:
// colIdx: 0 2 6 8
// v v v v
// indices 1 3 2 3 5 6 2 7
// buffer 0 1 2 3 4 5 6 7
//
// For every element in buffer, an entry in this array gives its position.
// For every vector the entries must be ascending.
//
property array<int>^ Indices;
//
    // Contains numberOfSamples + 1 indices into the buffer. The first entry
// is always 0. The last entry points after the last element.
// See http://docs.nvidia.com/cuda/cusparse/#compressed-sparse-column-format-csc
//
property array<int>^ ColIndices;
// TODO: Should it have a read-only StorageType property?
};
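    //
    // Illustrative sketch (not part of the original wrapper): building the sparse
    // three-sample sequence from the comment above (2 / 4 / 2 non-zero values,
    // buffer values 0..7). The helper name is hypothetical.
    //
    ValueBuffer<float>^ MakeExampleSparseSequence()
    {
        // 8 non-zero values across 3 samples -> ColIndices needs 3 + 1 entries.
        ValueBuffer<float>^ vb = gcnew ValueBuffer<float>(8, 4);
        for (int i = 0; i < 8; i++)
        {
            vb->Buffer[i] = (float)i;          // concatenated non-zero values 0..7
        }
        array<int>^ indices    = gcnew array<int> { 1, 3, 2, 3, 5, 6, 2, 7 };  // row position of each value
        array<int>^ colIndices = gcnew array<int> { 0, 2, 6, 8 };              // CSC offsets into Buffer
        indices->CopyTo(vb->Indices, 0);
        colIndices->CopyTo(vb->ColIndices, 0);
        return vb;                             // vb->Size == 3 (ColIndices length - 1), per the sparse constructor
    }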
//
// Meta data
//
public ref struct VariableLayout
{
// Name of the input
property String^ Name;
property DataType DataType;
property StorageType StorageType;
// Dimension of the tensor, flattened to 1 dimension, for one entry on the dynamic axis.
// E.g. for a tensor [2,3,*] this would be 6.
property int NumElements;
};
public ref class VariableSchema : List<VariableLayout^>
{
public:
generic<typename ElemType>
array<ValueBuffer<ElemType>^>^ CreateBuffers(... array<int>^ maxLengths)
{
if (maxLengths->Length == 0)
{
maxLengths = gcnew array<int>(this->Count);
for (int i = 0; i<maxLengths->Length; i++)
{
maxLengths[i] = 1;
}
}
if (maxLengths->Length != this->Count)
{
throw gcnew CNTKRuntimeException("Expected max lengths for all variables.", String::Empty);
}
array<ValueBuffer<ElemType>^>^ buffers = gcnew array<ValueBuffer<ElemType>^>(this->Count);
for (int i = 0; i < this->Count; i++)
{
buffers[i] = gcnew ValueBuffer<ElemType>(this[i]->NumElements * maxLengths[i]);
}
return buffers;
}
};
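    //
    // Example (illustrative): for a schema with two variables of NumElements 3 and 5,
    // CreateBuffers<float>() allocates dense buffers of 3 and 5 elements (one sample each),
    // while CreateBuffers<float>(10, 1) allocates 30 elements for the first variable
    // (room for up to 10 samples) and 5 for the second.
    //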
/// Managed wrapper for the native evaluation model
template<typename ElemType>
public ref class ModelEvaluationExtended : IDisposable
{
typedef std::pair<std::wstring, std::vector<ElemType>*> MapEntry;
typedef std::shared_ptr<Native::ValueBuffer<ElemType, Native::VectorRef>> ValueBufferPtr;
public:
    /// <summary>Initializes a new instance of the <see cref="ModelEvaluationExtended"/> class.</summary>
/// <param name="funcName">Factory function name for retrieving the native model from the dll.</param>
ModelEvaluationExtended(String^ funcName)
{
try
{
pin_ptr <IEvaluateModelExtended<ElemType>*> p_eval = &m_eval;
GetEvalExtended<ElemType>(p_eval);
}
catch (const exception& ex)
{
throw gcnew CNTKException(gcnew System::String(ex.what()));
}
}
/// <summary>Creates a network based on the network description in the configuration</summary>
/// <param name="networkDescription">The configuration file containing the network description</param>
void CreateNetwork(String^ networkDescription)
{
if (m_eval == nullptr)
{
throw gcnew ObjectDisposedException("Object has been disposed.");
}
msclr::interop::marshal_context context;
const std::string stdNetworkDescription = context.marshal_as<std::string>(networkDescription);
try
{
m_eval->CreateNetwork(stdNetworkDescription);
}
catch (const exception& ex)
{
throw GetCustomException(ex);
}
}
//
// GetInputSchema - retrieve information about tensor shapes and memory layout for this model.
//
VariableSchema^ GetInputSchema()
{
if (m_eval == nullptr)
{
throw gcnew ObjectDisposedException("Object has been disposed.");
}
return ConvertNativeSchemaToManaged(m_eval->GetInputSchema());
}
//
// GetOutputSchema - retrieve information about tensor shapes and memory layout for this model.
//
VariableSchema^ GetOutputSchema()
{
if (m_eval == nullptr)
{
throw gcnew ObjectDisposedException("Object has been disposed.");
}
return ConvertNativeSchemaToManaged(m_eval->GetOutputSchema());
}
//
// Allocate internal state for calling ForwardPass(). The call restricts the network (inputs and outputs)
    // to the functions needed to compute the requested output names.
//
void StartForwardEvaluation(List<String^>^ outputs)
{
if (m_eval == nullptr)
{
throw gcnew ObjectDisposedException("Object has been disposed.");
}
std::vector<wstring> outputNodeNames;
msclr::interop::marshal_context context;
for each (String^ output in outputs)
{
outputNodeNames.push_back(context.marshal_as<std::wstring>(output));
}
try
{
m_eval->StartForwardEvaluation(outputNodeNames);
}
catch (const exception& ex)
{
throw GetCustomException(ex);
}
}
//
// Forward Pass - Evaluate (perform a forward pass for) a single unit using the model with the given inputs and
// outputs.
    // The layout and shape of the data in the input buffers must match the schema returned by GetInputSchema().
    // This method is not reentrant, as the forward pass keeps internal state.
    // inputs - array of input buffers, one for every input as given by GetInputSchema()
    // outputs - array of output buffers, one for every output requested in StartForwardEvaluation();
    //           the buffers must be preallocated by the caller (e.g. via VariableSchema::CreateBuffers)
    // Called after StartForwardEvaluation(). (An illustrative usage sketch follows ModelEvaluationExtendedD below.)
//
void ForwardPass(array<ValueBuffer<ElemType>^>^ inputs, array<ValueBuffer<ElemType>^>^ outputs)
{
if (m_eval == nullptr)
{
throw gcnew ObjectDisposedException("Object has been disposed.");
}
try
{
Native::ValueRefs<ElemType> stdInputs;
Native::ValueRefs<ElemType> stdOutputs;
// Hold gc objects in the stack, while performing native actions
vector<gcroot<array<ElemType>^>> pinBuffers;
vector<gcroot<array<int>^>> pinIndices;
// Map the managed space into the native space, results will be written directly into the managed memory space
// https://msdn.microsoft.com/en-us/library/1dz8byfh.aspx
TransferVectorsToValueBuffers(inputs, stdInputs, pinBuffers, pinIndices, StorageType::Sparse);
TransferVectorsToValueBuffers(outputs, stdOutputs, pinBuffers, pinIndices, StorageType::Dense);
try
{
m_eval->ForwardPass(stdInputs, stdOutputs);
// Update actual output size.
for (int i = 0; i < outputs->Length; ++i)
{
outputs[i]->Size = (int)stdOutputs[i].m_buffer.m_size;
}
}
catch (const exception& ex)
{
throw GetCustomException(ex);
}
}
catch (Exception^)
{
throw;
}
}
~ModelEvaluationExtended()
{
if (m_eval == nullptr)
{
return;
}
this->!ModelEvaluationExtended();
}
protected:
!ModelEvaluationExtended()
{
if (m_eval != nullptr)
{
m_eval->Destroy();
m_eval = nullptr;
}
}
private:
// Native model evaluation instance
IEvaluateModelExtended<ElemType> *m_eval;
    /// <summary>Creates a CLR exception corresponding to a native exception</summary>
    /// <param name="ex">The native exception to convert into a CLR exception</param>
/// <returns>A CLR exception</returns>
CNTKException^ GetCustomException(const exception& ex)
{
// Determine the appropriate exception and initialize it with the exception payload
if (typeid(ex) == typeid(ExceptionWithCallStack<runtime_error>))
{
ExceptionWithCallStack<runtime_error>& rich = dynamic_cast<ExceptionWithCallStack<runtime_error>&>((runtime_error&)ex);
return gcnew CNTKRuntimeException(gcnew System::String(rich.what()), gcnew System::String(rich.CallStack()));
}
else if (typeid(ex) == typeid(ExceptionWithCallStack<logic_error>))
{
ExceptionWithCallStack<logic_error>& rich = dynamic_cast<ExceptionWithCallStack<logic_error>&>((logic_error&)ex);
return gcnew CNTKLogicErrorException(gcnew System::String(ex.what()), gcnew System::String(rich.CallStack()));
}
else if (typeid(ex) == typeid(ExceptionWithCallStack<invalid_argument>))
{
ExceptionWithCallStack<invalid_argument>& rich = dynamic_cast<ExceptionWithCallStack<invalid_argument>&>((invalid_argument&)ex);
return gcnew CNTKInvalidArgumentException(gcnew System::String(ex.what()), gcnew System::String(rich.CallStack()));
}
else if (typeid(ex) == typeid(bad_alloc))
{
return gcnew CNTKBadAllocException(gcnew System::String(ex.what()));
}
else if (dynamic_cast<const ScriptableObjects::ScriptingException*>(&ex) != nullptr) // Includes derived classes
{
const auto& err = dynamic_cast<const ScriptableObjects::ScriptingException&>(ex);
return gcnew CNTKLogicErrorException(gcnew System::String(wstrprintf(L"%ls\n%ls", utf16(err.what()).c_str(), err.GetError(L"").c_str()).c_str()), nullptr);
}
else
{
return gcnew CNTKException(gcnew System::String(ex.what()));
}
}
    /// <summary>Converts a managed (CLI) NodeGroup to the corresponding native NodeGroup</summary>
/// <param name="nodeGroup">The managed (CLI) NodeGroup to convert to native</param>
Native::NodeGroup GetNodeGroup(NodeGroup nodeGroup)
{
switch ((int)nodeGroup)
{
case Native::NodeGroup::nodeInput:
return Native::NodeGroup::nodeInput;
case Native::NodeGroup::nodeOutput:
return Native::NodeGroup::nodeOutput;
case Native::NodeGroup::nodeSpecified:
return Native::NodeGroup::nodeSpecified;
default:
            throw gcnew CNTKRuntimeException(String::Format("Cannot convert managed NodeGroup with value: {0} to the corresponding native NodeGroup.", (int)nodeGroup), "");
}
}
DataType GetDataType(Microsoft::MSR::CNTK::VariableLayout::DataType dataType)
{
switch ((int)dataType)
{
case DataType::Float32:
return DataType::Float32;
case DataType::Float64:
return DataType::Float64;
default:
throw gcnew CNTKRuntimeException(String::Format("Cannot convert native DataType with value: {0} to corresponding managed DataType.", (int)dataType), "");
}
}
StorageType GetStorageType(Microsoft::MSR::CNTK::VariableLayout::StorageType storageType)
{
switch ((int)storageType)
{
case StorageType::Dense:
return StorageType::Dense;
case StorageType::Sparse:
return StorageType::Sparse;
case StorageType::Unknown:
return StorageType::Unknown;
default:
throw gcnew CNTKRuntimeException(String::Format("Cannot convert native StorageType with value: {0} to corresponding managed StorageType.", (int)storageType), "");
}
}
void PinBuffer(array<ElemType>^ itemBuffer, vector<gcroot<array<ElemType>^>>& pinBuffers, Native::ValueBuffer<ElemType, Native::VectorRef>* vb, StorageType storageType, int bufferSize)
{
// gcroot object manages the pointer so that it always corresponds to the correct managed location (even after gc relocation)
gcroot<array<ElemType>^> pBuf(itemBuffer);
pin_ptr<ElemType> pp = &(pBuf[0]);
pinBuffers.push_back(pBuf);
vb->m_buffer.InitFrom(pp, bufferSize, storageType == StorageType::Sparse ? bufferSize : 0);
pp = nullptr;
}
void PinIndices(array<int>^ itemBuffer, vector<gcroot<array<int>^>>& pinBuffers, Native::ValueBuffer<ElemType, Native::VectorRef>* vb, StorageType storageType, int bufferSize)
{
// gcroot object manages the pointer so that it always corresponds to the correct managed location (even after gc relocation)
gcroot<array<int>^> pBuf(itemBuffer);
pin_ptr<int> pp = &(pBuf[0]);
pinBuffers.push_back(pBuf);
vb->m_indices.InitFrom(pp, bufferSize, storageType == StorageType::Sparse ? bufferSize : 0);
pp = nullptr;
}
void PinColIndices(array<int>^ itemBuffer, vector<gcroot<array<int>^>>& pinBuffers, Native::ValueBuffer<ElemType, Native::VectorRef>* vb, StorageType storageType, int bufferSize)
{
// gcroot object manages the pointer so that it always corresponds to the correct managed location (even after gc relocation)
gcroot<array<int>^> pBuf(itemBuffer);
pin_ptr<int> pp = &(pBuf[0]);
pinBuffers.push_back(pBuf);
vb->m_colIndices.InitFrom(pp, bufferSize, storageType == StorageType::Sparse ? bufferSize : 0);
pp = nullptr;
}
void TransferVectorsToValueBuffers(array<ValueBuffer<ElemType>^>^ list, Native::ValueRefs<ElemType>& valueRefs, vector<gcroot<array<ElemType>^>>& pinBuffers, vector<gcroot<array<int>^>>& pinIndices, StorageType storageType)
{
for each (auto item in list)
{
Native::ValueBuffer<ElemType, Native::VectorRef> vb;
int numElements = item->Size;
int bufferSize = item->ColIndices != nullptr ? item->ColIndices[item->Size - 1] : item->Size;
// Buffer is required
if (item->Buffer == nullptr)
{
throw gcnew CNTKRuntimeException("Invalid buffer (empty) for argument into ForwardPass", String::Empty);
}
PinBuffer(item->Buffer, pinBuffers, &vb, storageType, bufferSize);
if (item->Indices != nullptr)
{
PinIndices(item->Indices, pinIndices, &vb, storageType, bufferSize);
}
if (item->ColIndices != nullptr)
{
PinColIndices(item->ColIndices, pinIndices, &vb, storageType, numElements);
}
valueRefs.push_back(vb);
}
}
//
    // ConvertNativeSchemaToManaged - Converts a native schema to a managed one
//
VariableSchema^ ConvertNativeSchemaToManaged(Native::VariableSchema layouts)
{
if (m_eval == nullptr)
{
throw gcnew ObjectDisposedException("Object has been disposed.");
}
auto schema = gcnew VariableSchema();
for (auto& lay : layouts)
{
VariableLayout^ varlayout = gcnew VariableLayout();
varlayout->Name = gcnew String(lay.m_name.c_str());
varlayout->DataType = GetDataType(lay.m_dataType);
varlayout->NumElements = static_cast<int>(lay.m_numElements);
varlayout->StorageType = GetStorageType(lay.m_storageType);
schema->Add(varlayout);
}
return schema;
}
};
/// <summary>Managed float-specific model evaluation class</summary>
/// <remarks>This class is necessary due to how generics and templates work in CLR</remarks>
public ref class ModelEvaluationExtendedF : ModelEvaluationExtended<float>
{
public:
ModelEvaluationExtendedF::ModelEvaluationExtendedF()
: ModelEvaluationExtended("GetEvalExtendedF")
{
}
};
/// <summary>Managed double-specific model evaluation class</summary>
/// <remarks>This class is necessary due to how generics and templates work in CLR</remarks>
public ref class ModelEvaluationExtendedD : ModelEvaluationExtended<double>
{
public:
ModelEvaluationExtendedD::ModelEvaluationExtendedD()
: ModelEvaluationExtended("GetEvalExtendedD")
{
}
};
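// Illustrative usage sketch (not part of the original commit): the intended call
// sequence for the extended evaluator. The BrainScript snippet ("model.dnn") and
// the choice to request every output node are placeholders, not part of this file.
void ExampleForwardPass()
{
    ModelEvaluationExtendedF^ model = gcnew ModelEvaluationExtendedF();
    model->CreateNetwork("modelPath=\"model.dnn\"");         // hypothetical network description

    VariableSchema^ outputSchema = model->GetOutputSchema();
    List<String^>^ outputNames = gcnew List<String^>();
    for each (VariableLayout^ layout in outputSchema)
    {
        outputNames->Add(layout->Name);                       // request every output node
    }
    model->StartForwardEvaluation(outputNames);

    VariableSchema^ inputSchema = model->GetInputSchema();
    array<ValueBuffer<float>^>^ inputs  = inputSchema->CreateBuffers<float>();
    array<ValueBuffer<float>^>^ outputs = outputSchema->CreateBuffers<float>();

    // ... fill inputs[i]->Buffer (and Indices/ColIndices for sparse inputs) here ...

    model->ForwardPass(inputs, outputs);                      // outputs[i]->Size now holds the actual output size
}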
// This method tricks the compiler into emitting the methods of the classes
// Refer to https://msdn.microsoft.com/en-us/library/ms177213.aspx for an
// explanation to this behavior
void EmitExtended()
{
ModelEvaluationExtendedF f;
f.CreateNetwork("");
f.GetInputSchema();
f.GetOutputSchema();
f.StartForwardEvaluation(nullptr);
f.ForwardPass(nullptr, nullptr);
ModelEvaluationExtendedD d;
d.CreateNetwork("");
d.GetInputSchema();
d.GetOutputSchema();
d.StartForwardEvaluation(nullptr);
d.ForwardPass(nullptr, nullptr);
VariableSchema sc;
sc.CreateBuffers<float>();
sc.CreateBuffers<double>();
}
}}}}}

Просмотреть файл

@ -13,7 +13,8 @@
#include <memory>
#include <msclr\marshal_cppstd.h>
#include "ExceptionWithCallStack.h"
#include "CNTKException.h"
#include "EvalCommon.h"
#include "Eval.h"
#using <System.dll>
@ -23,25 +24,14 @@ using namespace std;
using namespace System;
using namespace System::Collections::Generic;
using namespace System::Collections;
using namespace System::Runtime::Serialization;
using namespace Microsoft::MSR::CNTK;
namespace Microsoft { namespace MSR { namespace CNTK { namespace Extensibility { namespace Managed {
ref class CNTKException;
// Used for retrieving the model appropriate for the element type (float / double)
template<typename ElemType>
using GetEvalProc = void(*)(IEvaluateModel<ElemType>**);
/// Enumeration for the types of nodes
public enum class NodeGroup
{
nodeInput, // an input node
nodeOutput, // an output node
nodeSpecified
};
/// Managed wrapper for the native evaluation model
template<typename ElemType>
public ref class IEvaluateModelManaged : IDisposable
@ -53,21 +43,10 @@ public:
/// <param name="funcName">Factory function name for retrieving the native model from the dll.</param>
IEvaluateModelManaged(String^ funcName)
{
pin_ptr<const WCHAR> dllname = PtrToStringChars("evaldll.dll");
auto hModule = LoadLibrary(dllname);
if (hModule == nullptr)
{
throw gcnew CNTKException(System::String::Format("Cannot find library: {0}", gcnew String(dllname)));
}
try
{
msclr::interop::marshal_context context;
const std::string func = context.marshal_as<std::string>(funcName);
auto procAddress = GetProcAddress(hModule, func.c_str());
auto getEvalProc = (GetEvalProc<ElemType>)procAddress;
pin_ptr <IEvaluateModel<ElemType>*> p_eval = &m_eval;
getEvalProc(p_eval);
GetEval<ElemType>(p_eval);
}
catch (const exception& ex)
{
@ -248,7 +227,7 @@ public:
try
{
std::vector<shared_ptr<std::vector<ElemType>>> sharedOutputVectors;
int outputSize = GetNodeDimensions(NodeGroup::nodeOutput)[outputKey];
int outputSize = GetNodeDimensions(NodeGroup::Output)[outputKey];
List<ElemType>^ outputs = gcnew List<ElemType>(outputSize);
for (int i = 0; i < outputSize; i++)
@ -394,7 +373,7 @@ public:
/// <returns>Results for requested layer</returns>
List<ElemType>^ Evaluate(Dictionary<String^, List<ElemType>^>^ inputs, String^ outputKey)
{
auto outDims = GetNodeDimensions(NodeGroup::nodeOutput);
auto outDims = GetNodeDimensions(NodeGroup::Output);
int outputSize = outDims[outputKey];
List<ElemType>^ outputs = gcnew List<ElemType>(outputSize);
@ -556,100 +535,6 @@ public:
}
};
[Serializable]
public ref class CNTKException : Exception, ISerializable
{
public:
CNTKException() : Exception()
{}
CNTKException(String^ message) : Exception(message)
{}
CNTKException(String^ message, String^ callstack) : Exception(message), NativeCallStack(callstack)
{}
const String^ NativeCallStack;
[System::Security::Permissions::SecurityPermissionAttribute
(System::Security::Permissions::SecurityAction::LinkDemand,
Flags = System::Security::Permissions::SecurityPermissionFlag::SerializationFormatter)]
virtual void GetObjectData(SerializationInfo^ info, StreamingContext context) override
{
Exception::GetObjectData(info, context);
}
protected:
CNTKException(SerializationInfo^ info, StreamingContext context) : Exception(info, context)
{}
};
[Serializable]
public ref class CNTKRuntimeException : CNTKException
{
public:
CNTKRuntimeException() : CNTKException()
{}
CNTKRuntimeException(String^ message, String^ callstack) : CNTKException(message, callstack)
{}
protected:
CNTKRuntimeException(SerializationInfo^ info, StreamingContext context) : CNTKException(info, context)
{}
};
[Serializable]
public ref class CNTKLogicErrorException : CNTKException
{
public:
CNTKLogicErrorException() : CNTKException()
{}
CNTKLogicErrorException(String^ message, String^ callstack) : CNTKException(message, callstack)
{}
protected:
CNTKLogicErrorException(SerializationInfo^ info, StreamingContext context) : CNTKException(info, context)
{}
};
[Serializable]
public ref class CNTKInvalidArgumentException : CNTKException
{
public:
CNTKInvalidArgumentException() : CNTKException()
{}
CNTKInvalidArgumentException(String^ message, String^ callstack) : CNTKException(message, callstack)
{}
protected:
CNTKInvalidArgumentException(SerializationInfo^ info, StreamingContext context) : CNTKException(info, context)
{}
};
[Serializable]
public ref class CNTKBadAllocException : CNTKException
{
public:
CNTKBadAllocException() : CNTKException()
{}
CNTKBadAllocException(String^ message) : CNTKException(message)
{}
protected:
CNTKBadAllocException(SerializationInfo^ info, StreamingContext context) : CNTKException(info, context)
{}
};
// This method tricks the compiler into emitting the methods of the classes
// Refer to https://msdn.microsoft.com/en-us/library/ms177213.aspx for an
// explanation to this behavior
@ -667,7 +552,7 @@ void emit()
f.CreateNetwork("", 0);
f.CreateNetwork("", nullptr);
f.CreateNetwork("", 0, nullptr);
f.GetNodeDimensions(NodeGroup::nodeSpecified);
f.GetNodeDimensions(NodeGroup::Specified);
IEvaluateModelManagedD d;
d.Init("");
@ -678,7 +563,7 @@ void emit()
d.CreateNetwork("", 0);
d.CreateNetwork("", nullptr);
d.CreateNetwork("", 0,nullptr);
d.GetNodeDimensions(NodeGroup::nodeSpecified);
d.GetNodeDimensions(NodeGroup::Specified);
// Deprecated code, hush warnings locally only
#pragma warning(push)

Просмотреть файл

@ -56,6 +56,8 @@
</ClCompile>
<Link>
<AdditionalLibraryDirectories>$(OutDir)</AdditionalLibraryDirectories>
<AdditionalDependencies>EvalDLL.lib;Common.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<DelayLoadDLLs>EvalDll.dll</DelayLoadDLLs>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(DebugBuild)">
@ -66,8 +68,6 @@
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<DelayLoadDLLs>
</DelayLoadDLLs>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(ReleaseBuild)">
@ -77,15 +77,16 @@
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<DelayLoadDLLs>
</DelayLoadDLLs>
</Link>
</ItemDefinitionGroup>
<ItemGroup>
<ClCompile Include="CNTKException.h" />
<ClCompile Include="EvalExtendedWrapper.cpp" />
<ClCompile Include="EvalWrapper.cpp" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\..\Common\Include\Eval.h" />
<ClInclude Include="EvalCommon.h" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">

Просмотреть файл

@ -16,10 +16,19 @@
<ClInclude Include="..\..\Common\Include\Eval.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="EvalCommon.h">
<Filter>Source Files</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<ClCompile Include="EvalWrapper.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="CNTKException.h">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="EvalExtendedWrapper.cpp">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
</Project>

Просмотреть файл

@ -0,0 +1,48 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#include "stdafx.h"
#include <malloc.h>
#include <xmmintrin.h>
#include <emmintrin.h>
#include <tmmintrin.h>
#include <assert.h>
#include <iostream>
#include <exception>
#include "BlockMultiplierMatrixUtil.h"
#include "BlockHandlerAVX.h"
namespace Microsoft { namespace MSR { namespace CNTK {
int BlockHandlerAVX::RowToColOffsetRewrittenA(int row, int kOffset, int blockSize, int rowsPerBlock, int origCols)
{
int rowIdx = row / rowsPerBlock;
int offsetFromBlockBeginning = row % rowsPerBlock;
int colIdx = kOffset * rowsPerBlock * blockSize + (offsetFromBlockBeginning * blockSize);
return (rowIdx * (origCols / blockSize) * rowsPerBlock * blockSize) + colIdx;
}
//col is the original column of B
//kOffset is the index of the block along the shared dimension k that we are currently multiplying against
int BlockHandlerAVX::RowToColOffsetRewrittenB(int col, int kOffset, int blockSize, int origCols)
{
return (origCols * blockSize * kOffset) + (col * blockSize);
}
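// Worked example (illustrative, not part of the original commit) with blockSize = 8, rowsPerBlock = 4:
//   RowToColOffsetRewrittenA(row = 5, kOffset = 1, origCols = k = 16):
//       rowIdx = 5/4 = 1, offsetFromBlockBeginning = 5%4 = 1,
//       colIdx = 1*4*8 + 1*8 = 40, result = 1*(16/8)*4*8 + 40 = 104.
//   RowToColOffsetRewrittenB(col = 3, kOffset = 1, origCols = n = 16):
//       16*8*1 + 3*8 = 152.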
void BlockHandlerAVX::DumpM256(__m256i dumpMe)
{
union { int32_t i[8]; __m256i y; } u;
u.y = dumpMe;
for (int i = 0; i < 8; ++i)
{
std::cout << u.i[i] << " ";
}
std::cout << std::endl;
}
}}}

Просмотреть файл

@ -0,0 +1,961 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#include "BlockMultiplierPlatform.h"
#include <immintrin.h>
#include <emmintrin.h>
#include <assert.h>
#include <cstdint>
#define FOR_CNTK
#ifdef FOR_CNTK
#include "CommonMatrix.h"
#endif
namespace Microsoft { namespace MSR { namespace CNTK {
class MATH_API BlockHandlerAVX
{
private:
//USE SSE for the blocks of 8, borrowed from BlockHandlerSSE
FORCEINLINE static void kernelsse8x4(__m128i xmmRow0, __m128i xmmRow1, __m128i xmmRow2, __m128i xmmRow3,
short* B, __m128i* return1, __m128i* return2, __m128i* return3, __m128i* return4);
FORCEINLINE static void kernelavx16x4(__m256i xmmRow0B0a, __m256i xmmRow1B0a, __m256i xmmRow2B0a, __m256i xmmRow3B0a,
short* B, __m256i* return1, __m256i* return2, __m256i * return3, __m256i* return4);
FORCEINLINE static void kernelavx32x4(
__m256i xmmRow0B0a, __m256i xmmRow0B0b,
__m256i xmmRow1B0a, __m256i xmmRow1B0b,
__m256i xmmRow2B0a, __m256i xmmRow2B0b,
__m256i xmmRow3B0a, __m256i xmmRow3B0b,
short* B, __m256i* return1, __m256i* return2, __m256i * return3, __m256i* return4);
FORCEINLINE static void kernelavx64x4(
__m256i xmmRow0B0a, __m256i xmmRow0B0b, __m256i xmmRow0B0c, __m256i xmmRow0B0d,
__m256i xmmRow1B0a, __m256i xmmRow1B0b, __m256i xmmRow1B0c, __m256i xmmRow1B0d,
__m256i xmmRow2B0a, __m256i xmmRow2B0b, __m256i xmmRow2B0c, __m256i xmmRow2B0d,
__m256i xmmRow3B0a, __m256i xmmRow3B0b, __m256i xmmRow3B0c, __m256i xmmRow3B0d,
short* B, __m256i* return1, __m256i* return2, __m256i * return3, __m256i* return4);
FORCEINLINE static void kernelavx128x4(
__m256i xmmRow0B0a, __m256i xmmRow0B0b, __m256i xmmRow0B0c, __m256i xmmRow0B0d,
__m256i xmmRow0B0e, __m256i xmmRow0B0f, __m256i xmmRow0B0g, __m256i xmmRow0B0h,
__m256i xmmRow1B0a, __m256i xmmRow1B0b, __m256i xmmRow1B0c, __m256i xmmRow1B0d,
__m256i xmmRow1B0e, __m256i xmmRow1B0f, __m256i xmmRow1B0g, __m256i xmmRow1B0h,
__m256i xmmRow2B0a, __m256i xmmRow2B0b, __m256i xmmRow2B0c, __m256i xmmRow2B0d,
__m256i xmmRow2B0e, __m256i xmmRow2B0f, __m256i xmmRow2B0g, __m256i xmmRow2B0h,
__m256i xmmRow3B0a, __m256i xmmRow3B0b, __m256i xmmRow3B0c, __m256i xmmRow3B0d,
__m256i xmmRow3B0e, __m256i xmmRow3B0f, __m256i xmmRow3B0g, __m256i xmmRow3B0h,
short* B, __m256i* return1, __m256i* return2, __m256i* return3, __m256i* return4);
FORCEINLINE static void kernelsse8x1(__m128i xmmRow0,
short* B, __m128i* return1);
FORCEINLINE static void kernelavx16x1(__m256i xmmRow0B0a,
short* B, __m256i* return1 );
FORCEINLINE static void kernelavx32x1(
__m256i xmmRow0B0a, __m256i xmmRow0B0b,
short* B, __m256i* return1);
FORCEINLINE static void kernelavx64x1(
__m256i xmmRow0B0a, __m256i xmmRow0B0b, __m256i xmmRow0B0c, __m256i xmmRow0B0d,
short* B, __m256i* return1) ;
FORCEINLINE static void kernelavx128x1(
__m256i xmmRow0B0a, __m256i xmmRow0B0b, __m256i xmmRow0B0c, __m256i xmmRow0B0d,
__m256i xmmRow0B0e, __m256i xmmRow0B0f, __m256i xmmRow0B0g, __m256i xmmRow0B0h,
short* B, __m256i* return1);
//TODO: Should these be refactored somewhere else? Any BlockHandler will need access to these functions.
//Separate class with static functions? Maybe move the Block rewriting functions as well as these to a new
//static class.
static int RowToColOffsetRewrittenB(int col, int kOffset, int blockSize, int origCols);
static int RowToColOffsetRewrittenA(int row, int kOffset, int blockSize, int rowsPerBlock, int origCols);
static void DumpM256(__m256i dumpMe);
public:
typedef __m256i VectorT;
typedef int16_t ScalarAT;
typedef int16_t ScalarBT;
typedef int32_t ScalarCT;
FORCEINLINE static void HandleBlock8x4(int currBlock, int startRow, int k, int n, short* newA, short* B,
int blockCnt, __m128i* resultStorage);
FORCEINLINE static void HandleBlock32x4(int currBlock, int startRow, int k, int n, short* newA, short* B,
int blockCnt, __m256i* resultStorage);
FORCEINLINE static void HandleBlock64x4(int currBlock, int startRow, int k, int n, short* newA, short* B,
int blockCnt, __m256i* resultStorage);
FORCEINLINE static void HandleBlock128x4(int currBlock, int startRow, int k, int n, short* newA, short* B,
int blockCnt, __m256i* resultStorage, VectorT* subtractMe);
FORCEINLINE static void HandleBlock8x1(int currBlock, int startRow, int k, int n, short* newA, short* B,
int blockCnt, __m128i* resultStorage);
FORCEINLINE static void HandleBlock16x1(int currBlock, int startRow, int k, int n, short* newA, short* B,
int blockCnt, __m256i* resultStorage);
FORCEINLINE static void HandleBlock64x1(int currBlock, int startRow, int k, int n, short* newA, short* B,
int blockCnt, __m256i* resultStorage);
FORCEINLINE static void HandleBlock128x1(int currBlock, int startRow, int k, int n, short* newA, short* B,
int blockCnt, __m256i* resultStorage, VectorT* subtractMe);
FORCEINLINE static void HandleBlock16x4(int currBlock, int startRow, int k, int n, short* newA, short* B,
int blockCnt, __m256i* resultStorage);
//FORCEINLINE static void HandleBlock128x4(int currBlock, int startRow, int m, int k, int n, short* newA, short* B,
FORCEINLINE static void HandleBlock32x1(int currBlock, int startRow, int k, int n, short* newA, short* B,
int blockCnt, __m256i* resultStorage);
static VectorT* PrepareExtraB(const ScalarBT* /*prepareMe*/, int /*k*/, int /*n*/)
{
return nullptr;
}
static void FreePreparedB(VectorT* freeMe) { freeMe; assert(nullptr == freeMe); }
};
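// Note (summary inferred from the handlers below, not stated in the original header):
// each HandleBlock<blockSize>x<rows> call processes one block of the shared dimension k
// for a strip of one or four rows of the rewritten A. It loads the A registers once, then
// loops over all n columns of the rewritten B, forming int16 products with _mm256_madd_epi16
// (or _mm_madd_epi16 for 8-wide blocks) and accumulating the int32 partial sums into
// resultStorage[RowColToOffset(row, col, n)]; the per-lane sums are presumably reduced to a
// single value per (row, col) by the calling BlockMultiplier.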
#define LOADAVX2_128x4 \
__m256i r0b0a2 = _mm256_load_si256((__m256i*)currA2); \
__m256i r0b0b2 = _mm256_load_si256((__m256i*)(currA2 + 16)); \
__m256i r0b0c2 = _mm256_load_si256((__m256i*)(currA2 + 32)); \
__m256i r0b0d2 = _mm256_load_si256((__m256i*)(currA2 + 48)); \
__m256i r0b0e2 = _mm256_load_si256((__m256i*)(currA2 + 64)); \
__m256i r0b0f2 = _mm256_load_si256((__m256i*)(currA2 + 80)); \
__m256i r0b0g2 = _mm256_load_si256((__m256i*)(currA2 + 96)); \
__m256i r0b0h2 = _mm256_load_si256((__m256i*)(currA2 + 112));\
\
__m256i r1b0a2 = _mm256_load_si256((__m256i*)(currA2 + 128));\
__m256i r1b0b2 = _mm256_load_si256((__m256i*)(currA2 + 144));\
__m256i r1b0c2 = _mm256_load_si256((__m256i*)(currA2 + 160));\
__m256i r1b0d2 = _mm256_load_si256((__m256i*)(currA2 + 176));\
__m256i r1b0e2 = _mm256_load_si256((__m256i*)(currA2 + 192));\
__m256i r1b0f2 = _mm256_load_si256((__m256i*)(currA2 + 208));\
__m256i r1b0g2 = _mm256_load_si256((__m256i*)(currA2 + 224));\
__m256i r1b0h2 = _mm256_load_si256((__m256i*)(currA2 + 240));\
\
__m256i r2b0a2 = _mm256_load_si256((__m256i*)(currA2 + 256));\
__m256i r2b0b2 = _mm256_load_si256((__m256i*)(currA2 + 272));\
__m256i r2b0c2 = _mm256_load_si256((__m256i*)(currA2 + 288));\
__m256i r2b0d2 = _mm256_load_si256((__m256i*)(currA2 + 304));\
__m256i r2b0e2 = _mm256_load_si256((__m256i*)(currA2 + 320));\
__m256i r2b0f2 = _mm256_load_si256((__m256i*)(currA2 + 336));\
__m256i r2b0g2 = _mm256_load_si256((__m256i*)(currA2 + 352));\
__m256i r2b0h2 = _mm256_load_si256((__m256i*)(currA2 + 368));\
\
__m256i r3b0a2 = _mm256_load_si256((__m256i*)(currA2 + 384));\
__m256i r3b0b2 = _mm256_load_si256((__m256i*)(currA2 + 400));\
__m256i r3b0c2 = _mm256_load_si256((__m256i*)(currA2 + 416));\
__m256i r3b0d2 = _mm256_load_si256((__m256i*)(currA2 + 432));\
__m256i r3b0e2 = _mm256_load_si256((__m256i*)(currA2 + 448));\
__m256i r3b0f2 = _mm256_load_si256((__m256i*)(currA2 + 464));\
__m256i r3b0g2 = _mm256_load_si256((__m256i*)(currA2 + 480));\
    __m256i r3b0h2 = _mm256_load_si256((__m256i*)(currA2 + 496));
#define LOADAVX2_128x1 \
__m256i r0b0a2 = _mm256_load_si256((__m256i*)currA2); \
__m256i r0b0b2 = _mm256_load_si256((__m256i*)(currA2 + 16)); \
__m256i r0b0c2 = _mm256_load_si256((__m256i*)(currA2 + 32)); \
__m256i r0b0d2 = _mm256_load_si256((__m256i*)(currA2 + 48)); \
__m256i r0b0e2 = _mm256_load_si256((__m256i*)(currA2 + 64)); \
__m256i r0b0f2 = _mm256_load_si256((__m256i*)(currA2 + 80)); \
__m256i r0b0g2 = _mm256_load_si256((__m256i*)(currA2 + 96)); \
__m256i r0b0h2 = _mm256_load_si256((__m256i*)(currA2 + 112));
#define LOADAVX_128x1 \
__m256i r0b0a = _mm256_load_si256((__m256i*)currA); \
__m256i r0b0b = _mm256_load_si256((__m256i*)(currA + 16)); \
__m256i r0b0c = _mm256_load_si256((__m256i*)(currA + 32)); \
__m256i r0b0d = _mm256_load_si256((__m256i*)(currA + 48)); \
__m256i r0b0e = _mm256_load_si256((__m256i*)(currA + 64)); \
__m256i r0b0f = _mm256_load_si256((__m256i*)(currA + 80)); \
__m256i r0b0g = _mm256_load_si256((__m256i*)(currA + 96)); \
__m256i r0b0h = _mm256_load_si256((__m256i*)(currA + 112));
#define LOADAVX_128x4 \
__m256i r0b0a = _mm256_load_si256((__m256i*)currA); \
__m256i r0b0b = _mm256_load_si256((__m256i*)(currA + 16)); \
__m256i r0b0c = _mm256_load_si256((__m256i*)(currA + 32)); \
__m256i r0b0d = _mm256_load_si256((__m256i*)(currA + 48)); \
__m256i r0b0e = _mm256_load_si256((__m256i*)(currA + 64)); \
__m256i r0b0f = _mm256_load_si256((__m256i*)(currA + 80)); \
__m256i r0b0g = _mm256_load_si256((__m256i*)(currA + 96)); \
__m256i r0b0h = _mm256_load_si256((__m256i*)(currA + 112));\
\
__m256i r1b0a = _mm256_load_si256((__m256i*)(currA + 128));\
__m256i r1b0b = _mm256_load_si256((__m256i*)(currA + 144));\
__m256i r1b0c = _mm256_load_si256((__m256i*)(currA + 160));\
__m256i r1b0d = _mm256_load_si256((__m256i*)(currA + 176));\
__m256i r1b0e = _mm256_load_si256((__m256i*)(currA + 192));\
__m256i r1b0f = _mm256_load_si256((__m256i*)(currA + 208));\
__m256i r1b0g = _mm256_load_si256((__m256i*)(currA + 224));\
__m256i r1b0h = _mm256_load_si256((__m256i*)(currA + 240));\
\
__m256i r2b0a = _mm256_load_si256((__m256i*)(currA + 256));\
__m256i r2b0b = _mm256_load_si256((__m256i*)(currA + 272));\
__m256i r2b0c = _mm256_load_si256((__m256i*)(currA + 288));\
__m256i r2b0d = _mm256_load_si256((__m256i*)(currA + 304));\
__m256i r2b0e = _mm256_load_si256((__m256i*)(currA + 320));\
__m256i r2b0f = _mm256_load_si256((__m256i*)(currA + 336));\
__m256i r2b0g = _mm256_load_si256((__m256i*)(currA + 352));\
__m256i r2b0h = _mm256_load_si256((__m256i*)(currA + 368));\
\
__m256i r3b0a = _mm256_load_si256((__m256i*)(currA + 384));\
__m256i r3b0b = _mm256_load_si256((__m256i*)(currA + 400));\
__m256i r3b0c = _mm256_load_si256((__m256i*)(currA + 416));\
__m256i r3b0d = _mm256_load_si256((__m256i*)(currA + 432));\
__m256i r3b0e = _mm256_load_si256((__m256i*)(currA + 448));\
__m256i r3b0f = _mm256_load_si256((__m256i*)(currA + 464));\
__m256i r3b0g = _mm256_load_si256((__m256i*)(currA + 480));\
    __m256i r3b0h = _mm256_load_si256((__m256i*)(currA + 496));
#define LOADAVX_64x4 \
__m256i r0b0a = _mm256_load_si256((__m256i*)currA); \
__m256i r0b0b = _mm256_load_si256((__m256i*)currA + 1); \
__m256i r0b0c = _mm256_load_si256((__m256i*)currA + 2); \
__m256i r0b0d = _mm256_load_si256((__m256i*)currA + 3); \
\
__m256i r1b0a = _mm256_load_si256((__m256i*)currA + 4);\
__m256i r1b0b = _mm256_load_si256((__m256i*)currA + 5);\
__m256i r1b0c = _mm256_load_si256((__m256i*)currA + 6);\
__m256i r1b0d = _mm256_load_si256((__m256i*)currA + 7);\
\
__m256i r2b0a = _mm256_load_si256((__m256i*)currA + 8);\
__m256i r2b0b = _mm256_load_si256((__m256i*)currA + 9);\
__m256i r2b0c = _mm256_load_si256((__m256i*)currA + 10);\
__m256i r2b0d = _mm256_load_si256((__m256i*)currA + 11);\
\
__m256i r3b0a = _mm256_load_si256((__m256i*)currA + 12);\
__m256i r3b0b = _mm256_load_si256((__m256i*)currA + 13);\
__m256i r3b0c = _mm256_load_si256((__m256i*)currA + 14);\
__m256i r3b0d = _mm256_load_si256((__m256i*)currA + 15);
#define LOADAVX_64x1 \
__m256i r0b0a = _mm256_load_si256((__m256i*)currA); \
__m256i r0b0b = _mm256_load_si256((__m256i*)currA + 1); \
__m256i r0b0c = _mm256_load_si256((__m256i*)currA + 2); \
__m256i r0b0d = _mm256_load_si256((__m256i*)currA + 3);
#define LOADAVX_32x4 \
__m256i r0b0a = _mm256_load_si256((__m256i*)currA); \
__m256i r0b0b = _mm256_load_si256((__m256i*)currA + 1); \
\
__m256i r1b0a = _mm256_load_si256((__m256i*)currA + 2);\
__m256i r1b0b = _mm256_load_si256((__m256i*)currA + 3);\
\
__m256i r2b0a = _mm256_load_si256((__m256i*)currA + 4);\
__m256i r2b0b = _mm256_load_si256((__m256i*)currA + 5);\
\
__m256i r3b0a = _mm256_load_si256((__m256i*)currA + 6);\
    __m256i r3b0b = _mm256_load_si256((__m256i*)currA + 7);
#define LOADAVX_32x1 \
__m256i r0b0a = _mm256_load_si256((__m256i*)currA); \
__m256i r0b0b = _mm256_load_si256((__m256i*)currA + 1);
#define LOADAVX_16x4 \
__m256i r0b0a = _mm256_load_si256((__m256i*)currA); \
__m256i r1b0a = _mm256_load_si256((__m256i*)currA + 1);\
__m256i r2b0a = _mm256_load_si256((__m256i*)currA + 2);\
    __m256i r3b0a = _mm256_load_si256((__m256i*)currA + 3);
#define LOADAVX_16x1 \
__m256i r0b0a = _mm256_load_si256((__m256i*)currA);
#define LOAD_8x4 \
__m128i r0b0a = _mm_load_si128((__m128i*)currA);\
__m128i r1b0a = _mm_load_si128((__m128i*)currA + 1);\
__m128i r2b0a = _mm_load_si128((__m128i*)currA + 2);\
    __m128i r3b0a = _mm_load_si128((__m128i*)currA + 3);
#define LOAD_8x1 \
__m128i r0b0a = _mm_load_si128((__m128i*)currA);
FORCEINLINE void BlockHandlerAVX::HandleBlock8x4(int currBlock, int startRow, int k, int n, short* newA, short* B,
int blockCnt, __m128i* resultStorage)
{
blockCnt; //warning 4100
int aOffset = RowToColOffsetRewrittenA(startRow, currBlock, 8, 4, k);
short* currA = &newA[aOffset];
LOAD_8x4;
for (int c = 0; c < n; ++c)
{
short* currB = &B[RowToColOffsetRewrittenB(c, currBlock, 8, n)];
__m128i accum1 = _mm_set_epi32(0, 0, 0, 0);
__m128i accum2 = _mm_set_epi32(0, 0, 0, 0);
__m128i accum3 = _mm_set_epi32(0, 0, 0, 0);
__m128i accum4 = _mm_set_epi32(0, 0, 0, 0);
kernelsse8x4(r0b0a, r1b0a, r2b0a, r3b0a,
currB, &accum1, &accum2, &accum3, &accum4);
resultStorage[RowColToOffset(0, c, n)] = _mm_add_epi32(resultStorage[RowColToOffset(0, c, n)], accum1);
resultStorage[RowColToOffset(1, c, n)] = _mm_add_epi32(resultStorage[RowColToOffset(1, c, n)], accum2);
resultStorage[RowColToOffset(2, c, n)] = _mm_add_epi32(resultStorage[RowColToOffset(2, c, n)], accum3);
resultStorage[RowColToOffset(3, c, n)] = _mm_add_epi32(resultStorage[RowColToOffset(3, c, n)], accum4);
}
}
FORCEINLINE void BlockHandlerAVX::HandleBlock8x1(int currBlock, int startRow, int k, int n, short* newA, short* B,
int /*blockCnt*/, __m128i* resultStorage)
{
int aOffset = RowToColOffsetRewrittenA(startRow, currBlock, 8, 4, k);
short* currA = &newA[aOffset];
LOAD_8x1;
for (int c = 0; c < n; ++c)
{
short* currB = &B[RowToColOffsetRewrittenB(c, currBlock, 8, n)];
__m128i accum1 = _mm_set_epi32(0, 0, 0, 0);
kernelsse8x1(r0b0a,
currB, &accum1);
resultStorage[RowColToOffset(0, c, n)] = _mm_add_epi32(resultStorage[RowColToOffset(0, c, n)], accum1);
}
}
FORCEINLINE void BlockHandlerAVX::HandleBlock16x4(int currBlock, int startRow, int k, int n, short* newA, short* B,
int /*blockCnt*/, __m256i* resultStorage)
{
int aOffset = RowToColOffsetRewrittenA(startRow, currBlock, 16, 4, k);
short* currA = &newA[aOffset];
LOADAVX_16x4;
//#pragma omp parallel for
for (int c = 0; c < n; ++c)
{
short* currB = &B[RowToColOffsetRewrittenB(c, currBlock, 16, n)];
//The gain comes when we have all the row values loaded up
//together and we multiply them all times each column, saving m_rowsPerBlock column
//loads.
__m256i accum1 = _mm256_set1_epi16(0);
__m256i accum2 = _mm256_set1_epi16(0);
__m256i accum3 = _mm256_set1_epi16(0);
__m256i accum4 = _mm256_set1_epi16(0);
kernelavx16x4(r0b0a, r1b0a, r2b0a, r3b0a,
currB, &accum1, &accum2, &accum3, &accum4);
resultStorage[RowColToOffset(0, c, n)] = _mm256_add_epi32(resultStorage[RowColToOffset(0, c, n)], accum1);
resultStorage[RowColToOffset(1, c, n)] = _mm256_add_epi32(resultStorage[RowColToOffset(1, c, n)], accum2);
resultStorage[RowColToOffset(2, c, n)] = _mm256_add_epi32(resultStorage[RowColToOffset(2, c, n)], accum3);
resultStorage[RowColToOffset(3, c, n)] = _mm256_add_epi32(resultStorage[RowColToOffset(3, c, n)], accum4);
}
}
FORCEINLINE void BlockHandlerAVX::HandleBlock16x1(int currBlock, int startRow, int k, int n, short* newA, short* B,
int /*blockCnt*/, __m256i* resultStorage)
{
int aOffset = RowToColOffsetRewrittenA(startRow, currBlock, 16, 1, k);
short* currA = &newA[aOffset];
LOADAVX_16x1;
//#pragma omp parallel for
for (int c = 0; c < n; ++c)
{
short* currB = &B[RowToColOffsetRewrittenB(c, currBlock, 16, n)];
//The gain comes when we have all the row values loaded up
//together and we multiply them all times each column, saving m_rowsPerBlock column
//loads.
__m256i accum1 = _mm256_set1_epi16(0);
kernelavx16x1(r0b0a, currB, &accum1);
resultStorage[RowColToOffset(0, c, n)] = _mm256_add_epi32(resultStorage[RowColToOffset(0, c, n)], accum1);
}
}
FORCEINLINE void BlockHandlerAVX::HandleBlock32x4(int currBlock, int startRow, int k, int n, short* newA, short* B,
int /*blockCnt*/, __m256i* resultStorage)
{
int aOffset = RowToColOffsetRewrittenA(startRow, currBlock, 32, 4, k);
short* currA = &newA[aOffset];
LOADAVX_32x4;
//#pragma omp parallel for
for (int c = 0; c < n; ++c)
{
short* currB = &B[RowToColOffsetRewrittenB(c, currBlock, 32, n)];
//The gain comes when we have all the row values loaded up
//together and we multiply them all times each column, saving m_rowsPerBlock column
//loads.
__m256i accum1 = _mm256_set1_epi16(0);
__m256i accum2 = _mm256_set1_epi16(0);
__m256i accum3 = _mm256_set1_epi16(0);
__m256i accum4 = _mm256_set1_epi16(0);
kernelavx32x4(
r0b0a, r0b0b,
r1b0a, r1b0b,
r2b0a, r2b0b,
r3b0a, r3b0b,
currB, &accum1, &accum2, &accum3, &accum4);
resultStorage[RowColToOffset(0, c, n)] = _mm256_add_epi32( resultStorage[RowColToOffset(0, c, n)], accum1);
resultStorage[RowColToOffset(1, c, n)] = _mm256_add_epi32( resultStorage[RowColToOffset(1, c, n)], accum2);
resultStorage[RowColToOffset(2, c, n)] = _mm256_add_epi32( resultStorage[RowColToOffset(2, c, n)], accum3);
resultStorage[RowColToOffset(3, c, n)] = _mm256_add_epi32( resultStorage[RowColToOffset(3, c, n)], accum4);
}
}
FORCEINLINE void BlockHandlerAVX::HandleBlock32x1(int currBlock, int startRow, int k, int n, short* newA, short* B,
int /*blockCnt*/, __m256i* resultStorage)
{
int aOffset = RowToColOffsetRewrittenA(startRow, currBlock, 32, 1, k);
short* currA = &newA[aOffset];
LOADAVX_32x1;
//#pragma omp parallel for
for (int c = 0; c < n; ++c)
{
short* currB = &B[RowToColOffsetRewrittenB(c, currBlock, 32, n)];
__m256i accum1 = _mm256_set1_epi16(0);
kernelavx32x1(
r0b0a, r0b0b, currB, &accum1);
resultStorage[RowColToOffset(0, c, n)] = _mm256_add_epi32( resultStorage[RowColToOffset(0, c, n)], accum1);
}
}
FORCEINLINE void BlockHandlerAVX::HandleBlock64x4(int currBlock, int startRow, int k, int n, short* newA, short* B,
int /*blockCnt*/, __m256i* resultStorage)
{
int aOffset = RowToColOffsetRewrittenA(startRow, currBlock, 64, 4, k);
short* currA = &newA[aOffset];
LOADAVX_64x4;
//#pragma omp parallel for
for (int c = 0; c < n; ++c)
{
short* currB = &B[RowToColOffsetRewrittenB(c, currBlock, 64, n)];
//The gain comes when we have all the row values loaded up
//together and we multiply them all times each column, saving m_rowsPerBlock column
//loads.
__m256i accum1 = _mm256_set1_epi16(0);
__m256i accum2 = _mm256_set1_epi16(0);
__m256i accum3 = _mm256_set1_epi16(0);
__m256i accum4 = _mm256_set1_epi16(0);
kernelavx64x4(
r0b0a, r0b0b, r0b0c, r0b0d,
r1b0a, r1b0b, r1b0c, r1b0d,
r2b0a, r2b0b, r2b0c, r2b0d,
r3b0a, r3b0b, r3b0c, r3b0d,
currB, &accum1, &accum2, &accum3, &accum4);
resultStorage[RowColToOffset(0, c, n)] = _mm256_add_epi32( resultStorage[RowColToOffset(0, c, n)], accum1);
resultStorage[RowColToOffset(1, c, n)] = _mm256_add_epi32( resultStorage[RowColToOffset(1, c, n)], accum2);
resultStorage[RowColToOffset(2, c, n)] = _mm256_add_epi32( resultStorage[RowColToOffset(2, c, n)], accum3);
resultStorage[RowColToOffset(3, c, n)] = _mm256_add_epi32( resultStorage[RowColToOffset(3, c, n)], accum4);
}
}
FORCEINLINE void BlockHandlerAVX::HandleBlock64x1(int currBlock, int startRow, int k, int n, short* newA, short* B,
int /*blockCnt*/, __m256i* resultStorage)
{
int aOffset = RowToColOffsetRewrittenA(startRow, currBlock, 64, 4, k);
short* currA = &newA[aOffset];
LOADAVX_64x1;
//#pragma omp parallel for
for (int c = 0; c < n; ++c)
{
short* currB = &B[RowToColOffsetRewrittenB(c, currBlock, 64, n)];
//The gain comes when we have all the row values loaded up
//together and we multiply them all times each column, saving m_rowsPerBlock column
//loads.
__m256i accum1 = _mm256_set1_epi16(0);
kernelavx64x1(
r0b0a, r0b0b, r0b0c, r0b0d,
currB, &accum1);
resultStorage[RowColToOffset(0, c, n)] = _mm256_add_epi32(resultStorage[RowColToOffset(0, c, n)], accum1);
}
}
FORCEINLINE void BlockHandlerAVX::HandleBlock128x4(int currBlock, int startRow, int k, int n, short* newA, short* B,
int blockCnt, __m256i* resultStorage, VectorT* /*subtractMe*/)
{
int aOffset = RowToColOffsetRewrittenA(startRow, currBlock, 128, 4, k);
int aOffset2 = RowToColOffsetRewrittenA(startRow, currBlock + 1, 128, 4, k);
short* currA = &newA[aOffset];
short* currA2 = &newA[aOffset2];
LOADAVX_128x4;
LOADAVX2_128x4;
//#pragma omp parallel for
for (int c = 0; c < n; ++c)
{
short* currB = &B[RowToColOffsetRewrittenB(c, currBlock, 128, n)];
short* currB2 = &B[RowToColOffsetRewrittenB(c, currBlock + 1, 128, n)];
//The gain comes when we have all the row values loaded up
//together and we multiply them all times each column, saving m_rowsPerBlock column
//loads.
__m256i accum1 = _mm256_set1_epi16(0);
__m256i accum2 = _mm256_set1_epi16(0);
__m256i accum3 = _mm256_set1_epi16(0);
__m256i accum4 = _mm256_set1_epi16(0);
__m256i accum5 = _mm256_set1_epi16(0);
__m256i accum6 = _mm256_set1_epi16(0);
__m256i accum7 = _mm256_set1_epi16(0);
__m256i accum8 = _mm256_set1_epi16(0);
kernelavx128x4(
r0b0a, r0b0b, r0b0c, r0b0d, r0b0e, r0b0f, r0b0g, r0b0h,
r1b0a, r1b0b, r1b0c, r1b0d, r1b0e, r1b0f, r1b0g, r1b0h,
r2b0a, r2b0b, r2b0c, r2b0d, r2b0e, r2b0f, r2b0g, r2b0h,
r3b0a, r3b0b, r3b0c, r3b0d, r3b0e, r3b0f, r3b0g, r3b0h,
currB, &accum1, &accum2, &accum3, &accum4);
if (blockCnt > 1)
{
kernelavx128x4(
r0b0a2, r0b0b2, r0b0c2, r0b0d2, r0b0e2, r0b0f2, r0b0g2, r0b0h2,
r1b0a2, r1b0b2, r1b0c2, r1b0d2, r1b0e2, r1b0f2, r1b0g2, r1b0h2,
r2b0a2, r2b0b2, r2b0c2, r2b0d2, r2b0e2, r2b0f2, r2b0g2, r2b0h2,
r3b0a2, r3b0b2, r3b0c2, r3b0d2, r3b0e2, r3b0f2, r3b0g2, r3b0h2,
currB2, &accum5, &accum6, &accum7, &accum8);
}
resultStorage[RowColToOffset(0, c, n)] = _mm256_add_epi32( resultStorage[RowColToOffset(0, c, n)], _mm256_add_epi32(accum1, accum5));
resultStorage[RowColToOffset(1, c, n)] = _mm256_add_epi32( resultStorage[RowColToOffset(1, c, n)], _mm256_add_epi32(accum2, accum6));
resultStorage[RowColToOffset(2, c, n)] = _mm256_add_epi32( resultStorage[RowColToOffset(2, c, n)], _mm256_add_epi32(accum3, accum7));
resultStorage[RowColToOffset(3, c, n)] = _mm256_add_epi32( resultStorage[RowColToOffset(3, c, n)], _mm256_add_epi32(accum4, accum8));
}
}
FORCEINLINE void BlockHandlerAVX::HandleBlock128x1(int currBlock, int startRow, int k, int n, short* newA, short* B,
int blockCnt, __m256i* resultStorage, VectorT* /*subtractMe*/)
{
int aOffset = RowToColOffsetRewrittenA(startRow, currBlock, 128, 4, k);
int aOffset2 = RowToColOffsetRewrittenA(startRow, currBlock + 1, 128, 4, k);
short* currA = &newA[aOffset];
short* currA2 = &newA[aOffset2];
LOADAVX_128x1;
LOADAVX2_128x1;
//#pragma omp parallel for
for (int c = 0; c < n; ++c)
{
short* currB = &B[RowToColOffsetRewrittenB(c, currBlock, 128, n)];
short* currB2 = &B[RowToColOffsetRewrittenB(c, currBlock + 1, 128, n)];
//The gain comes when we have all the row values loaded up
//together and we multiply them all times each column, saving m_rowsPerBlock column
//loads.
__m256i accum1 = _mm256_set1_epi16(0);
__m256i accum2 = _mm256_set1_epi16(0);
kernelavx128x1(
r0b0a, r0b0b, r0b0c, r0b0d, r0b0e, r0b0f, r0b0g, r0b0h,
currB, &accum1);
if (blockCnt > 1)
{
kernelavx128x1(
r0b0a2, r0b0b2, r0b0c2, r0b0d2, r0b0e2, r0b0f2, r0b0g2, r0b0h2,
                    currB2, &accum2);
}
resultStorage[RowColToOffset(0, c, n)] = _mm256_add_epi32( resultStorage[RowColToOffset(0, c, n)], _mm256_add_epi32(accum1, accum2));
}
}
FORCEINLINE void BlockHandlerAVX::kernelsse8x1(__m128i xmmRow0,
short* B, __m128i* return1)
{
__m128i xmmCol0 = _mm_load_si128((__m128i*)B);
__m128i result1 = _mm_madd_epi16(xmmRow0, xmmCol0);
*return1 = result1;
}
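// Note (illustrative): _mm_madd_epi16 multiplies the eight int16 lanes of the row and
// column registers pairwise and adds adjacent products into four int32 lanes, i.e. for
// row = [a0..a7] and col = [b0..b7] it yields
//   { a0*b0 + a1*b1, a2*b2 + a3*b3, a4*b4 + a5*b5, a6*b6 + a7*b7 },
// so the kernels in this header return per-lane partial dot products that still need a
// final horizontal reduction per (row, column) pair.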
FORCEINLINE void BlockHandlerAVX::kernelsse8x4(__m128i xmmRow0, __m128i xmmRow1, __m128i xmmRow2, __m128i xmmRow3,
short* B, __m128i* return1, __m128i* return2, __m128i* return3, __m128i* return4)
{
__m128i xmmCol0 = _mm_load_si128((__m128i*)B);
__m128i result1 = _mm_madd_epi16(xmmRow0, xmmCol0);
__m128i result2 = _mm_madd_epi16(xmmRow1, xmmCol0);
__m128i result3 = _mm_madd_epi16(xmmRow2, xmmCol0);
__m128i result4 = _mm_madd_epi16(xmmRow3, xmmCol0);
*return1 = result1;
*return2 = result2;
*return3 = result3;
*return4 = result4;
}
FORCEINLINE void BlockHandlerAVX::kernelavx16x1(__m256i xmmRow0B0a,
short* B, __m256i* return1)
{
__m256i xmmCol0B0a = _mm256_load_si256((__m256i*)B);
//Result for row 0
//Nomenclature:
//r0b0axc0b0a means "Row zero block zero part A times column zero block zero part A. (Blocks > 8 take up > 1 __m256i each (xmm registers))
__m256i r0b0axc0b0a = _mm256_madd_epi16(xmmRow0B0a, xmmCol0B0a);
*return1 = r0b0axc0b0a;
}
FORCEINLINE void BlockHandlerAVX::kernelavx16x4(__m256i xmmRow0B0a, __m256i xmmRow1B0a, __m256i xmmRow2B0a, __m256i xmmRow3B0a,
short* B, __m256i* return1, __m256i* return2, __m256i * return3, __m256i* return4)
{
__m256i xmmCol0B0a = _mm256_load_si256((__m256i*)B);
//Result for row 0
//Nomenclature:
//r0b0axc0b0a means "Row zero block zero part A times column zero block zero part A. (Blocks > 8 take up > 1 __m256i each (xmm registers))
__m256i r0b0axc0b0a = _mm256_madd_epi16(xmmRow0B0a, xmmCol0B0a);
//Result for row 1
__m256i r1b0axc0b0a = _mm256_madd_epi16(xmmRow1B0a, xmmCol0B0a);
//Result for row 2
__m256i r2b0axc0b0a = _mm256_madd_epi16(xmmRow2B0a, xmmCol0B0a);
//Result for row 3
__m256i r3b0axc0b0a = _mm256_madd_epi16(xmmRow3B0a, xmmCol0B0a);
*return1 = r0b0axc0b0a;
*return2 = r1b0axc0b0a;
*return3 = r2b0axc0b0a;
*return4 = r3b0axc0b0a;
}
FORCEINLINE void BlockHandlerAVX::kernelavx32x1(
__m256i xmmRow0B0a, __m256i xmmRow0B0b,
short* B, __m256i* return1)
{
__m256i xmmCol0B0a = _mm256_load_si256((__m256i*)B);
__m256i xmmCol0B0b = _mm256_load_si256((__m256i*)B + 1);
//Result for row 0
//Nomenclature:
//r0b0axc0b0a means "Row zero block zero part A times column zero block zero part A. (Blocks > 8 take up > 1 __m256i each (xmm registers))
__m256i r0b0axc0b0a = _mm256_madd_epi16(xmmRow0B0a, xmmCol0B0a);
__m256i r0b0bxc0b0b = _mm256_madd_epi16(xmmRow0B0b, xmmCol0B0b);
__m256i result1a = _mm256_add_epi32(r0b0axc0b0a, r0b0bxc0b0b);
*return1 = result1a;
}
FORCEINLINE void BlockHandlerAVX::kernelavx32x4(
__m256i xmmRow0B0a, __m256i xmmRow0B0b,
__m256i xmmRow1B0a, __m256i xmmRow1B0b,
__m256i xmmRow2B0a, __m256i xmmRow2B0b,
__m256i xmmRow3B0a, __m256i xmmRow3B0b,
short* B, __m256i* return1, __m256i* return2, __m256i * return3, __m256i* return4)
{
__m256i xmmCol0B0a = _mm256_load_si256((__m256i*)B);
__m256i xmmCol0B0b = _mm256_load_si256((__m256i*)B + 1);
//Result for row 0
//Nomenclature:
//r0b0axc0b0a means "Row zero block zero part A times column zero block zero part A. (Blocks > 8 take up > 1 __m256i each (xmm registers))
__m256i r0b0axc0b0a = _mm256_madd_epi16(xmmRow0B0a, xmmCol0B0a);
__m256i r0b0bxc0b0b = _mm256_madd_epi16(xmmRow0B0b, xmmCol0B0b);
__m256i result1a = _mm256_add_epi32(r0b0axc0b0a, r0b0bxc0b0b);
//Result for row 1
__m256i r1b0axc0b0a = _mm256_madd_epi16(xmmRow1B0a, xmmCol0B0a);
__m256i r1b0bxc0b0b = _mm256_madd_epi16(xmmRow1B0b, xmmCol0B0b);
__m256i result2a = _mm256_add_epi32(r1b0axc0b0a, r1b0bxc0b0b);
//Result for row 2
__m256i r2b0axc0b0a = _mm256_madd_epi16(xmmRow2B0a, xmmCol0B0a);
__m256i r2b0bxc0b0b = _mm256_madd_epi16(xmmRow2B0b, xmmCol0B0b);
__m256i result3a = _mm256_add_epi32(r2b0axc0b0a, r2b0bxc0b0b);
//Result for row 3
__m256i r3b0axc0b0a = _mm256_madd_epi16(xmmRow3B0a, xmmCol0B0a);
__m256i r3b0bxc0b0b = _mm256_madd_epi16(xmmRow3B0b, xmmCol0B0b);
__m256i result4a = _mm256_add_epi32(r3b0axc0b0a, r3b0bxc0b0b);
*return1 = result1a;
*return2 = result2a;
*return3 = result3a;
*return4 = result4a;
}
FORCEINLINE void BlockHandlerAVX::kernelavx64x1(
__m256i xmmRow0B0a, __m256i xmmRow0B0b, __m256i xmmRow0B0c, __m256i xmmRow0B0d,
short* B, __m256i* return1)
{
__m256i xmmCol0B0a = _mm256_load_si256((__m256i*)B);
__m256i xmmCol0B0b = _mm256_load_si256((__m256i*)B + 1);
__m256i xmmCol0B0c = _mm256_load_si256((__m256i*)B + 2);
__m256i xmmCol0B0d = _mm256_load_si256((__m256i*)B + 3);
__m256i r0b0axc0b0a = _mm256_madd_epi16(xmmRow0B0a, xmmCol0B0a);
__m256i r0b0bxc0b0b = _mm256_madd_epi16(xmmRow0B0b, xmmCol0B0b);
__m256i r0b0cxc0b0c = _mm256_madd_epi16(xmmRow0B0c, xmmCol0B0c);
__m256i r0b0dxc0b0d = _mm256_madd_epi16(xmmRow0B0d, xmmCol0B0d);
__m256i result1a = _mm256_add_epi32(r0b0axc0b0a, r0b0bxc0b0b);
__m256i result1b = _mm256_add_epi32(r0b0cxc0b0c, r0b0dxc0b0d);
__m256i result1ab = _mm256_add_epi32(result1a, result1b);
*return1 = result1ab;
//std::cout << "Returning " << u.i[0] << " + " << u.i[4] << "(" << u.i[0] + u.i[4] << ") for first row" << std::endl;
}
FORCEINLINE void BlockHandlerAVX::kernelavx64x4(
__m256i xmmRow0B0a, __m256i xmmRow0B0b, __m256i xmmRow0B0c, __m256i xmmRow0B0d,
__m256i xmmRow1B0a, __m256i xmmRow1B0b, __m256i xmmRow1B0c, __m256i xmmRow1B0d,
__m256i xmmRow2B0a, __m256i xmmRow2B0b, __m256i xmmRow2B0c, __m256i xmmRow2B0d,
__m256i xmmRow3B0a, __m256i xmmRow3B0b, __m256i xmmRow3B0c, __m256i xmmRow3B0d,
short* B, __m256i* return1, __m256i* return2, __m256i * return3, __m256i* return4)
{
__m256i xmmCol0B0a = _mm256_load_si256((__m256i*)B);
__m256i xmmCol0B0b = _mm256_load_si256((__m256i*)B + 1);
__m256i xmmCol0B0c = _mm256_load_si256((__m256i*)B + 2);
__m256i xmmCol0B0d = _mm256_load_si256((__m256i*)B + 3);
//Result for row 0
//Nomenclature:
//r0b0axc0b0a means "Row zero block zero part A times column zero block zero part A. (Blocks > 8 take up > 1 __m256i each (xmm registers))
__m256i r0b0axc0b0a = _mm256_madd_epi16(xmmRow0B0a, xmmCol0B0a);
__m256i r0b0bxc0b0b = _mm256_madd_epi16(xmmRow0B0b, xmmCol0B0b);
__m256i r0b0cxc0b0c = _mm256_madd_epi16(xmmRow0B0c, xmmCol0B0c);
__m256i r0b0dxc0b0d = _mm256_madd_epi16(xmmRow0B0d, xmmCol0B0d);
__m256i result1a = _mm256_add_epi32(r0b0axc0b0a, r0b0bxc0b0b);
__m256i result1b = _mm256_add_epi32(r0b0cxc0b0c, r0b0dxc0b0d);
__m256i result1ab = _mm256_add_epi32(result1a, result1b);
//Result for row 1
__m256i r1b0axc0b0a = _mm256_madd_epi16(xmmRow1B0a, xmmCol0B0a);
__m256i r1b0bxc0b0b = _mm256_madd_epi16(xmmRow1B0b, xmmCol0B0b);
__m256i r1b0cxc0b0c = _mm256_madd_epi16(xmmRow1B0c, xmmCol0B0c);
__m256i r1b0dxc0b0d = _mm256_madd_epi16(xmmRow1B0d, xmmCol0B0d);
__m256i result2a = _mm256_add_epi32(r1b0axc0b0a, r1b0bxc0b0b);
__m256i result2b = _mm256_add_epi32(r1b0cxc0b0c, r1b0dxc0b0d);
__m256i result2ab = _mm256_add_epi32(result2a, result2b);
//Result for row 2
__m256i r2b0axc0b0a = _mm256_madd_epi16(xmmRow2B0a, xmmCol0B0a);
__m256i r2b0bxc0b0b = _mm256_madd_epi16(xmmRow2B0b, xmmCol0B0b);
__m256i r2b0cxc0b0c = _mm256_madd_epi16(xmmRow2B0c, xmmCol0B0c);
__m256i r2b0dxc0b0d = _mm256_madd_epi16(xmmRow2B0d, xmmCol0B0d);
__m256i result3a = _mm256_add_epi32(r2b0axc0b0a, r2b0bxc0b0b);
__m256i result3b = _mm256_add_epi32(r2b0cxc0b0c, r2b0dxc0b0d);
__m256i result3ab = _mm256_add_epi32(result3a, result3b);
//Result for row 3
__m256i r3b0axc0b0a = _mm256_madd_epi16(xmmRow3B0a, xmmCol0B0a);
__m256i r3b0bxc0b0b = _mm256_madd_epi16(xmmRow3B0b, xmmCol0B0b);
__m256i r3b0cxc0b0c = _mm256_madd_epi16(xmmRow3B0c, xmmCol0B0c);
__m256i r3b0dxc0b0d = _mm256_madd_epi16(xmmRow3B0d, xmmCol0B0d);
__m256i result4a = _mm256_add_epi32(r3b0axc0b0a, r3b0bxc0b0b);
__m256i result4b = _mm256_add_epi32(r3b0cxc0b0c, r3b0dxc0b0d);
__m256i result4ab = _mm256_add_epi32(result4a, result4b);
*return1 = result1ab;
*return2 = result2ab;
*return3 = result3ab;
*return4 = result4ab;
}
FORCEINLINE void BlockHandlerAVX::kernelavx128x1(
__m256i xmmRow0B0a, __m256i xmmRow0B0b, __m256i xmmRow0B0c, __m256i xmmRow0B0d,
__m256i xmmRow0B0e, __m256i xmmRow0B0f, __m256i xmmRow0B0g, __m256i xmmRow0B0h,
short* B, __m256i* return1)
{
__m256i xmmCol0B0a = _mm256_load_si256((__m256i*)B);
__m256i xmmCol0B0b = _mm256_load_si256((__m256i*)(B + 16));
__m256i xmmCol0B0c = _mm256_load_si256((__m256i*)(B + 32));
__m256i xmmCol0B0d = _mm256_load_si256((__m256i*)(B + 48));
__m256i xmmCol0B0e = _mm256_load_si256((__m256i*)(B + 64));
__m256i xmmCol0B0f = _mm256_load_si256((__m256i*)(B + 80));
__m256i xmmCol0B0g = _mm256_load_si256((__m256i*)(B + 96));
__m256i xmmCol0B0h = _mm256_load_si256((__m256i*)(B + 112));
//Result for row 0
//Nomenclature:
//r0b0axc0b0a means "Row zero block zero part A times column zero block zero part A. (Blocks > 8 take up > 1 __m256i each (xmm registers))
__m256i r0b0axc0b0a = _mm256_madd_epi16(xmmRow0B0a, xmmCol0B0a);
__m256i r0b0bxc0b0b = _mm256_madd_epi16(xmmRow0B0b, xmmCol0B0b);
__m256i r0b0cxc0b0c = _mm256_madd_epi16(xmmRow0B0c, xmmCol0B0c);
__m256i r0b0dxc0b0d = _mm256_madd_epi16(xmmRow0B0d, xmmCol0B0d);
__m256i r0b0exc0b0e = _mm256_madd_epi16(xmmRow0B0e, xmmCol0B0e);
__m256i r0b0fxc0b0f = _mm256_madd_epi16(xmmRow0B0f, xmmCol0B0f);
__m256i r0b0gxc0b0g = _mm256_madd_epi16(xmmRow0B0g, xmmCol0B0g);
__m256i r0b0hxc0b0h = _mm256_madd_epi16(xmmRow0B0h, xmmCol0B0h);
__m256i result1a = _mm256_add_epi32(r0b0axc0b0a, r0b0bxc0b0b);
__m256i result1b = _mm256_add_epi32(r0b0cxc0b0c, r0b0dxc0b0d);
__m256i result1c = _mm256_add_epi32(r0b0exc0b0e, r0b0fxc0b0f);
__m256i result1d = _mm256_add_epi32(r0b0gxc0b0g, r0b0hxc0b0h);
__m256i result1ab = _mm256_add_epi32(result1a, result1b);
__m256i result1cd = _mm256_add_epi32(result1c, result1d);
__m256i result1abcd = _mm256_add_epi32(result1ab, result1cd);
*return1 = result1abcd;
//std::cout << "Returning " << u.i[0] << " + " << u.i[4] << "(" << u.i[0] + u.i[4] << ") for first row" << std::endl;
}
FORCEINLINE void BlockHandlerAVX::kernelavx128x4(
__m256i xmmRow0B0a, __m256i xmmRow0B0b, __m256i xmmRow0B0c, __m256i xmmRow0B0d,
__m256i xmmRow0B0e, __m256i xmmRow0B0f, __m256i xmmRow0B0g, __m256i xmmRow0B0h,
__m256i xmmRow1B0a, __m256i xmmRow1B0b, __m256i xmmRow1B0c, __m256i xmmRow1B0d,
__m256i xmmRow1B0e, __m256i xmmRow1B0f, __m256i xmmRow1B0g, __m256i xmmRow1B0h,
__m256i xmmRow2B0a, __m256i xmmRow2B0b, __m256i xmmRow2B0c, __m256i xmmRow2B0d,
__m256i xmmRow2B0e, __m256i xmmRow2B0f, __m256i xmmRow2B0g, __m256i xmmRow2B0h,
__m256i xmmRow3B0a, __m256i xmmRow3B0b, __m256i xmmRow3B0c, __m256i xmmRow3B0d,
__m256i xmmRow3B0e, __m256i xmmRow3B0f, __m256i xmmRow3B0g, __m256i xmmRow3B0h,
short* B, __m256i* return1, __m256i* return2, __m256i * return3, __m256i* return4)
{
__m256i xmmCol0B0a = _mm256_load_si256((__m256i*)B);
__m256i xmmCol0B0b = _mm256_load_si256((__m256i*)(B + 16));
__m256i xmmCol0B0c = _mm256_load_si256((__m256i*)(B + 32));
__m256i xmmCol0B0d = _mm256_load_si256((__m256i*)(B + 48));
__m256i xmmCol0B0e = _mm256_load_si256((__m256i*)(B + 64));
__m256i xmmCol0B0f = _mm256_load_si256((__m256i*)(B + 80));
__m256i xmmCol0B0g = _mm256_load_si256((__m256i*)(B + 96));
__m256i xmmCol0B0h = _mm256_load_si256((__m256i*)(B + 112));
//Result for row 0
//Nomenclature:
//r0b0axc0b0a means "Row zero block zero part A times column zero block zero part A". (Blocks of more than 16 shorts take up more than one __m256i (ymm register) each.)
__m256i r0b0axc0b0a = _mm256_madd_epi16(xmmRow0B0a, xmmCol0B0a);
__m256i r0b0bxc0b0b = _mm256_madd_epi16(xmmRow0B0b, xmmCol0B0b);
__m256i r0b0cxc0b0c = _mm256_madd_epi16(xmmRow0B0c, xmmCol0B0c);
__m256i r0b0dxc0b0d = _mm256_madd_epi16(xmmRow0B0d, xmmCol0B0d);
__m256i r0b0exc0b0e = _mm256_madd_epi16(xmmRow0B0e, xmmCol0B0e);
__m256i r0b0fxc0b0f = _mm256_madd_epi16(xmmRow0B0f, xmmCol0B0f);
__m256i r0b0gxc0b0g = _mm256_madd_epi16(xmmRow0B0g, xmmCol0B0g);
__m256i r0b0hxc0b0h = _mm256_madd_epi16(xmmRow0B0h, xmmCol0B0h);
__m256i result1a = _mm256_add_epi32(r0b0axc0b0a, r0b0bxc0b0b);
__m256i result1b = _mm256_add_epi32(r0b0cxc0b0c, r0b0dxc0b0d);
__m256i result1c = _mm256_add_epi32(r0b0exc0b0e, r0b0fxc0b0f);
__m256i result1d = _mm256_add_epi32(r0b0gxc0b0g, r0b0hxc0b0h);
__m256i result1ab = _mm256_add_epi32(result1a, result1b);
__m256i result1cd = _mm256_add_epi32(result1c, result1d);
__m256i result1abcd = _mm256_add_epi32(result1ab, result1cd);
//Result for row 1
__m256i r1b0axc0b0a = _mm256_madd_epi16(xmmRow1B0a, xmmCol0B0a);
__m256i r1b0bxc0b0b = _mm256_madd_epi16(xmmRow1B0b, xmmCol0B0b);
__m256i r1b0cxc0b0c = _mm256_madd_epi16(xmmRow1B0c, xmmCol0B0c);
__m256i r1b0dxc0b0d = _mm256_madd_epi16(xmmRow1B0d, xmmCol0B0d);
__m256i r1b0exc0b0e = _mm256_madd_epi16(xmmRow1B0e, xmmCol0B0e);
__m256i r1b0fxc0b0f = _mm256_madd_epi16(xmmRow1B0f, xmmCol0B0f);
__m256i r1b0gxc0b0g = _mm256_madd_epi16(xmmRow1B0g, xmmCol0B0g);
__m256i r1b0hxc0b0h = _mm256_madd_epi16(xmmRow1B0h, xmmCol0B0h);
__m256i result2a = _mm256_add_epi32(r1b0axc0b0a, r1b0bxc0b0b);
__m256i result2b = _mm256_add_epi32(r1b0cxc0b0c, r1b0dxc0b0d);
__m256i result2c = _mm256_add_epi32(r1b0exc0b0e, r1b0fxc0b0f);
__m256i result2d = _mm256_add_epi32(r1b0gxc0b0g, r1b0hxc0b0h);
__m256i result2ab = _mm256_add_epi32(result2a, result2b);
__m256i result2cd = _mm256_add_epi32(result2c, result2d);
__m256i result2abcd = _mm256_add_epi32(result2ab, result2cd);
//Result for row 2
__m256i r2b0axc0b0a = _mm256_madd_epi16(xmmRow2B0a, xmmCol0B0a);
__m256i r2b0bxc0b0b = _mm256_madd_epi16(xmmRow2B0b, xmmCol0B0b);
__m256i r2b0cxc0b0c = _mm256_madd_epi16(xmmRow2B0c, xmmCol0B0c);
__m256i r2b0dxc0b0d = _mm256_madd_epi16(xmmRow2B0d, xmmCol0B0d);
__m256i r2b0exc0b0e = _mm256_madd_epi16(xmmRow2B0e, xmmCol0B0e);
__m256i r2b0fxc0b0f = _mm256_madd_epi16(xmmRow2B0f, xmmCol0B0f);
__m256i r2b0gxc0b0g = _mm256_madd_epi16(xmmRow2B0g, xmmCol0B0g);
__m256i r2b0hxc0b0h = _mm256_madd_epi16(xmmRow2B0h, xmmCol0B0h);
__m256i result3a = _mm256_add_epi32(r2b0axc0b0a, r2b0bxc0b0b);
__m256i result3b = _mm256_add_epi32(r2b0cxc0b0c, r2b0dxc0b0d);
__m256i result3c = _mm256_add_epi32(r2b0exc0b0e, r2b0fxc0b0f);
__m256i result3d = _mm256_add_epi32(r2b0gxc0b0g, r2b0hxc0b0h);
__m256i result3ab = _mm256_add_epi32(result3a, result3b);
__m256i result3cd = _mm256_add_epi32(result3c, result3d);
__m256i result3abcd = _mm256_add_epi32(result3ab, result3cd);
//Result for row 3
__m256i r3b0axc0b0a = _mm256_madd_epi16(xmmRow3B0a, xmmCol0B0a);
__m256i r3b0bxc0b0b = _mm256_madd_epi16(xmmRow3B0b, xmmCol0B0b);
__m256i r3b0cxc0b0c = _mm256_madd_epi16(xmmRow3B0c, xmmCol0B0c);
__m256i r3b0dxc0b0d = _mm256_madd_epi16(xmmRow3B0d, xmmCol0B0d);
__m256i r3b0exc0b0e = _mm256_madd_epi16(xmmRow3B0e, xmmCol0B0e);
__m256i r3b0fxc0b0f = _mm256_madd_epi16(xmmRow3B0f, xmmCol0B0f);
__m256i r3b0gxc0b0g = _mm256_madd_epi16(xmmRow3B0g, xmmCol0B0g);
__m256i r3b0hxc0b0h = _mm256_madd_epi16(xmmRow3B0h, xmmCol0B0h);
__m256i result4a = _mm256_add_epi32(r3b0axc0b0a, r3b0bxc0b0b);
__m256i result4b = _mm256_add_epi32(r3b0cxc0b0c, r3b0dxc0b0d);
__m256i result4c = _mm256_add_epi32(r3b0exc0b0e, r3b0fxc0b0f);
__m256i result4d = _mm256_add_epi32(r3b0gxc0b0g, r3b0hxc0b0h);
__m256i result4ab = _mm256_add_epi32(result4a, result4b);
__m256i result4cd = _mm256_add_epi32(result4c, result4d);
__m256i result4abcd = _mm256_add_epi32(result4ab, result4cd);
//Now we can just add horizontally
*return1 = result1abcd;
*return2 = result2abcd;
*return3 = result3abcd;
*return4 = result4abcd;
}
}}}
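The kernels above leave their results as vectors of eight 32-bit partial sums: each _mm256_madd_epi16 multiplies adjacent pairs of 16-bit values and adds them into one 32-bit lane, and the following _mm256_add_epi32 calls only combine whole vectors. A caller still has to reduce the eight lanes of each returned __m256i to a single scalar dot product. A minimal sketch of such a horizontal reduction (assumes AVX2; this helper is illustrative and not part of the commit):

#include <immintrin.h>

// Sum the eight 32-bit lanes of an AVX2 vector into one scalar (illustrative helper, not from the commit).
static inline int HorizontalSumEpi32(__m256i v)
{
    __m128i lo  = _mm256_castsi256_si128(v);          // lanes 0..3
    __m128i hi  = _mm256_extracti128_si256(v, 1);     // lanes 4..7
    __m128i sum = _mm_add_epi32(lo, hi);              // fold the two halves together
    sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, _MM_SHUFFLE(1, 0, 3, 2))); // fold upper pair onto lower pair
    sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, _MM_SHUFFLE(2, 3, 0, 1))); // fold the remaining pair
    return _mm_cvtsi128_si32(sum);                    // lane 0 now holds the full dot product
}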

View file

@ -0,0 +1,32 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#include "stdafx.h"
#include <xmmintrin.h>
#include <emmintrin.h>
#include <tmmintrin.h>
#include "BlockHandlerSSE.h"
#include "BlockMultiplierMatrixUtil.h"
namespace Microsoft { namespace MSR { namespace CNTK {
int BlockHandlerSSE::RowToColOffsetRewrittenA(int row, int kOffset, int blockSize, int rowsPerBlock, int origCols)
{
int rowIdx = row / rowsPerBlock;
int offsetFromBlockBeginning = row % rowsPerBlock;
int colIdx = kOffset * rowsPerBlock * blockSize + (offsetFromBlockBeginning * blockSize);
return (rowIdx * (origCols / blockSize) * rowsPerBlock * blockSize) + colIdx;
}
//col is the original column of B
//kOffset is the offset to the current block we are multiplying against (in absolute terms).
int BlockHandlerSSE::RowToColOffsetRewrittenB(int col, int kOffset, int blockSize, int origCols)
{
return (origCols * blockSize * kOffset) + (col * blockSize);
}
}}}
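RowToColOffsetRewrittenB encodes the rewritten layout of B: each column stores blockSize consecutive values per k-block, and all origCols columns of one k-block precede the next k-block. A small worked example with hypothetical dimensions (not taken from the commit):

#include <cassert>

// Same formula as BlockHandlerSSE::RowToColOffsetRewrittenB, restated here for illustration.
static int RowToColOffsetRewrittenB(int col, int kOffset, int blockSize, int origCols)
{
    return (origCols * blockSize * kOffset) + (col * blockSize);
}

int main()
{
    const int blockSize = 8, origCols = 4;
    assert(RowToColOffsetRewrittenB(0, 0, blockSize, origCols) == 0);   // first column, first k-block
    assert(RowToColOffsetRewrittenB(1, 0, blockSize, origCols) == 8);   // next column starts one block later
    assert(RowToColOffsetRewrittenB(0, 1, blockSize, origCols) == 32);  // next k-block starts after origCols * blockSize values
    return 0;
}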

File diff suppressed because it is too large. Load diff

File diff suppressed because it is too large. Load diff

View file

@ -0,0 +1,161 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#define NOMINMAX
#include <algorithm> // for std::min
#include <fstream>
#include <functional>
#include <iostream>
#include <limits>
#include <string.h>//for memset
#include "BlockMultiplierPlatform.h"
namespace Microsoft { namespace MSR { namespace CNTK {
template<typename ScalarT> void DumpMatrix(ScalarT* pDumpMe, int rows, int cols, std::ostream* pStream, int rowMax = std::numeric_limits<int>::max(),
int colMax = std::numeric_limits<int>::max())
{
for (int r = 0; r < std::min(rows, rowMax); ++r)
{
for (int c = 0; c < std::min(cols, colMax); ++c)
{
(*pStream) << pDumpMe[r * cols + c] << " ";
}
(*pStream) << std::endl;
}
}
// Turn a row+col into an absolute offset
FORCEINLINE int RowColToOffset(int idxRow, int idxCol, int numCols)
{
return idxRow * numCols + idxCol;
}
template<typename ScalarT>struct TransposeArgs
{
int r;
ScalarT* transposeMe;
ScalarT* transposed;
int origRows;
int origCols;
};
template<class ScalarT>void TransposeThread(TransposeArgs<ScalarT> ta)
{
for (int c = 0; c < ta.origCols; ++c)
{
//new c,r = old r,c
int oldOffset = RowColToOffset(ta.r, c, ta.origCols);
int newOffset = RowColToOffset(c, ta.r, ta.origRows);
ta.transposed[newOffset] = ta.transposeMe[oldOffset];
}
}
template<typename ScalarT> class TransposeThreadType
{
public:
void operator()(TransposeArgs<ScalarT> ta)
{
TransposeThread<ScalarT>(ta);
}
};
template<class ScalarT> void Transpose(ScalarT* transposeMe, ScalarT* transposed, int origRows, int origCols)
{
#pragma omp parallel for
for (int r = 0; r < origRows; ++r)
{
for (int c = 0; c < origCols; ++c)
{
int oldOffset = RowColToOffset(r, c, origCols);
int newOffset = RowColToOffset(c, r, origRows);
transposed[newOffset] = transposeMe[oldOffset];
}
}
}
template<typename ScalarT> ScalarT* CreateAlignedMatrix(int m, int n, ScalarT initVal, int alignment = 64)
{
ScalarT* ret = (ScalarT*)ALIGNED_ALLOC(sizeof(ScalarT) * (m * n), alignment);
if (initVal != 0)
{
for (int i = 0; i < m * n; ++i)
{
ret[i] = initVal;// +i;
}
}
else
{
memset(ret, 0, sizeof(ScalarT) * m * n);
}
return ret;
}
template<typename ScalarT> void FreeAlignedMatrix(ScalarT* destroyMe)
{
ALIGNED_FREE(destroyMe);
}
template<typename ScalarT> double MeanSquaredError(ScalarT* lhs, ScalarT* rhs, int m, int n)
{
double accumulatedError = 0.0;
for (int r = 0; r < m; ++r)
{
for(int c = 0; c < n; ++c)
{
double err = ((double)lhs[RowColToOffset(r, c, n)] - (double)rhs[RowColToOffset(r, c, n)]);
err = err * err;
accumulatedError += err;
}
}
return accumulatedError / (double)(m * n);
}
template<typename ScalarT> void RandInitIntMatrix(ScalarT* initMe, int m, int n, ScalarT bound)
{
ScalarT* curr = initMe;
for (int i = 0; i < m * n; ++i)
{
*curr++ = rand() % bound;
}
}
//Helper fn for tests
template<typename ScalarT>static void RandInitFloatMatrix(ScalarT* initMe, int m, int n, ScalarT min, ScalarT max)
{
for (int i = 0; i < m * n; ++i)
{
initMe[i] = min + ((max - min) * ((ScalarT)rand() / RAND_MAX));
}
}
//Viewing matrices and troubleshooting is a lot easier in Octave.
//Utility fn for exporting to Octave format
template<typename ScalarT>void DumpMatrixToOctaveFormat(const ScalarT* dumpMe, int rows, int cols, const char* fileName, const char* id)
{
std::ofstream ofs(fileName);
ofs << "# Created by gemmbenchmark" << std::endl <<
"# name: " << id << std::endl <<
"# type: matrix" << std::endl <<
"# rows: " << rows << std::endl <<
"# columns: " << cols << std::endl;
for (int r = 0; r < rows; ++r)
{
for (int c = 0; c < cols; ++c)
{
ofs << ' ' << (ScalarT)(dumpMe[(cols * r) + c]);
}
ofs << std::endl;
}
}
}}} //End namespaces
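The helpers above are self-contained, so a short usage sketch may help; the include path and the matrix dimensions below are assumptions for illustration only:

#include <iostream>
#include "BlockMultiplierMatrixUtil.h" // assumed include path

using namespace Microsoft::MSR::CNTK;

int main()
{
    const int m = 4, n = 8;
    float* a  = CreateAlignedMatrix<float>(m, n, 0.0f);   // zero-initialized, 64-byte aligned
    float* at = CreateAlignedMatrix<float>(n, m, 0.0f);
    RandInitFloatMatrix(a, m, n, -1.0f, 1.0f);
    Transpose(a, at, m, n);                               // at becomes the n x m transpose of a
    DumpMatrix(a, m, n, &std::cout);
    DumpMatrixToOctaveFormat(a, m, n, "a.mat", "a");      // inspect in Octave with: load a.mat
    std::cout << "MSE(a, a) = " << MeanSquaredError(a, a, m, n) << std::endl; // 0 by construction
    FreeAlignedMatrix(at);
    FreeAlignedMatrix(a);
    return 0;
}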

View file

@ -0,0 +1,19 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#ifdef _MSC_VER
#define ALIGNED_ALLOC(bytes,alignment) _aligned_malloc(bytes,alignment)
#define ALIGNED_FREE(ptr) _aligned_free(ptr)
#define FORCEINLINE __forceinline
#else
#ifdef __GNUC__
#include <stdlib.h>
#define ALIGNED_ALLOC(bytes,alignment) aligned_alloc(alignment,bytes)
#define ALIGNED_FREE(ptr) free(ptr)
//#define FORCEINLINE __attribute__((always_inline))
#define FORCEINLINE inline
#endif
#endif
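A minimal sketch of the macros in use (not part of the commit); the 32-byte aligned loads in the block handlers need an alignment of at least 32, which the 64 used throughout these files satisfies:

#include <cstring>
#include "BlockMultiplierPlatform.h" // assumed include path

int main()
{
    // 128 shorts = 256 bytes, a multiple of the 64-byte alignment (aligned_alloc on the GCC path requires this).
    short* block = static_cast<short*>(ALIGNED_ALLOC(128 * sizeof(short), 64));
    std::memset(block, 0, 128 * sizeof(short));
    ALIGNED_FREE(block);
    return 0;
}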

View file

@ -8,6 +8,13 @@
namespace Microsoft { namespace MSR { namespace CNTK {
#ifndef CPUONLY
inline static void CheckCudaReturnCode(cudaError_t rc, const char* msg)
{
if (rc != cudaSuccess)
RuntimeError("%s: %s (cuda error %d)", msg, cudaGetErrorString(rc), (int)rc);
}
CUDAPageLockedMemAllocator::CUDAPageLockedMemAllocator(int deviceID)
: m_deviceID(deviceID)
{
@ -15,19 +22,18 @@ CUDAPageLockedMemAllocator::CUDAPageLockedMemAllocator(int deviceID)
void* CUDAPageLockedMemAllocator::Malloc(size_t size, int deviceId)
{
void* p;
cudaSetDevice(deviceId);
void* p = nullptr;
CheckCudaReturnCode(cudaSetDevice(deviceId), "Cannot set cuda device");
// Note: I ask for cudaHostAllocDefault but cudaHostGetFlags() shows that it is allocated as 'cudaHostAllocMapped'
cudaHostAlloc(&p, size, cudaHostAllocDefault) || "Malloc in CUDAPageLockedMemAllocator failed";
CheckCudaReturnCode(cudaHostAlloc(&p, size, cudaHostAllocDefault), "Malloc in CUDAPageLockedMemAllocator failed");
return p;
}
void CUDAPageLockedMemAllocator::Free(void* p, int deviceId)
{
cudaSetDevice(deviceId);
cudaFreeHost(p) || "Free in CUDAPageLockedMemAllocator failed";
CheckCudaReturnCode(cudaSetDevice(deviceId), "Cannot set cuda device");
CheckCudaReturnCode(cudaFreeHost(p), "Free in CUDAPageLockedMemAllocator failed");
}
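// (Why the change: the old `cudaFreeHost(p) || "..."` and `cudaHostAlloc(...) || "..."` pattern never
// surfaced failures, because the `||` expression simply discards the cudaError_t and the string literal
// is always truthy; CheckCudaReturnCode is presumably introduced so that every failing call raises a RuntimeError.)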
void* CUDAPageLockedMemAllocator::Malloc(size_t size)

View file

@ -4278,11 +4278,16 @@ void GPUMatrix<ElemType>::RCRFTransGrdCompute(const GPUMatrix<ElemType>& lbls,
template <class ElemType>
static shared_ptr<GPUMatrix<ElemType>> GetOnesVector(size_t N, DEVICEID_TYPE deviceId)
{
// using an array of shared_ptrs because those are thread-safe. The objects themselves are immutable.
// And using a plain array so this will never get freed, avoiding free-after-DLL-unload issues.
static shared_ptr<GPUMatrix<ElemType>> onesCache[32]; // cache of objects
if (deviceId >= _countof(onesCache))
LogicError("GetOnesVector: onesCache[] too small (%d entries), increase (you need %d) and recompile.", (int) _countof(onesCache), (int) deviceId + 1);
// using a dynamically allocated array so this will never get freed, avoiding free-after-DLL-unload issues,
// and using shared_ptrs so that we never leak more than CacheSize elements.
// With a plain static array we would have to manage the objects' lifetime ourselves, and the destructor of every element would run at shutdown.
const int CacheSize = 32;
static shared_ptr<GPUMatrix<ElemType>> * onesCache = new shared_ptr<GPUMatrix<ElemType>>[CacheSize]; // cache of objects
if (deviceId >= CacheSize){
LogicError("GetOnesVector: onesCache[] too small (%d entries), increase (you need %d) and recompile.", CacheSize, (int)deviceId + 1);
}
auto p = onesCache[deviceId];
if (!p || p->GetNumRows() < N) // must (re-)allocate
{

View file

@ -5,7 +5,7 @@
// helpful macros
// TODO: the file's name is too general to be included from outside; MathHelpers.h?
//#pragma once
#pragma once
// iterators
#undef foreach_row

View file

@ -161,6 +161,11 @@
<ClInclude Include="..\Common\Include\File.h" />
<ClInclude Include="..\Common\Include\fileutil.h" />
<ClInclude Include="BatchNormalizationEngine.h" />
<ClInclude Include="BlockHandlerAVX.h" />
<ClInclude Include="BlockHandlerSSE.h" />
<ClInclude Include="BlockMultiplier.h" />
<ClInclude Include="BlockMultiplierMatrixUtil.h" />
<ClInclude Include="BlockMultiplierPlatform.h" />
<ClInclude Include="CommonMatrix.h" />
<ClInclude Include="ConvolutionEngine.h" />
<ClInclude Include="ConvolveGeometry.h" />
@ -190,6 +195,8 @@
</ItemGroup>
<ItemGroup>
<ClCompile Include="BatchNormalizationEngine.cpp" />
<ClCompile Include="BlockHandlerAVX.cpp" />
<ClCompile Include="BlockHandlerSSE.cpp" />
<ClCompile Include="ConvolutionEngine.cpp" />
<ClCompile Include="CPURNGHandle.cpp" />
<ClCompile Include="CPUSparseMatrix.cpp" />

View file

@ -42,6 +42,12 @@
<Filter>CPU</Filter>
</ClCompile>
<ClCompile Include="RNGHandle.cpp" />
<ClCompile Include="BlockHandlerAVX.cpp">
<Filter>CPU</Filter>
</ClCompile>
<ClCompile Include="BlockHandlerSSE.cpp">
<Filter>CPU</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="CommonMatrix.h" />
@ -105,6 +111,18 @@
<ClInclude Include="CPURNGHandle.h">
<Filter>CPU</Filter>
</ClInclude>
<ClInclude Include="BlockHandlerAVX.h">
<Filter>CPU</Filter>
</ClInclude>
<ClInclude Include="BlockHandlerSSE.h">
<Filter>CPU</Filter>
</ClInclude>
<ClInclude Include="BlockMultiplier.h">
<Filter>CPU</Filter>
</ClInclude>
<ClInclude Include="BlockMultiplierPlatform.h">
<Filter>CPU</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<None Include="GPUMatrix.h">
@ -155,4 +173,4 @@
<UniqueIdentifier>{8f982dac-298d-4e48-b060-8e6cba5ff554}</UniqueIdentifier>
</Filter>
</ItemGroup>
</Project>
</Project>

View file

@ -14,7 +14,9 @@
#pragma warning(push)
#pragma warning(disable : 4251) // needs to have dll-interface to be used by clients of... caused by TensorView::m_shape which is only private. We use the same compiler everywhere.
template<class ElemType> struct TensorTest;
namespace Microsoft { namespace MSR { namespace CNTK { namespace Test {
template <class ElemType> struct TensorTest;
}}}}
// This class is exported from the Math.dll.
namespace Microsoft { namespace MSR { namespace CNTK {
@ -151,7 +153,7 @@ private:
const Matrix<ElemType>& GetSOB() const { return *m_sob; }
Matrix<ElemType>& GetSOB() { return *m_sob; }
friend struct ::TensorTest<ElemType>;
friend Test::TensorTest<ElemType>;
// -------------------------------------------------------------------
// sob members

View file

@ -40,10 +40,9 @@ class ondevice
public:
ondevice(size_t deviceid)
{
cudaSetDevice((int) deviceid) || "cudaSetDevice failed!";
}
~ondevice()
{
auto rc = cudaSetDevice((int)deviceid);
if (rc != cudaSuccess)
RuntimeError("Cannot set cuda device: %s (cuda error %d)", cudaGetErrorString(rc), (int)rc);
}
};
} }

View file

@ -110,9 +110,6 @@
<PrecompiledHeader>Create</PrecompiledHeader>
</ClCompile>
</ItemGroup>
<ItemGroup>
<None Include="uci_to_cntk_text_format_converter.py" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets" />
</Project>

View file

@ -47,13 +47,5 @@
<Filter Include="Common\Include">
<UniqueIdentifier>{C6F55578-121A-4D7C-8F57-4172BC5C463B}</UniqueIdentifier>
</Filter>
<Filter Include="Scripts">
<UniqueIdentifier>{cd70d891-88aa-40a4-8e47-0e31e4cac48e}</UniqueIdentifier>
</Filter>
</ItemGroup>
<ItemGroup>
<None Include="uci_to_cntk_text_format_converter.py">
<Filter>Scripts</Filter>
</None>
</ItemGroup>
</Project>
</Project>

View file

@ -16,6 +16,11 @@
namespace Microsoft { namespace MSR { namespace CNTK {
inline bool IsDigit(char c)
{
return '0' <= c && c <= '9';
}
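// (A plain ASCII range check like the one above side-steps the locale lookup of std::isdigit and the
// undefined behavior of passing it a plain char with a negative value; presumably the reason isdigit()
// is replaced throughout this file.)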
enum State
{
Init = 0,
@ -38,7 +43,7 @@ public:
void GetSequence(size_t sequenceId, std::vector<SequenceDataPtr>& result) override;
// A map from sequence ids to the sequence data.
std::map<size_t, SequenceBuffer> m_sequenceMap;
std::vector<SequenceBuffer> m_sequenceMap;
// chunk id (copied from the descriptor)
ChunkIdType m_id;
@ -234,40 +239,11 @@ TextParser<ElemType>::TextDataChunk::TextDataChunk(const ChunkDescriptor& descri
template <class ElemType>
void TextParser<ElemType>::TextDataChunk::GetSequence(size_t sequenceId, std::vector<SequenceDataPtr>& result)
{
auto it = m_sequenceMap.find(sequenceId);
assert(it != m_sequenceMap.end());
assert(sequenceId < m_sequenceMap.size());
result.reserve(m_parser->m_streamInfos.size());
const auto& sequenceData = it->second;
for (size_t j = 0; j < m_parser->m_streamInfos.size(); ++j)
{
InputStreamBuffer* input = sequenceData[j].get();
const StreamInfo& stream = m_parser->m_streamInfos[j];
SequenceDataPtr data;
if (stream.m_type == StorageType::dense)
{
auto denseData = make_shared<DenseSequenceData>();
denseData->m_sampleLayout = m_parser->m_streams[j]->m_sampleLayout;
data = denseData;
}
else
{
auto sparseData = make_shared<SparseSequenceData>();
SparseInputStreamBuffer* sparseInput = static_cast<SparseInputStreamBuffer*>(input);
sparseData->m_indices = sparseInput->m_indices.data();
sparseData->m_nnzCounts.reserve(sparseInput->m_nnzCounts.size());
copy(sparseInput->m_nnzCounts.begin(), sparseInput->m_nnzCounts.end(),
back_inserter(sparseData->m_nnzCounts));
sparseData->m_totalNnzCount = sparseInput->m_totalNnzCount;
assert(input->m_numberOfSamples == sparseInput->m_nnzCounts.size());
data = sparseData;
}
data->m_data = input->m_buffer.data();
data->m_numberOfSamples = input->m_numberOfSamples;
data->m_chunk = shared_from_this();
data->m_id = sequenceId;
result.push_back(data);
}
const auto& sequenceData = m_sequenceMap[sequenceId];
result.insert(result.end(), sequenceData.begin(), sequenceData.end());
}
template <class ElemType>
@ -292,11 +268,10 @@ ChunkPtr TextParser<ElemType>::GetChunk(ChunkIdType chunkId)
template <class ElemType>
void TextParser<ElemType>::LoadChunk(TextChunkPtr& chunk, const ChunkDescriptor& descriptor)
{
chunk->m_sequenceMap.resize(descriptor.m_sequences.size());
for (const auto& sequenceDescriptor : descriptor.m_sequences)
{
chunk->m_sequenceMap.insert(make_pair(
sequenceDescriptor.m_id,
LoadSequence(sequenceDescriptor)));
chunk->m_sequenceMap[sequenceDescriptor.m_id] = LoadSequence(sequenceDescriptor);
}
}
@ -480,13 +455,39 @@ typename TextParser<ElemType>::SequenceBuffer TextParser<ElemType>::LoadSequence
GetSequenceKey(sequenceDsc).c_str(), GetFileInfo().c_str(), numRowsRead, expectedRowCount);
}
FillSequenceMetadata(sequence, sequenceDsc.m_id);
return sequence;
}
template<class ElemType>
void TextParser<ElemType>::FillSequenceMetadata(SequenceBuffer& sequenceData, size_t sequenceId)
{
for (size_t j = 0; j < m_streamInfos.size(); ++j)
{
const StreamInfo& stream = m_streamInfos[j];
SequenceDataBase* data = sequenceData[j].get();
if (stream.m_type == StorageType::dense)
{
auto denseData = static_cast<DenseInputStreamBuffer*>(data);
denseData->m_sampleLayout = m_streams[j]->m_sampleLayout;
data->m_data = denseData->m_buffer.data();
}
else
{
auto sparseData = static_cast<SparseInputStreamBuffer*>(data);
sparseData->m_indices = sparseData->m_indicesBuffer.data();
assert(data->m_numberOfSamples == sparseData->m_nnzCounts.size());
data->m_data = sparseData->m_buffer.data();
}
data->m_id = sequenceId;
}
}
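// (Presumably FillSequenceMetadata runs only once the whole sequence has been parsed: m_data and
// m_indices point straight into the input buffers, and those pointers would be invalidated if the
// vectors still had to grow while rows are being read.)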
template <class ElemType>
bool TextParser<ElemType>::TryReadRow(SequenceBuffer& sequence, size_t& bytesToRead)
{
while (bytesToRead && CanRead() && isdigit(*m_pos))
while (bytesToRead && CanRead() && IsDigit(*m_pos))
{
// skip sequence ids
++m_pos;
@ -616,7 +617,7 @@ bool TextParser<ElemType>::TryReadSample(SequenceBuffer& sequence, size_t& bytes
{
SparseInputStreamBuffer* data = reinterpret_cast<SparseInputStreamBuffer*>(sequence[id].get());
vector<ElemType>& values = data->m_buffer;
vector<IndexType>& indices = data->m_indices;
vector<IndexType>& indices = data->m_indicesBuffer;
assert(values.size() == indices.size());
size_t size = values.size();
if (!TryReadSparseSample(values, indices, stream.m_sampleDimension, bytesToRead))
@ -919,7 +920,7 @@ bool TextParser<ElemType>::TryReadUint64(size_t& value, size_t& bytesToRead)
{
char c = *m_pos;
if (!isdigit(c))
if (!IsDigit(c))
{
return found;
}
@ -977,7 +978,7 @@ bool TextParser<ElemType>::TryReadRealNumber(ElemType& value, size_t& bytesToRea
{
case State::Init:
// the number must either start with a number or a sign
if (isdigit(c))
if (IsDigit(c))
{
state = IntegralPart;
number = (c - '0');
@ -1001,7 +1002,7 @@ bool TextParser<ElemType>::TryReadRealNumber(ElemType& value, size_t& bytesToRea
break;
case Sign:
// the sign must be followed by a number
if (isdigit(c))
if (IsDigit(c))
{
state = IntegralPart;
number = (c - '0');
@ -1019,7 +1020,7 @@ bool TextParser<ElemType>::TryReadRealNumber(ElemType& value, size_t& bytesToRea
}
break;
case IntegralPart:
if (isdigit(c))
if (IsDigit(c))
{
number = number * 10 + (c - '0');
}
@ -1040,7 +1041,7 @@ bool TextParser<ElemType>::TryReadRealNumber(ElemType& value, size_t& bytesToRea
}
break;
case Period:
if (isdigit(c))
if (IsDigit(c))
{
state = FractionalPart;
coefficient = number;
@ -1054,7 +1055,7 @@ bool TextParser<ElemType>::TryReadRealNumber(ElemType& value, size_t& bytesToRea
}
break;
case FractionalPart:
if (isdigit(c))
if (IsDigit(c))
{
// TODO: ignore if number of precision digits > FLT_[MANT_]DIG/DBL_[MANT_]DIG
// no state change
@ -1079,7 +1080,7 @@ bool TextParser<ElemType>::TryReadRealNumber(ElemType& value, size_t& bytesToRea
break;
case TheLetterE:
// followed with optional minus or plus sign and nonempty sequence of decimal digits
if (isdigit(c))
if (IsDigit(c))
{
state = Exponent;
negative = false;
@ -1104,7 +1105,7 @@ bool TextParser<ElemType>::TryReadRealNumber(ElemType& value, size_t& bytesToRea
break;
case ExponentSign:
// exponent sign must be followed by a number
if (isdigit(c))
if (IsDigit(c))
{
state = Exponent;
number = (c - '0');
@ -1122,7 +1123,7 @@ bool TextParser<ElemType>::TryReadRealNumber(ElemType& value, size_t& bytesToRea
}
break;
case Exponent:
if (isdigit(c))
if (IsDigit(c))
{
// no state change
number = number * 10 + (c - '0');

View file

@ -42,37 +42,33 @@ private:
// Builds an index of the input data.
void Initialize();
// A buffer to keep data for all samples in a (variable length) sequence
// from a single input stream.
struct InputStreamBuffer
{
virtual ~InputStreamBuffer() { };
uint32_t m_numberOfSamples = 0;
std::vector<ElemType> m_buffer;
};
struct DenseInputStreamBuffer : InputStreamBuffer
struct DenseInputStreamBuffer : DenseSequenceData
{
// capacity = expected number of samples * sample size
DenseInputStreamBuffer(size_t capacity)
{
InputStreamBuffer::m_buffer.reserve(capacity);
m_buffer.reserve(capacity);
}
std::vector<ElemType> m_buffer;
};
// In case of sparse input, we also need a vector of
// indices (one index for each input value) and a vector
// of NNZ counts (one for each sample).
struct SparseInputStreamBuffer : InputStreamBuffer
struct SparseInputStreamBuffer : SparseSequenceData
{
IndexType m_totalNnzCount = 0;
std::vector<IndexType> m_indices;
std::vector<IndexType> m_nnzCounts;
SparseInputStreamBuffer()
{
m_totalNnzCount = 0;
}
std::vector<IndexType> m_indicesBuffer;
std::vector<ElemType> m_buffer;
};
// A sequence buffer is a vector that contains an input buffer for each input stream.
typedef std::vector<std::unique_ptr<InputStreamBuffer>> SequenceBuffer;
// A sequence buffer is a vector that contains sequence data for each input stream.
typedef std::vector<SequenceDataPtr> SequenceBuffer;
// A chunk of input data in the text format.
class TextDataChunk;
@ -176,6 +172,9 @@ private:
TextParser(CorpusDescriptorPtr corpus, const std::wstring& filename, const vector<StreamDescriptor>& streams);
// Fills in metadata members so that the data conforms to the exposed SequenceData interface.
void FillSequenceMetadata(SequenceBuffer& sequenceBuffer, size_t sequenceId);
void SetTraceLevel(unsigned int traceLevel);
void SetMaxAllowedErrors(unsigned int maxErrors);

View file

@ -18,7 +18,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
const char ESCAPE_SYMBOL = '#';
const auto BUFFER_SIZE = 256 * 1024;
const auto BUFFER_SIZE = 2 * 1024 * 1024;
inline bool isPrintable(char c)
{

Some files were not shown because too many files have changed in this diff. Show more