Merge branch 'master' of https://github.com/Microsoft/cntk
This commit is contained in:
Commit 9db12ddf3d
.gitattributes
@@ -6,6 +6,7 @@ Dockerfile-GPU text
*.counts text
*.labels text
*.feats text
*.ctf text
*.post text
*.cpu text
*.gpu text
@@ -19,6 +20,7 @@ Dockerfile-GPU text

*.md text
*.txt text
*.TXT text
*.html text
*.lyx text
*.bib text
@@ -44,6 +46,9 @@ make_binary_drop_linux text eol=lf
Tests/EndToEndTests/Examples/Speech/TIMIT/WriteBottleneck/expected_output_md5sum.*.txt eol=lf
Tests/EndToEndTests/Examples/Speech/TIMIT/WriteScaledLogLike/expected_output_md5sum.*.txt eol=lf

# Used by reader unit test, needs to keep LF line endings.
Tests/UnitTests/ReaderTests/Data/CNTKTextFormatReader/invalid_inputs.txt eol=lf

Makefile text
*.sln text
*.vcxproj text
@@ -106,6 +111,10 @@ TIMIT*.statelist text
TIMIT*.tfsa text
TIMIT*.transitions text

Examples/Text/ATIS/data/ATIS.* text

Examples/SequenceToSequence/CMUDict/Data/cmudict-0.7b* text

# Binary extensions:
*.vsdm binary
*.pdf binary
.gitignore
@@ -65,6 +65,7 @@ ipch/
*.opensdf
*.sdf
*.cachefile
*.userosscache

# Visual Studio profiler
*.psess
CNTK.sln (17 changes)
@@ -934,7 +934,7 @@ EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Data", "Data", "{D11F76CC-DB6D-4CB4-B3B7-AB139DE2F5FA}"
	ProjectSection(SolutionItems) = preProject
		Tests\EndToEndTests\Text\SequenceClassification\Data\embeddingmatrix.txt = Tests\EndToEndTests\Text\SequenceClassification\Data\embeddingmatrix.txt
		Tests\EndToEndTests\Text\SequenceClassification\Data\Train.txt = Tests\EndToEndTests\Text\SequenceClassification\Data\Train.txt
		Tests\EndToEndTests\Text\SequenceClassification\Data\Train.ctf = Tests\EndToEndTests\Text\SequenceClassification\Data\Train.ctf
	EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "SLU", "SLU", "{181664AC-4C95-4798-A923-09B879215B33}"
@@ -1120,6 +1120,7 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTKv2LibraryDll", "Source\
		{928ABD1B-4D3B-4017-AEF1-0FA1B4467513} = {928ABD1B-4D3B-4017-AEF1-0FA1B4467513}
		{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
		{86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A}
		{F0A9637C-20DA-42F0-83D4-23B4704DE602} = {F0A9637C-20DA-42F0-83D4-23B4704DE602}
		{EAD17188-072C-4726-B840-A769C36DAD1B} = {EAD17188-072C-4726-B840-A769C36DAD1B}
		{DE3C54E5-D7D0-47AF-A783-DFDCE59E7937} = {DE3C54E5-D7D0-47AF-A783-DFDCE59E7937}
	EndProjectSection
@@ -1147,6 +1148,11 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CPPEvalClient", "Examples\E
		{482999D1-B7E2-466E-9F8D-2119F93EAFD9} = {482999D1-B7E2-466E-9F8D-2119F93EAFD9}
	EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "BrainScriptTests", "Tests\UnitTests\BrainScriptTests\BrainScriptTests.vcxproj", "{9F999212-AFC5-4EAC-AA78-F7247D46C456}"
	ProjectSection(ProjectDependencies) = postProject
		{86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A}
	EndProjectSection
EndProject
Global
	GlobalSection(SolutionConfigurationPlatforms) = preSolution
		Debug_CpuOnly|x64 = Debug_CpuOnly|x64
@@ -1425,6 +1431,14 @@ Global
		{CCC07E8E-F33A-4AF7-9F60-93E2AA61C75E}.Release_CpuOnly|x64.Build.0 = Release_CpuOnly|x64
		{CCC07E8E-F33A-4AF7-9F60-93E2AA61C75E}.Release|x64.ActiveCfg = Release|x64
		{CCC07E8E-F33A-4AF7-9F60-93E2AA61C75E}.Release|x64.Build.0 = Release|x64
		{9F999212-AFC5-4EAC-AA78-F7247D46C456}.Debug_CpuOnly|x64.ActiveCfg = Debug_CpuOnly|x64
		{9F999212-AFC5-4EAC-AA78-F7247D46C456}.Debug_CpuOnly|x64.Build.0 = Debug_CpuOnly|x64
		{9F999212-AFC5-4EAC-AA78-F7247D46C456}.Debug|x64.ActiveCfg = Debug|x64
		{9F999212-AFC5-4EAC-AA78-F7247D46C456}.Debug|x64.Build.0 = Debug|x64
		{9F999212-AFC5-4EAC-AA78-F7247D46C456}.Release_CpuOnly|x64.ActiveCfg = Release_CpuOnly|x64
		{9F999212-AFC5-4EAC-AA78-F7247D46C456}.Release_CpuOnly|x64.Build.0 = Release_CpuOnly|x64
		{9F999212-AFC5-4EAC-AA78-F7247D46C456}.Release|x64.ActiveCfg = Release|x64
		{9F999212-AFC5-4EAC-AA78-F7247D46C456}.Release|x64.Build.0 = Release|x64
	EndGlobalSection
	GlobalSection(SolutionProperties) = preSolution
		HideSolutionNode = FALSE
@@ -1583,5 +1597,6 @@ Global
		{3385EBEA-5F97-4B2B-9F30-0E6D7F91B9CA} = {47755F2E-D674-4175-9E38-8EA053455072}
		{1C6E6C53-1AA7-4B69-913E-B97BB5A872CF} = {3385EBEA-5F97-4B2B-9F30-0E6D7F91B9CA}
		{CCC07E8E-F33A-4AF7-9F60-93E2AA61C75E} = {3385EBEA-5F97-4B2B-9F30-0E6D7F91B9CA}
		{9F999212-AFC5-4EAC-AA78-F7247D46C456} = {6F19321A-65E7-4829-B00C-3886CD6C6EDE}
	EndGlobalSection
EndGlobal
@@ -863,38 +863,27 @@ The dimension reduced matrix consisting of the maximum value within each pooling

This function is often associated with Convolution() operations.

### Delay
### PastValue, FutureValue

Delay node used in recurrent networks, allows creation of a loop in the convolutional network that will repeat a specified number of times.
PastValue and FutureValue nodes are used in recurrent networks, allowing creation of a loop in the computational network that will repeat a specified number of times. PastValue retrieves the value of a node several steps back in the past, while FutureValue retrieves the value of a node from the future.

`Delay(rows, [cols], delayNode, delayTime=1, needGradient=true, defaultHiddenActivity=0.1)`
`PastValue(rows, [cols], node, timeStep=1, defaultHiddenActivity=0.1)`
`FutureValue(rows, [cols], node, timeStep=1, defaultHiddenActivity=0.1)`

#### Parameters

`cvweight` – convolution weight matrix; it has the dimensions of \[outputChannels, kernelWidth \* kernelHeight \* inputChannels\]
`rows` – number of rows in the node

`kernelWidth` – width of the kernel
`cols` – number of cols in the node. This value is often omitted since the length of a sequence varies

`kernelHeight` – height of the kernel
`timeStep` – \[default = 1\] number of time steps toward the past or future

`outputChannels` – number of output channels

`horizontalSubsample` – subsamples in the horizontal direction

`verticalSubsample` – subsamples in the vertical direction

#### Optional Parameters

`delayTime` – \[default = 1\] the amount of delay that will be introduced (the number of times the loop will happen)

`needGradient` – \[default = true\] whether the gradient needs to be computed for this node

`defaultHiddenActivity` – \[default = 0.1\] the numerical amount for the defaultHiddenActivity
`defaultHiddenActivity` – \[default = 0.1\] default value to use when crossing the sequence boundary or when the value is missing

#### Returns

The results of the completed Delay loop
Either the past or future value of a node

#### Notes

This node is used in recurrent networks, where a delay is introduced to examine values from a previous time, such as the prior value (t-1). This has the effect of creating a loop in the computational network that will repeat delayTime number of iterations.
This node is used in recurrent networks, where a past value is introduced to examine values from a previous time, such as the prior value (t-1). This has the effect of creating a loop in the computational network.
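For orientation, here is a minimal NDL-style sketch of the recurrence this node family enables, modeled on the `PastValue` usage in the LSTM macros later in this commit (`dh = PastValue(outputDim, output, timeStep=1)`); the node names `x`, `h`, `W`, `R` and the `Sigmoid` nonlinearity are illustrative assumptions, not part of this change:

```
# h depends on its own value one step back: PastValue breaks the cycle by
# reading h(t-1), and defaultHiddenActivity supplies h(-1) at the sequence
# boundary or wherever the value is missing.
prevH = PastValue(hiddenDim, h, timeStep=1, defaultHiddenActivity=0.1)
h = Sigmoid(Plus(Times(W, x), Times(R, prevH)))
```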
@@ -37,19 +37,22 @@ int main(int argc, char* argv[])
    std::string app = argv[0];
    std::string path;
    IEvaluateModel<float> *model;
    size_t pos;

#ifdef _WIN32
    path = app.substr(0, app.rfind("\\"));
    pos = app.rfind("\\");
    path = (pos == std::string::npos) ? "." : app.substr(0, pos);

    // This relative path assumes launching from CNTK's binary folder, e.g. x64\Release
    const std::string modelWorkingDirectory = path + "/../../Examples/Image/MNIST/Data/";
#else // on Linux
    path = app.substr(0, app.rfind("/"));
    pos = app.rfind("/");
    path = (pos == std::string::npos) ? "." : app.substr(0, pos);

    // This relative path assumes launching from CNTK's binary folder, e.g. build/release/bin/
    const std::string modelWorkingDirectory = path + "/../../../Examples/Image/MNIST/Data/";
#endif


    GetEvalF(&model);

    const std::string modelFilePath = modelWorkingDirectory + "../Output/Models/01_OneHidden";
@@ -110,9 +110,15 @@ namespace Microsoft.MSR.CNTK.Extensibility.Managed.CSEvalClient
        public static bool Evaluate(string record)
        {
            var model = Models.Take();
            var outcome = model.EvaluateRecord(record);
            Models.Add(model);
            return outcome;
            try
            {
                var outcome = model.EvaluateRecord(record);
                return outcome;
            }
            finally
            {
                Models.Add(model);
            }
        }

        /// <summary>
@@ -123,9 +129,15 @@ namespace Microsoft.MSR.CNTK.Extensibility.Managed.CSEvalClient
        public static List<float> Evaluate(List<float> inputs)
        {
            var model = Models.Take();
            var outcome = model.EvaluateInput(inputs);
            Models.Add(model);
            return outcome;
            try
            {
                var outcome = model.EvaluateInput(inputs);
                return outcome;
            }
            finally
            {
                Models.Add(model);
            }
        }

        /// <summary>
@@ -1,69 +1,69 @@
'
</s>
<s/>
<s>
A
B
C
D
E
F
G
H
I
J
K
L
M
N
O
P
Q
R
S
T
U
V
W
X
Y
Z
~AA
~AE
~AH
~AO
~AW
~AY
~B
~CH
~D
~DH
~EH
~ER
~EY
~F
~G
~HH
~IH
~IY
~JH
~K
~L
~M
~N
~NG
~OW
~OY
~P
~R
~S
~SH
~T
~TH
~UH
~UW
~V
~W
~Y
~Z
~ZH
@@ -18,9 +18,9 @@ ndlMacroDefine = [
]

LSTMPComponent(inputDim, outputDim, cellDim, inputx, cellDimX2, cellDimX3, cellDimX4) = [
    wx = Parameter(cellDimX4, inputDim, init="uniform", initValueScale=1);
    b = Parameter(cellDimX4, 1, init="fixedValue", value=0.0);
    Wh = Parameter(cellDimX4, outputDim, init="uniform", initValueScale=1);
    wx = Parameter(cellDimX4, 0, init="uniform", initValueScale=1);
    b = Parameter(cellDimX4, 1, init="fixedValue", value=0.0);
    Wh = Parameter(cellDimX4, 0, init="uniform", initValueScale=1);

    Wci = Parameter(cellDim, init="uniform", initValueScale=1);
    Wcf = Parameter(cellDim, init="uniform", initValueScale=1);
@@ -63,9 +63,9 @@ ndlMacroDefine = [
]

LSTMPComponentBetter(inputDim, outputDim, cellDim, inputx, cellDimX2, cellDimX3, cellDimX4) = [
    wx = Parameter(cellDimX4, inputDim, init="uniform", initValueScale=1);
    b = Parameter(cellDimX4, 1, init="fixedValue", value=0.0);
    Wh = Parameter(cellDimX4, outputDim, init="uniform", initValueScale=1);
    wx = Parameter(cellDimX4, 0, init="uniform", initValueScale=1);
    b = Parameter(cellDimX4, 1, init="fixedValue", value=0.0);
    Wh = Parameter(cellDimX4, 0, init="uniform", initValueScale=1);

    Wci = Parameter(cellDim, init="uniform", initValueScale=1);
    Wcf = Parameter(cellDim, init="uniform", initValueScale=1);
@@ -112,26 +112,26 @@ ndlMacroDefine = [
]

LSTMPComponentNaive(inputDim, outputDim, cellDim, inputx) = [
    Wxo = Parameter(cellDim, inputDim, init="uniform", initValueScale=1);
    Wxi = Parameter(cellDim, inputDim, init="uniform", initValueScale=1);
    Wxf = Parameter(cellDim, inputDim, init="uniform", initValueScale=1);
    Wxc = Parameter(cellDim, inputDim, init="uniform", initValueScale=1);
    Wxo = Parameter(cellDim, 0, init="uniform", initValueScale=1);
    Wxi = Parameter(cellDim, 0, init="uniform", initValueScale=1);
    Wxf = Parameter(cellDim, 0, init="uniform", initValueScale=1);
    Wxc = Parameter(cellDim, 0, init="uniform", initValueScale=1);

    bo = Parameter(cellDim, init="fixedValue", value=0.0);
    bc = Parameter(cellDim, init="fixedValue", value=0.0);
    bi = Parameter(cellDim, init="fixedValue", value=0.0);
    bf = Parameter(cellDim, init="fixedValue", value=0.0);

    Whi = Parameter(cellDim, outputDim, init="uniform", initValueScale=1);
    Whi = Parameter(cellDim, 0, init="uniform", initValueScale=1);

    Wci = Parameter(cellDim, init="uniform", initValueScale=1);


    Whf = Parameter(cellDim, outputDim, init="uniform", initValueScale=1);
    Whf = Parameter(cellDim, 0, init="uniform", initValueScale=1);
    Wcf = Parameter(cellDim, init="uniform", initValueScale=1);
    Who = Parameter(cellDim, outputDim, init="uniform", initValueScale=1);
    Who = Parameter(cellDim, 0, init="uniform", initValueScale=1);
    Wco = Parameter(cellDim, init="uniform", initValueScale=1);
    Whc = Parameter(cellDim, outputDim, init="uniform", initValueScale=1);
    Whc = Parameter(cellDim, 0, init="uniform", initValueScale=1);

    dh = PastValue(outputDim, output, timeStep=1);
    dc = PastValue(cellDim, ct, timeStep=1);
@@ -194,8 +194,8 @@ ndlCreateNetwork_LSTMP_c1024_p256_x3 = [
    # layer 3
    LSTMoutput3 = LSTMPComponent(hiddenDim, hiddenDim, cellDim, LSTMoutput2, cellDimX2, cellDimX3, cellDimX4);

    W = Parameter(labelDim, hiddenDim, init="uniform", initValueScale=1);
    b = Parameter(labelDim, 1, init="fixedValue", value=0);
    W = Parameter(labelDim, 0, init="uniform", initValueScale=1);
    b = Parameter(labelDim, 1, init="fixedValue", value=0);
    LSTMoutputW = Plus(Times(W, LSTMoutput3), b);

    ce = CrossEntropyWithSoftmax(labels, LSTMoutputW);
Makefile (203 changes)
@@ -17,8 +17,10 @@
# version for the CNTK custom MKL installation
# MKL_THREADING=parallel|sequential
# only needed if MATHLIB=mkl
# GDK_PATH= path to cuda gdk installation, so $(GDK_PATH)/include/nvidia/gdk/nvml.h exists
# defaults to /usr
# GDK_INCLUDE_PATH= path to CUDA GDK include path, so $(GDK_INCLUDE_PATH)/nvml.h exists
# defaults to /usr/include/nvidia/gdk
# GDK_NVML_LIB_PATH= path to CUDA GDK (stub) library path, so $(GDK_NVML_LIB_PATH)/libnvidia-ml.so exists
# defaults to /usr/src/gdk/nvml/lib
# MATHLIB= One of acml or mkl
# defaults to acml
# CUDA_PATH= Path to CUDA
@@ -29,10 +31,12 @@
# If not specified, CNTK will be built without cuDNN.
# KALDI_PATH= Path to Kaldi
# If not specified, Kaldi plugins will not be built
# OPENCV_PATH= path to OpenCV 3.0.0 installation, so $(OPENCV_PATH) exists
# defaults to /usr/local/opencv-3.0.0
# OPENCV_PATH= path to OpenCV 3.1.0 installation, so $(OPENCV_PATH) exists
# defaults to /usr/local/opencv-3.1.0
# LIBZIP_PATH= path to libzip installation, so $(LIBZIP_PATH) exists
# defaults to /usr/local/
# BOOST_PATH= path to Boost installation, so $(BOOST_PATH)/include/boost/test/unit_test.hpp
# defaults to /usr/local/boost-1.60.0
# These can be overridden on the command line, e.g. make BUILDTYPE=debug

# TODO: Build static libraries for common dependencies that are shared by multiple
@@ -71,7 +75,7 @@ INCLUDEPATH:= $(addprefix $(SOURCEDIR)/, Common/Include CNTKv2LibraryDll CNTKv2L
# COMMON_FLAGS include settings that are passed both to NVCC and C++ compilers.
COMMON_FLAGS:= -D_POSIX_SOURCE -D_XOPEN_SOURCE=600 -D__USE_XOPEN2K -std=c++11
CPPFLAGS:=
CXXFLAGS:= -msse3 -mssse3 -std=c++0x -fopenmp -fpermissive -fPIC -Werror -fcheck-new
CXXFLAGS:= -msse4.1 -mssse3 -std=c++0x -fopenmp -fpermissive -fPIC -Werror -fcheck-new
LIBPATH:=
LIBS:=
LDFLAGS:=
@@ -93,9 +97,14 @@ all : buildall
CUFLAGS = -m 64

ifdef CUDA_PATH
ifndef GDK_PATH
$(info defaulting GDK_PATH to /usr)
GDK_PATH=/usr
ifndef GDK_INCLUDE_PATH
GDK_INCLUDE_PATH=/usr/include/nvidia/gdk
$(info defaulting GDK_INCLUDE_PATH to $(GDK_INCLUDE_PATH))
endif

ifndef GDK_NVML_LIB_PATH
GDK_NVML_LIB_PATH=/usr/src/gdk/nvml/lib
$(info defaulting GDK_NVML_LIB_PATH to $(GDK_NVML_LIB_PATH))
endif

ifndef CUB_PATH
@@ -107,10 +116,8 @@ ifdef CUDA_PATH

NVCC = $(CUDA_PATH)/bin/nvcc

# This is a suggested/default location for NVML
INCLUDEPATH+=$(GDK_PATH)/include/nvidia/gdk
INCLUDEPATH+=$(GDK_INCLUDE_PATH)
INCLUDEPATH+=$(CUB_PATH)
NVMLLIBPATH=$(GDK_PATH)/src/gdk/nvml/lib

# Set up CUDA includes and libraries
INCLUDEPATH += $(CUDA_PATH)/include
@@ -328,7 +335,7 @@ $(CNTKMATH_LIB): $(MATH_OBJ)
	@echo $(SEPARATOR)
	@echo creating $@ for $(ARCH) with build type $(BUILDTYPE)
	@mkdir -p $(dir $@)
	$(CXX) $(LDFLAGS) -shared $(patsubst %,-L%, $(LIBPATH) $(NVMLLIBPATH)) $(patsubst %,$(RPATH)%, $(ORIGINDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -fopenmp
	$(CXX) $(LDFLAGS) -shared $(patsubst %,-L%, $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -fopenmp

########################################
# CNTKLibrary
@@ -368,13 +375,17 @@ SEQUENCE_TRAINING_LIB_SRC +=\
endif

CNTKLIBRARY_SRC =\
	$(SOURCEDIR)/CNTKv2LibraryDll/BackCompat.cpp \
	$(SOURCEDIR)/CNTKv2LibraryDll/Common.cpp \
	$(SOURCEDIR)/CNTKv2LibraryDll/Function.cpp \
	$(SOURCEDIR)/CNTKv2LibraryDll/MinibatchSource.cpp \
	$(SOURCEDIR)/CNTKv2LibraryDll/NDArrayView.cpp \
	$(SOURCEDIR)/CNTKv2LibraryDll/NDMask.cpp \
	$(SOURCEDIR)/CNTKv2LibraryDll/Trainer.cpp \
	$(SOURCEDIR)/CNTKv2LibraryDll/Utils.cpp \
	$(SOURCEDIR)/CNTKv2LibraryDll/Value.cpp \
	$(SOURCEDIR)/CNTKv2LibraryDll/Variable.cpp \
	$(SOURCEDIR)/CNTKv2LibraryDll/Learner.cpp \

CNTKLIBRARY_SRC+=$(CNTK_COMMON_SRC)
CNTKLIBRARY_SRC+=$(COMPUTATION_NETWORK_LIB_SRC)
@@ -393,7 +404,7 @@ $(CNTKLIBRARY_LIB): $(CNTKLIBRARY_OBJ) | $(CNTKMATH_LIB)
	@echo $(SEPARATOR)
	@mkdir -p $(dir $@)
	@echo building output for $(ARCH) with build type $(BUILDTYPE)
	$(CXX) $(LDFLAGS) -shared $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(NVMLLIBPATH)) $(patsubst %,$(RPATH)%, $(ORIGINDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKMATH)
	$(CXX) $(LDFLAGS) -shared $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKMATH)

########################################
# CNTKLibrary tests
@@ -405,6 +416,8 @@ CNTKLIBRARY_TESTS_SRC =\
	Tests/UnitTests/V2LibraryTests/NDArrayViewTests.cpp \
	Tests/UnitTests/V2LibraryTests/RecurrentFunctionTests.cpp \
	Tests/UnitTests/V2LibraryTests/TensorTests.cpp \
	Tests/UnitTests/V2LibraryTests/TrainerTests.cpp \
	Tests/UnitTests/V2LibraryTests/CifarResNet.cpp \

CNTKLIBRARY_TESTS:=$(BINDIR)/v2librarytests
CNTKLIBRARY_TESTS_OBJ := $(patsubst %.cu, $(OBJDIR)/%.o, $(patsubst %.cpp, $(OBJDIR)/%.o, $(CNTKLIBRARY_TESTS_SRC)))
@@ -416,7 +429,7 @@ $(CNTKLIBRARY_TESTS): $(CNTKLIBRARY_TESTS_OBJ) | $(CNTKLIBRARY_LIB)
	@echo $(SEPARATOR)
	@mkdir -p $(dir $@)
	@echo building output for $(ARCH) with build type $(BUILDTYPE)
	$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(NVMLLIBPATH)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKLIBRARY) -l$(CNTKMATH)
	$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKLIBRARY) -l$(CNTKMATH)

########################################
# LibEval
@@ -437,7 +450,7 @@ EVAL_SRC=\
	$(SOURCEDIR)/ActionsLib/NetworkFactory.cpp \
	$(SOURCEDIR)/ActionsLib/NetworkDescriptionLanguage.cpp \
	$(SOURCEDIR)/ActionsLib/SimpleNetworkBuilder.cpp \
	$(SOURCEDIR)/ActionsLib/NDLNetworkBuilder.cpp
	$(SOURCEDIR)/ActionsLib/NDLNetworkBuilder.cpp \

EVAL_SRC+=$(SGDLIB_SRC)
EVAL_SRC+=$(COMPUTATION_NETWORK_LIB_SRC)
@@ -450,11 +463,11 @@ EVAL_LIB:=$(LIBDIR)/lib$(EVAL).so
ALL+=$(EVAL_LIB)
SRC+=$(EVAL_SRC)

$(EVAL_LIB): $(EVAL_OBJ)
$(EVAL_LIB): $(EVAL_OBJ) | $(CNTKMATH_LIB)
	@echo $(SEPARATOR)
	@mkdir -p $(dir $@)
	@echo Building $(EVAL_LIB) for $(ARCH) with build type $(BUILDTYPE)
	$(CXX) $(LDFLAGS) -shared $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(NVMLLIBPATH)) $(patsubst %,$(RPATH)%, $(ORIGINDIR) $(LIBPATH)) -o $@ $^ $(LIBS)
	$(CXX) $(LDFLAGS) -shared $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKMATH)

########################################
# Eval Sample client
@@ -469,11 +482,11 @@ EVAL_SAMPLE_CLIENT_OBJ:=$(patsubst %.cpp, $(OBJDIR)/%.o, $(EVAL_SAMPLE_CLIENT_SR
ALL+=$(EVAL_SAMPLE_CLIENT)
SRC+=$(EVAL_SAMPLE_CLIENT_SRC)

$(EVAL_SAMPLE_CLIENT): $(EVAL_SAMPLE_CLIENT_OBJ) | $(EVAL_LIB) $(CNTKMATH_LIB)
$(EVAL_SAMPLE_CLIENT): $(EVAL_SAMPLE_CLIENT_OBJ) | $(EVAL_LIB)
	@echo $(SEPARATOR)
	@mkdir -p $(dir $@)
	@echo building $(EVAL_SAMPLE_CLIENT) for $(ARCH) with build type $(BUILDTYPE)
	$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ -l$(EVAL) -l$(CNTKMATH)
	$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(EVAL) -l$(CNTKMATH)

########################################
# BinaryReader plugin
@@ -770,7 +783,6 @@ CNTK_SRC =\
	$(SOURCEDIR)/ActionsLib/NDLNetworkBuilder.cpp \
	$(SOURCEDIR)/CNTK/BrainScript/BrainScriptEvaluator.cpp \
	$(SOURCEDIR)/CNTK/BrainScript/BrainScriptParser.cpp \
	$(SOURCEDIR)/CNTK/BrainScript/BrainScriptTest.cpp \

CNTK_SRC+=$(SGDLIB_SRC)
CNTK_SRC+=$(CNTK_COMMON_SRC)
@@ -787,7 +799,7 @@ $(CNTK): $(CNTK_OBJ) | $(CNTKMATH_LIB)
	@echo $(SEPARATOR)
	@mkdir -p $(dir $@)
	@echo building output for $(ARCH) with build type $(BUILDTYPE)
	$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(NVMLLIBPATH)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKMATH) -fopenmp
	$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKMATH) -fopenmp

# deployable resources: standard library of BS
CNTK_CORE_BS:=$(BINDIR)/cntk.core.bs
@@ -797,6 +809,151 @@ $(CNTK_CORE_BS): $(SOURCEDIR)/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs
	@echo bin-placing deployable resource files
	cp -f $^ $@

########################################
# Unit Tests
########################################

# only build unit tests when Boost is available
ifdef BOOST_PATH

INCLUDEPATH += $(BOOST_PATH)/include

BOOSTLIB_PATH = $(BOOST_PATH)/lib
BOOSTLIBS := -lboost_unit_test_framework -lboost_filesystem -lboost_system

UNITTEST_EVAL_SRC = \
	$(SOURCEDIR)/../Tests/UnitTests/EvalTests/EvalExtendedTests.cpp \
	$(SOURCEDIR)/../Tests/UnitTests/EvalTests/stdafx.cpp

UNITTEST_EVAL_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(UNITTEST_EVAL_SRC))

UNITTEST_EVAL := $(BINDIR)/evaltests

ALL += $(UNITTEST_EVAL)
SRC += $(UNITTEST_EVAL_SRC)

$(UNITTEST_EVAL) : $(UNITTEST_EVAL_OBJ) | $(EVAL_LIB) $(CNTKMATH_LIB)
	@echo $(SEPARATOR)
	@mkdir -p $(dir $@)
	@echo building $@ for $(ARCH) with build type $(BUILDTYPE)
	$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH) $(BOOSTLIB_PATH)) $(patsubst %, $(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH) $(BOOSTLIB_PATH)) -o $@ $^ $(BOOSTLIBS) $(LIBS) -l$(EVAL) -l$(CNTKMATH)

#TODO: create project specific makefile or rules to avoid adding project specific path to the global path
INCLUDEPATH += $(SOURCEDIR)/Readers/CNTKTextFormatReader

UNITTEST_READER_SRC = \
	$(SOURCEDIR)/../Tests/UnitTests/ReaderTests/CNTKTextFormatReaderTests.cpp \
	$(SOURCEDIR)/../Tests/UnitTests/ReaderTests/HTKLMFReaderTests.cpp \
	$(SOURCEDIR)/../Tests/UnitTests/ReaderTests/ImageReaderTests.cpp \
	$(SOURCEDIR)/../Tests/UnitTests/ReaderTests/ReaderLibTests.cpp \
	$(SOURCEDIR)/../Tests/UnitTests/ReaderTests/UCIFastReaderTests.cpp \
	$(SOURCEDIR)/../Tests/UnitTests/ReaderTests/stdafx.cpp \
	$(SOURCEDIR)/Readers/CNTKTextFormatReader/Indexer.cpp \
	$(SOURCEDIR)/Readers/CNTKTextFormatReader/TextParser.cpp \

UNITTEST_READER_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(UNITTEST_READER_SRC))

UNITTEST_READER := $(BINDIR)/readertests

ALL += $(UNITTEST_READER)
SRC += $(UNITTEST_READER_SRC)

$(UNITTEST_READER): $(UNITTEST_READER_OBJ) | $(HTKMLFREADER) $(HTKDESERIALIZERS) $(UCIFASTREADER) $(COMPOSITEDATAREADER) $(IMAGEREADER) $(CNTKMATH_LIB)
	@echo $(SEPARATOR)
	@mkdir -p $(dir $@)
	@echo building $@ for $(ARCH) with build type $(BUILDTYPE)
	$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(BOOSTLIB_PATH)) $(patsubst %, $(RPATH)%, $(ORIGINLIBDIR) $(BOOSTLIB_PATH)) -o $@ $^ $(BOOSTLIBS) -l$(CNTKMATH) -ldl

UNITTEST_NETWORK_SRC = \
	$(SOURCEDIR)/../Tests/UnitTests/NetworkTests/OperatorEvaluation.cpp \
	$(SOURCEDIR)/../Tests/UnitTests/NetworkTests/stdafx.cpp \
	$(SOURCEDIR)/CNTK/ModelEditLanguage.cpp \
	$(SOURCEDIR)/ActionsLib/TrainActions.cpp \
	$(SOURCEDIR)/ActionsLib/EvalActions.cpp \
	$(SOURCEDIR)/ActionsLib/OtherActions.cpp \
	$(SOURCEDIR)/ActionsLib/SpecialPurposeActions.cpp \
	$(SOURCEDIR)/ActionsLib/NetworkFactory.cpp \
	$(SOURCEDIR)/ActionsLib/NetworkDescriptionLanguage.cpp \
	$(SOURCEDIR)/ActionsLib/SimpleNetworkBuilder.cpp \
	$(SOURCEDIR)/ActionsLib/NDLNetworkBuilder.cpp \
	$(SOURCEDIR)/CNTK/BrainScript/BrainScriptEvaluator.cpp \
	$(SOURCEDIR)/CNTK/BrainScript/BrainScriptParser.cpp \

UNITTEST_NETWORK_SRC += $(COMPUTATION_NETWORK_LIB_SRC)
UNITTEST_NETWORK_SRC += $(CNTK_COMMON_SRC)
UNITTEST_NETWORK_SRC += $(SEQUENCE_TRAINING_LIB_SRC)
UNITTEST_NETWORK_SRC += $(SGDLIB_SRC)

UNITTEST_NETWORK_OBJ := $(patsubst %.cu, $(OBJDIR)/%.o, $(patsubst %.cpp, $(OBJDIR)/%.o, $(UNITTEST_NETWORK_SRC)))

UNITTEST_NETWORK := $(BINDIR)/networktests

ALL += $(UNITTEST_NETWORK)
SRC += $(UNITTEST_NETWORK_SRC)

$(UNITTEST_NETWORK): $(UNITTEST_NETWORK_OBJ) | $(CNTKMATH_LIB) $(CNTKTEXTFORMATREADER)
	@echo $(SEPARATOR)
	@mkdir -p $(dir $@)
	@echo building $@ for $(ARCH) with build type $(BUILDTYPE)
	$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH) $(BOOSTLIB_PATH)) $(patsubst %, $(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH) $(BOOSTLIB_PATH)) -o $@ $^ $(BOOSTLIBS) $(LIBS) -l$(CNTKMATH) -fopenmp

UNITTEST_MATH_SRC = \
	$(SOURCEDIR)/../Tests/UnitTests/MathTests/BatchNormalizationEngineTests.cpp \
	$(SOURCEDIR)/../Tests/UnitTests/MathTests/BlockMultiplierTests.cpp \
	$(SOURCEDIR)/../Tests/UnitTests/MathTests/constants.cpp \
	$(SOURCEDIR)/../Tests/UnitTests/MathTests/ConvolutionEngineTests.cpp \
	$(SOURCEDIR)/../Tests/UnitTests/MathTests/CPUMatrixTests.cpp \
	$(SOURCEDIR)/../Tests/UnitTests/MathTests/CPUSparseMatrixTests.cpp \
	$(SOURCEDIR)/../Tests/UnitTests/MathTests/fixtures.cpp \
	$(SOURCEDIR)/../Tests/UnitTests/MathTests/GPUMatrixCudaBlasTests.cpp \
	$(SOURCEDIR)/../Tests/UnitTests/MathTests/GPUMatrixTests.cpp \
	$(SOURCEDIR)/../Tests/UnitTests/MathTests/GPUSparseMatrixTests.cpp \
	$(SOURCEDIR)/../Tests/UnitTests/MathTests/MatrixBlasTests.cpp \
	$(SOURCEDIR)/../Tests/UnitTests/MathTests/MatrixDataSynchronizationTests.cpp \
	$(SOURCEDIR)/../Tests/UnitTests/MathTests/MatrixFileWriteReadTests.cpp \
	$(SOURCEDIR)/../Tests/UnitTests/MathTests/MatrixQuantizerTests.cpp \
	$(SOURCEDIR)/../Tests/UnitTests/MathTests/MatrixSparseDenseInteractionsTests.cpp \
	$(SOURCEDIR)/../Tests/UnitTests/MathTests/MatrixTests.cpp \
	$(SOURCEDIR)/../Tests/UnitTests/MathTests/stdafx.cpp \

UNITTEST_MATH_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(UNITTEST_MATH_SRC))

UNITTEST_MATH := $(BINDIR)/mathtests

ALL += $(UNITTEST_MATH)
SRC += $(UNITTEST_MATH_SRC)

$(UNITTEST_MATH): $(UNITTEST_MATH_OBJ) | $(CNTKMATH_LIB)
	@echo $(SEPARATOR)
	@mkdir -p $(dir $@)
	@echo building $@ for $(ARCH) with build type $(BUILDTYPE)
	$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH) $(BOOSTLIB_PATH)) $(patsubst %, $(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH) $(BOOSTLIB_PATH)) -o $@ $^ $(BOOSTLIBS) $(LIBS) -l$(CNTKMATH) -ldl -fopenmp

UNITTEST_BRAINSCRIPT_SRC = \
	$(SOURCEDIR)/CNTK/BrainScript/BrainScriptEvaluator.cpp \
	$(SOURCEDIR)/CNTK/BrainScript/BrainScriptParser.cpp \
	$(SOURCEDIR)/../Tests/UnitTests/BrainScriptTests/ParserTests.cpp \
	$(SOURCEDIR)/../Tests/UnitTests/BrainScriptTests/stdafx.cpp

UNITTEST_BRAINSCRIPT_SRC+=$(COMMON_SRC)

UNITTEST_BRAINSCRIPT_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(UNITTEST_BRAINSCRIPT_SRC))

UNITTEST_BRAINSCRIPT := $(BINDIR)/brainscripttests

ALL += $(UNITTEST_BRAINSCRIPT)
SRC += $(UNITTEST_BRAINSCRIPT_SRC)

$(UNITTEST_BRAINSCRIPT): $(UNITTEST_BRAINSCRIPT_OBJ)
	@echo $(SEPARATOR)
	@mkdir -p $(dir $@)
	@echo building $@ for $(ARCH) with build type $(BUILDTYPE)
	$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH) $(BOOSTLIB_PATH)) $(patsubst %, $(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH) $(BOOSTLIB_PATH)) -o $@ $^ $(BOOSTLIBS) $(LIBS) -ldl

unittests: $(UNITTEST_EVAL) $(UNITTEST_READER) $(UNITTEST_NETWORK) $(UNITTEST_MATH) $(UNITTEST_BRAINSCRIPT)

endif

########################################
# General compile and dependency rules
########################################
@@ -821,13 +978,13 @@ $(OBJDIR)/%.o : %.cu $(BUILD_CONFIGURATION)
	@mkdir -p $(dir $@)
	$(NVCC) -c $< -o $@ $(COMMON_FLAGS) $(CUFLAGS) $(INCLUDEPATH:%=-I%) -Xcompiler "-fPIC -Werror"

$(OBJDIR)/%.o : %.cpp $(BUILD_CONFIGURATION)
	@echo $(SEPARATOR)
	@echo creating $@ for $(ARCH) with build type $(BUILDTYPE)
	@mkdir -p $(dir $@)
	$(CXX) -c $< -o $@ $(COMMON_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(INCLUDEPATH:%=-I%) -MD -MP -MF ${@:.o=.d}

.PHONY: clean buildall all
.PHONY: clean buildall all unittests

clean:
	@echo $(SEPARATOR)
README.md (11 changes)
@@ -1,6 +1,11 @@
# CNTK

## Latest news
*2016-07-15.* V 1.6 Binary release
CNTK v.1.6 binaries are on the [CNTK Releases page](https://github.com/Microsoft/CNTK/releases)

*2016-07-12.* We have further expanded Licensing options for CNTK 1bit-SGD and related components. See the details at the [Wiki page](https://github.com/microsoft/cntk/wiki/CNTK-1bit-SGD-License). These new options are an extension of the new CNTK 1bit-SGD License that we announced on Jun 23, 2016.

*2016-07-05.* CNTK now supports *Deconvolution* and *Unpooling*. See the usage example in Network number 4 in the [MNIST Sample](https://github.com/Microsoft/CNTK/blob/master/Examples/Image/MNIST/README.md).

*2016-06-23.* New License Terms for CNTK 1bit-SGD and related components.
@@ -8,12 +13,6 @@ Effective immediately the License Terms for CNTK 1bit-SGD and related components

*2016-06-20.* A [post](http://itpeernetwork.intel.com/accelerating-the-computational-network-tool-kit-with-intel-mkl/) on Intel MKL and CNTK is published in the [Intel IT Peer Network](http://itpeernetwork.intel.com/accelerating-the-computational-network-tool-kit-with-intel-mkl/)

*2016-06-16.* V 1.5 Binary release. NuGet Package with CNTK Model Evaluation Libraries.
NuGet Package is added to CNTK v.1.5 binaries. See [CNTK Releases page](https://github.com/Microsoft/CNTK/releases) and [NuGet Package description](https://github.com/Microsoft/CNTK/wiki/Nuget-Package-for-Evaluation).

*2016-06-15.* CNTK now supports building against a custom Intel® Math Kernel Library (MKL).
See [setup instructions](https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-your-machine) on how to set this up for your platform.

See [all news](https://github.com/Microsoft/CNTK/wiki/News).

## What is CNTK
@@ -0,0 +1,24 @@
This directory contains various scripts that help with using different components of CNTK.

### CNTK Text format Converters
Two Python scripts for converting data to the CNTK Text format, for use as input to the CNTK Text Format Reader (see https://github.com/microsoft/cnTK/wiki/CNTKTextFormat-Reader).
```
txt2ctf.py
```
Converts a set of dictionary files and a plain text file to the CNTK Text format. Run ```python txt2ctf.py -h``` to see usage instructions. See the comments at the beginning of the script file for a specific usage example.

```
uci2ctf.py
```
Converts data stored in a text file in UCI format to the CNTK Text format. Run ```python uci2ctf.py -h``` to see usage instructions. Also see the usage example below:
```
python Scripts/uci2ctf.py --input_file Examples/Image/MNIST/Data/Train-28x28.txt --features_start 1 --features_dim 784 --labels_start 0 --labels_dim 1 --num_labels 10 --output_file Examples/Image/MNIST/Data/Train-28x28_cntk_text.txt
```
```input_file``` – original dataset in the (columnar) UCI format
```features_start``` – index of the first feature column (start parameter in the UCIFastReader config, see https://github.com/Microsoft/CNTK/wiki/UCI-Fast-Reader)
```features_dim``` – number of feature columns (dim parameter in the UCIFastReader config)
```labels_start``` – index of the first label column
```labels_dim``` – number of label columns
```num_labels``` – number of possible label values (labelDim parameter in the UCIFastReader config)
```output_file``` – path and filename of the resulting dataset
@@ -1 +1 @@
Subproject commit c9821dd5565d4654841eaba819b655c9db2fe85b
Subproject commit f7afb8c6a08a6652d84de1b62377175788be5284
@@ -149,11 +149,11 @@ void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wst
int forcedRandomSeed = node->GetOptionalParameter("randomSeed", "-1" /*disabled*/);

if (EqualCI(initString, L"fixedValue"))
nodePtr->Value().SetValue(value);
m_net->InitLearnableParameters(nodePtr, L"fixedValue", value);
else if (EqualCI(initString, L"uniform"))
m_net->InitLearnableParameters(nodePtr, true, forcedRandomSeed < 0 ? randomSeed++ : (unsigned long) forcedRandomSeed, initValueScale, initOnCPUOnly);
m_net->InitLearnableParameters(nodePtr, L"uniform", initValueScale, forcedRandomSeed < 0 ? randomSeed++ : (unsigned long)forcedRandomSeed, initOnCPUOnly);
else if (EqualCI(initString, L"gaussian"))
m_net->InitLearnableParameters(nodePtr, false, forcedRandomSeed < 0 ? randomSeed++ : (unsigned long) forcedRandomSeed, initValueScale, initOnCPUOnly);
m_net->InitLearnableParameters(nodePtr, L"gaussian", initValueScale, forcedRandomSeed < 0 ? randomSeed++ : (unsigned long)forcedRandomSeed, initOnCPUOnly);
else if (EqualCI(initString, L"fromFile"))
{
std::string initFromFilePath = node->GetOptionalParameter("initFromFilePath", "");
@@ -167,7 +167,7 @@ void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wst
dynamic_pointer_cast<LearnableParameter<ElemType>>(nodePtr)->InitFromFile(msra::strfun::utf16(initFromFilePath));
}
else
RuntimeError("'init' must be one of the values of [ uniform | gaussian | fixedValue ]");
RuntimeError("'init' must be one of the values of [ uniform | gaussian | fixedValue | fromFile ]");
}
}
else if (cnNodeType == L"Constant")
@@ -186,7 +186,7 @@ void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wst
else if (pass == ndlPassFinal || nodePtr->Value().GetNumElements() != 0)
{
ElemType val = parameter[0]->GetScalar();
nodePtr->Value().SetValue(val);
m_net->InitLearnableParameters(nodePtr, L"fixedValue", val);
}
}
else if (cnNodeType == L"RowSlice") // Note: This now maps onto SliceNode which specifies the end differently.
@@ -304,7 +304,7 @@ void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wst
"1. 2D convolution which takes 7 fixed parameters [weightNodeName, inputValueNodeName, kernelWidth, kernelHeight, outputChannels, horizontalSubsample, verticalSubsample] \n"
"and two optional parameters [zeroPadding = [false|yourvalue], maxTempMemSizeInSamples = [0|yourvalue], imageLayout = \"HWC\"|\"cudnn\"]. \n"
"2. ND convolution which takes 3 fixed parameters [weightNodeName, inputValueNodeName, kernelShape] and \n"
"10 optional parameters [mapCount = [1|yourvalue], stride = [1|yourvalue], sharing = [true|yourvalue], autoPadding = [true|yourvalue], lowerPad = [0|yourvalue], upperPad = [0|yourvalue], bool transpose = [false|yourvalue], maxTempMemSizeInSamples = [0|yourvalue], imageLayout = \"cudnn\"|\"HWC\"]. \n"
"10 optional parameters [mapCount = [0|yourvalue], stride = [1|yourvalue], sharing = [true|yourvalue], autoPadding = [true|yourvalue], lowerPad = [0|yourvalue], upperPad = [0|yourvalue], bool transpose = [false|yourvalue], maxTempMemSizeInSamples = [0|yourvalue], imageLayout = \"cudnn\"|\"HWC\"]. \n"
"For ND convolution, parameters kernelShape, mapCount, stride, sharing, autoPadding, lowerPad, upperPad can be arrays, e.g. kernelShape={5, 5, 3}",
cnNodeType.c_str(), cnNodeType.c_str());
}
@@ -380,7 +380,7 @@ void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wst
};

auto kernelShape = paramGetter(reqParams.size() - 1);
auto mapCount = paramResolver("mapCount", 1);
auto mapCount = paramResolver("mapCount", 0);
auto stride = paramResolver("stride", 1);
auto sharing = boolParamResolver("sharing", true);
auto autoPad = boolParamResolver("autoPadding", true);
@@ -158,12 +158,12 @@ bool CheckFunction(std::string& p_nodeType, bool* allowUndeterminedVariable)
else if (EqualInsensitive(nodeType, OperationNameOf(CRFNode), L"CRF")) ret = true;
#endif
else if (EqualInsensitive(nodeType, OperationNameOf(ClassBasedCrossEntropyWithSoftmaxNode), L"CBCEWithSM")) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(EqualNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(GreaterEqualNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(GreaterNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(LessEqualNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(LessNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(NotEqualNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(EqualNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(GreaterEqualNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(GreaterNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(LessEqualNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(LessNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(NotEqualNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(ClipNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(ConvolutionNode), L"Convolve")) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(PoolingNode))) ret = true;
@@ -263,4 +263,5 @@ template class NDLNode<double>;

template class NDLScript<float>;
template class NDLScript<double>;
} } }

}}}
|
@ -95,8 +95,9 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildFFDNNFromDescription(
|
|||
if (numHiddenLayers > 0)
|
||||
{
|
||||
w = builder.CreateLearnableParameter(L"W0", m_layerSizes[1], m_layerSizes[0]);
|
||||
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
|
||||
m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
|
||||
b = builder.CreateLearnableParameter(L"B0", m_layerSizes[1], 1);
|
||||
m_net->InitLearnableParameters(b, L"fixedValue", 0);
|
||||
output = ApplyNonlinearFunction(builder.Plus(builder.Times(w, input, 1, L"W0*features"), b, L"W0*features+B0"), 0, L"H1");
|
||||
|
||||
if (m_addDropoutNodes)
|
||||
|
@ -114,8 +115,9 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildFFDNNFromDescription(
|
|||
wstring nameOfH = msra::strfun::wstrprintf(L"H%d", i + 1);
|
||||
|
||||
w = builder.CreateLearnableParameter(nameOfW, m_layerSizes[i + 1], m_layerSizes[i]);
|
||||
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
|
||||
m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
|
||||
b = builder.CreateLearnableParameter(nameOfB, m_layerSizes[i + 1], 1);
|
||||
m_net->InitLearnableParameters(b, L"fixedValue", 0);
|
||||
output = ApplyNonlinearFunction(builder.Plus(builder.Times(w, input, 1, nameOfTimes), b, nameOfPlus), i, nameOfH);
|
||||
|
||||
if (m_addDropoutNodes)
|
||||
|
@ -132,8 +134,9 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildFFDNNFromDescription(
|
|||
wstring nameOfPlus = nameOfTimes + L"+" + nameOfB;
|
||||
|
||||
w = builder.CreateLearnableParameter(nameOfW, m_layerSizes[numHiddenLayers + 1], m_layerSizes[numHiddenLayers]);
|
||||
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
|
||||
m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
|
||||
b = builder.CreateLearnableParameter(nameOfB, m_layerSizes[numHiddenLayers + 1], 1);
|
||||
m_net->InitLearnableParameters(b, L"fixedValue", 0);
|
||||
output = builder.Plus(builder.Times(w, input, 1, nameOfTimes), b, nameOfPlus);
|
||||
m_net->RenameNode(output, L"HLast");
|
||||
|
||||
|
@ -198,12 +201,12 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildRNNFromDescription()
|
|||
{
|
||||
// TODO: to figure out sparse matrix size
|
||||
u = builder.CreateLearnableParameter(L"U0", m_layerSizes[1], m_layerSizes[0]);
|
||||
m_net->InitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
|
||||
m_net->RandomInitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
|
||||
|
||||
if (m_recurrentLayers.size() > 0 && m_recurrentLayers[recur_idx] == 1)
|
||||
{
|
||||
w = builder.CreateLearnableParameter(L"W0", m_layerSizes[1], m_layerSizes[1]);
|
||||
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
|
||||
m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
|
||||
|
||||
pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[1], 1);
|
||||
// unless there is a good algorithm to detect loops, use this explicit setup
|
||||
|
@ -230,12 +233,12 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildRNNFromDescription()
|
|||
{
|
||||
// TODO: to figure out sparse matrix size
|
||||
u = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"U%d", i), m_layerSizes[i + 1], m_layerSizes[i]);
|
||||
m_net->InitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
|
||||
m_net->RandomInitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
|
||||
|
||||
if (m_recurrentLayers.size() > 0 && m_recurrentLayers[recur_idx] == i + 1)
|
||||
{
|
||||
w = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"W%d", i), m_layerSizes[i + 1], m_layerSizes[i + 1]);
|
||||
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
|
||||
m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
|
||||
|
||||
pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, (size_t) m_layerSizes[i + 1], 1);
|
||||
// unless there is a good algorithm to detect loops, use this explicit setup
|
||||
|
@ -259,7 +262,7 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildRNNFromDescription()
|
|||
}
|
||||
|
||||
w = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"W%d", numHiddenLayers), m_layerSizes[numHiddenLayers + 1], m_layerSizes[numHiddenLayers]);
|
||||
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
|
||||
m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
|
||||
/*m_net->MatrixL2Reg(w , L"L1w");*/
|
||||
|
||||
label = builder.CreateInputNode(L"labels", m_layerSizes[numHiddenLayers + 1]);
|
||||
|
@ -311,12 +314,12 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildClassEntropyRNNFromDe
|
|||
if (numHiddenLayers > 0)
|
||||
{
|
||||
u = builder.CreateLearnableParameter(L"U0", m_layerSizes[1], m_layerSizes[0]);
|
||||
m_net->InitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
|
||||
m_net->RandomInitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
|
||||
|
||||
if (m_recurrentLayers.size() > 0 && m_recurrentLayers[recur_idx] == 1)
|
||||
{
|
||||
w = builder.CreateLearnableParameter(L"W0", m_layerSizes[1], m_layerSizes[1]);
|
||||
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
|
||||
m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
|
||||
|
||||
pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[1], 1);
|
||||
// unless there is a good algorithm to detect loops, use this explicit setup
|
||||
|
@ -330,7 +333,7 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildClassEntropyRNNFromDe
|
|||
else
|
||||
{
|
||||
b = builder.CreateLearnableParameter(L"B0", m_layerSizes[1], 1);
|
||||
m_net->InitLearnableParameters(b, m_uniformInit, randomSeed++, m_initValueScale);
|
||||
m_net->RandomInitLearnableParameters(b, m_uniformInit, randomSeed++, m_initValueScale);
|
||||
output = SimpleNetworkBuilder<ElemType>::ApplyNonlinearFunction(builder.Plus(builder.Times(u, input), b), 0);
|
||||
}
|
||||
|
||||
|
@ -342,11 +345,11 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildClassEntropyRNNFromDe
|
|||
for (int i = 1; i < numHiddenLayers; i++)
|
||||
{
|
||||
u = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"U%d", i), m_layerSizes[i + 1], m_layerSizes[i]);
|
||||
m_net->InitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
|
||||
m_net->RandomInitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
|
||||
if (m_recurrentLayers.size() > 0 && m_recurrentLayers[recur_idx] == i + 1)
|
||||
{
|
||||
w = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"W%d", i), m_layerSizes[i + 1], m_layerSizes[i + 1]);
|
||||
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
|
||||
m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
|
||||
|
||||
pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, (size_t) m_layerSizes[i + 1], 1);
|
||||
// unless there is a good algorithm to detect loops, use this explicit setup
|
||||
|
@ -373,13 +376,13 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildClassEntropyRNNFromDe
|
|||
// e.g., [200 x 10000], where 10000 is the vocabulary size
|
||||
// this is for speed-up issue as per word matrix can be simply obtained using column slice
|
||||
w = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"W%d", numHiddenLayers), m_layerSizes[numHiddenLayers], m_layerSizes[numHiddenLayers + 1]);
|
||||
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
|
||||
m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
|
||||
|
||||
// the label is a dense matrix. each element is the word index
|
||||
label = builder.CreateInputNode(L"labels", 4);
|
||||
|
||||
clsweight = builder.CreateLearnableParameter(L"WeightForClassPostProb", m_nbrCls, m_layerSizes[numHiddenLayers]);
|
||||
m_net->InitLearnableParameters(clsweight, m_uniformInit, randomSeed++, m_initValueScale);
|
||||
m_net->RandomInitLearnableParameters(clsweight, m_uniformInit, randomSeed++, m_initValueScale);
|
||||
clslogpostprob = builder.Times(clsweight, input, 1, L"ClassPostProb");
|
||||
|
||||
output = AddTrainAndEvalCriterionNodes(input, label, w, L"TrainNodeClassBasedCrossEntropy", L"EvalNodeClassBasedCrossEntrpy",
|
||||
|
@ -428,7 +431,7 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildConditionalLSTMNetwor
|
|||
if (m_lookupTableOrder > 0)
|
||||
{
|
||||
e = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"E%d", 0), m_layerSizes[1], m_layerSizes[0] / m_lookupTableOrder);
|
||||
m_net->InitLearnableParameters(e, m_uniformInit, randomSeed++, m_initValueScale);
|
||||
m_net->RandomInitLearnableParameters(e, m_uniformInit, randomSeed++, m_initValueScale);
|
||||
output = builder.LookupTable(e, input, L"LookupTable");
|
||||
|
||||
if (m_addDropoutNodes)
|
||||
|
@ -464,9 +467,8 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildConditionalLSTMNetwor
|
|||
// serve as a global bias term
|
||||
gt = builder.CreateInputNode(L"binaryFeature", m_auxFeatDim);
|
||||
m_net->AddToNodeGroup(L"feature", gt);
|
||||
e = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"AuxTrans%d", 0),
|
||||
m_layerSizes[numHiddenLayers], m_auxFeatDim);
|
||||
m_net->InitLearnableParameters(e, m_uniformInit, randomSeed++, m_initValueScale);
|
||||
e = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"AuxTrans%d", 0), m_layerSizes[numHiddenLayers], m_auxFeatDim);
|
||||
m_net->RandomInitLearnableParameters(e, m_uniformInit, randomSeed++, m_initValueScale);
|
||||
u = ApplyNonlinearFunction(builder.Times(e, gt), numHiddenLayers, L"TimesToGetGlobalBias");
|
||||
output = builder.Plus(input, u, L"PlusGlobalBias");
|
||||
input = output;
|
||||
|
@ -475,13 +477,13 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildConditionalLSTMNetwor
|
|||
// e.g., [200 x 10000], where 10000 is the vocabulary size
|
||||
// this is for speed-up issue as per word matrix can be simply obtained using column slice
|
||||
w = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"W%d", numHiddenLayers), m_layerSizes[numHiddenLayers], m_layerSizes[numHiddenLayers + 1]);
|
||||
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
|
||||
m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
|
||||
|
||||
// the label is a dense matrix. each element is the word index
|
||||
label = builder.CreateInputNode(L"labels", 4);
|
||||
|
||||
clsweight = builder.CreateLearnableParameter(L"WeightForClassPostProb", m_nbrCls, m_layerSizes[numHiddenLayers]);
|
||||
m_net->InitLearnableParameters(clsweight, m_uniformInit, randomSeed++, m_initValueScale);
|
||||
m_net->RandomInitLearnableParameters(clsweight, m_uniformInit, randomSeed++, m_initValueScale);
|
||||
clslogpostprob = builder.Times(clsweight, input, 1, L"ClassPostProb");
|
||||
|
||||
output = AddTrainAndEvalCriterionNodes(input, label, w, L"TrainNodeClassBasedCrossEntropy", L"EvalNodeClassBasedCrossEntrpy",
|
||||
|
@@ -535,7 +537,7 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildLogBilinearNetworkFro
 if (m_lookupTableOrder > 0)
 {
 e = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"E%d", 0), m_layerSizes[1], m_layerSizes[0] / m_lookupTableOrder);
-m_net->InitLearnableParameters(e, m_uniformInit, randomSeed++, m_initValueScale);
+m_net->RandomInitLearnableParameters(e, m_uniformInit, randomSeed++, m_initValueScale);
 output = builder.LookupTable(e, input, L"Lookuptatble");

 if (m_addDropoutNodes)

@@ -556,7 +558,7 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildLogBilinearNetworkFro
 pastValueXI->AttachInputs({ input });
 // TODO: to figure out sparse matrix size
 Wxi = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"DD%d", ik), m_layerSizes[0], m_layerSizes[0]);
-m_net->InitLearnableParameters(Wxi, m_uniformInit, randomSeed++, m_initValueScale);
+m_net->RandomInitLearnableParameters(Wxi, m_uniformInit, randomSeed++, m_initValueScale);

 it = builder.Plus(output, builder.Times(Wxi, pastValueXI));
 output = it;

@@ -572,13 +574,13 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildLogBilinearNetworkFro
 for (int i = m_lookupTableOrder > 0 ? 1 : 0; i < numHiddenLayers; i++)
 {
 u = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"U%d", i), m_layerSizes[i + 1], m_layerSizes[i] * (m_lookupTableOrder > 0 ? m_lookupTableOrder : 1));
-m_net->InitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
+m_net->RandomInitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
 output = builder.Times(u, input);
 input = output;
 if (m_recurrentLayers.size() > 0 && m_recurrentLayers[recur_idx] == i + 1)
 {
 w = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"R%d", i + 1), m_layerSizes[i + 1], m_layerSizes[i + 1]);
-m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
+m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
 pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[i + 1], 1);
 output = builder.Plus(builder.Times(w, pastValue), input);

@@ -588,6 +590,7 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildLogBilinearNetworkFro
 }

 bi = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"bi%d", i), m_layerSizes[i + 1], 1);
+m_net->InitLearnableParameters(bi, L"fixedValue", 0);
 output = builder.Plus(input, bi);

 if (m_addDropoutNodes)

@@ -597,7 +600,7 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildLogBilinearNetworkFro
 }

 w = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"W%d", numHiddenLayers), m_layerSizes[numHiddenLayers + 1], m_layerSizes[numHiddenLayers]);
-m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
+m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);

 label = builder.CreateInputNode(L"labels", m_layerSizes[numHiddenLayers + 1]);
 AddTrainAndEvalCriterionNodes(input, label, w);

@@ -650,6 +653,7 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildDNNLMNetworkFromDescr
 if (numHiddenLayers > 0)
 {
 bi = builder.CreateLearnableParameter(L"bi0", m_layerSizes[1], 1);
+m_net->InitLearnableParameters(bi, L"fixedValue", 0);

 pastValueXI = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[0], 1);
 pastValueXII = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[0], 2);

@@ -664,19 +668,19 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildDNNLMNetworkFromDescr
 {
 // TODO: to figure out sparse matrix size
 Wxi2 = builder.CreateLearnableParameter(L"WXI2", m_layerSizes[1], m_layerSizes[0]);
-m_net->InitLearnableParameters(Wxi2, m_uniformInit, randomSeed++, m_initValueScale);
+m_net->RandomInitLearnableParameters(Wxi2, m_uniformInit, randomSeed++, m_initValueScale);
 // TODO: to figure out sparse matrix size
 Wxi3 = builder.CreateLearnableParameter(L"WXI3", m_layerSizes[1], m_layerSizes[0]);
-m_net->InitLearnableParameters(Wxi3, m_uniformInit, randomSeed++, m_initValueScale);
+m_net->RandomInitLearnableParameters(Wxi3, m_uniformInit, randomSeed++, m_initValueScale);
 // TODO: to figure out sparse matrix size
 Wxi4 = builder.CreateLearnableParameter(L"WXI4", m_layerSizes[1], m_layerSizes[0]);
-m_net->InitLearnableParameters(Wxi4, m_uniformInit, randomSeed++, m_initValueScale);
+m_net->RandomInitLearnableParameters(Wxi4, m_uniformInit, randomSeed++, m_initValueScale);
 // TODO: to figure out sparse matrix size
 Wxi1 = builder.CreateLearnableParameter(L"WXI1", m_layerSizes[1], m_layerSizes[0]);
-m_net->InitLearnableParameters(Wxi1, m_uniformInit, randomSeed++, m_initValueScale);
+m_net->RandomInitLearnableParameters(Wxi1, m_uniformInit, randomSeed++, m_initValueScale);
 // TODO: to figure out sparse matrix size
 Wxi = builder.CreateLearnableParameter(L"WXI", m_layerSizes[1], m_layerSizes[0]);
-m_net->InitLearnableParameters(Wxi, m_uniformInit, randomSeed++, m_initValueScale);
+m_net->RandomInitLearnableParameters(Wxi, m_uniformInit, randomSeed++, m_initValueScale);

 // unless there is a good algorithm to detect loops, use this explicit setup
 it = builder.Plus(

@@ -711,11 +715,11 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildDNNLMNetworkFromDescr
 for (int i = 1; i < numHiddenLayers; i++)
 {
 u = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"U%d", i), m_layerSizes[i + 1], m_layerSizes[i]);
-m_net->InitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
+m_net->RandomInitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
 if (m_recurrentLayers.size() > 0 && m_recurrentLayers[recur_idx] == i + 1)
 {
 w = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"W%d", i), m_layerSizes[i + 1], m_layerSizes[i + 1]);
-m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
+m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
 std::list<ComputationNodeBasePtr> recurrent_loop;
 pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[i + 1], 1);
 output = SimpleNetworkBuilder<ElemType>::ApplyNonlinearFunction(builder.Plus(builder.Times(u, input), builder.Times(w, pastValue)), i);

@@ -736,8 +740,9 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildDNNLMNetworkFromDescr

 // TODO: to figure out sparse matrix size
 w = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"W%d", numHiddenLayers), m_layerSizes[numHiddenLayers + 1], m_layerSizes[numHiddenLayers]);
-m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
-// b = builder.CreateLearnableParameter(msra::strfun::wstrprintf (L"B%d", numHiddenLayers), m_layerSizes[numHiddenLayers+1], 1);
+m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
+//b = builder.CreateLearnableParameter(msra::strfun::wstrprintf (L"B%d", numHiddenLayers), m_layerSizes[numHiddenLayers+1], 1);
+//m_net->InitLearnableParameters(b, L"fixedValue", 0);
 label = builder.CreateSparseInputNode(L"labels", m_layerSizes[numHiddenLayers + 1]);
 AddTrainAndEvalCriterionNodes(input, label, w);

@@ -766,11 +771,11 @@ shared_ptr<ComputationNode<ElemType>> /*ComputationNodePtr*/ SimpleNetworkBuilde
 if (m_directConnect[i] == iLayer)
 {
 ComputationNodePtr directWIO = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"D%d", i), outputDim, inputDim);
-m_net->InitLearnableParameters(directWIO, m_uniformInit, randomSeed++, m_initValueScale);
+m_net->RandomInitLearnableParameters(directWIO, m_uniformInit, randomSeed++, m_initValueScale);
 directOutput = ApplyNonlinearFunction(builder.Times(directWIO, input), i);

 ComputationNodePtr scalar = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"SV%d", i), 1, 1);
-scalar->Value().SetValue((ElemType) 0.01);
+m_net->InitLearnableParameters(scalar, L"fixedValue", (ElemType) 0.01);
 ComputationNodePtr scaled = builder.ElementTimes(scalar, directOutput, msra::strfun::wstrprintf(L"S%d", i));

 mergedNode = builder.Plus(toNode, scaled);
@@ -801,39 +806,38 @@ shared_ptr<ComputationNode<ElemType>> /*ComputationNodePtr*/ SimpleNetworkBuilde
 Wxf = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"WXF%d", iLayer), outputDim, inputDim);
 Wxc = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"WXC%d", iLayer), outputDim, inputDim);

-m_net->InitLearnableParameters(Wxo, m_uniformInit, randomSeed++, m_initValueScale);
-m_net->InitLearnableParameters(Wxi, m_uniformInit, randomSeed++, m_initValueScale);
-m_net->InitLearnableParameters(Wxf, m_uniformInit, randomSeed++, m_initValueScale);
-m_net->InitLearnableParameters(Wxc, m_uniformInit, randomSeed++, m_initValueScale);
+m_net->RandomInitLearnableParameters(Wxo, m_uniformInit, randomSeed++, m_initValueScale);
+m_net->RandomInitLearnableParameters(Wxi, m_uniformInit, randomSeed++, m_initValueScale);
+m_net->RandomInitLearnableParameters(Wxf, m_uniformInit, randomSeed++, m_initValueScale);
+m_net->RandomInitLearnableParameters(Wxc, m_uniformInit, randomSeed++, m_initValueScale);

 bo = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"bo%d", iLayer), outputDim, 1);
 bc = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"bc%d", iLayer), outputDim, 1);
 bi = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"bi%d", iLayer), outputDim, 1);
 bf = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"bf%d", iLayer), outputDim, 1);
-// if (m_forgetGateInitVal > 0)
-bf->Value().SetValue(m_forgetGateInitVal);
-// if (m_inputGateInitVal > 0)
-bi->Value().SetValue(m_inputGateInitVal);
-// if (m_outputGateInitVal > 0)
-bo->Value().SetValue(m_outputGateInitVal);

+m_net->InitLearnableParameters(bi, L"fixedValue", m_inputGateInitVal);
+m_net->InitLearnableParameters(bc, L"fixedValue", 0);
+m_net->InitLearnableParameters(bo, L"fixedValue", m_outputGateInitVal);
+m_net->InitLearnableParameters(bf, L"fixedValue", m_forgetGateInitVal);
+
 Whi = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"WHI%d", iLayer), outputDim, outputDim);
-m_net->InitLearnableParameters(Whi, m_uniformInit, randomSeed++, m_initValueScale);
+m_net->RandomInitLearnableParameters(Whi, m_uniformInit, randomSeed++, m_initValueScale);
 Wci = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"WCI%d", iLayer), outputDim, 1);
-m_net->InitLearnableParameters(Wci, m_uniformInit, randomSeed++, m_initValueScale);
+m_net->RandomInitLearnableParameters(Wci, m_uniformInit, randomSeed++, m_initValueScale);

 Whf = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"WHF%d", iLayer), outputDim, outputDim);
-m_net->InitLearnableParameters(Whf, m_uniformInit, randomSeed++, m_initValueScale);
+m_net->RandomInitLearnableParameters(Whf, m_uniformInit, randomSeed++, m_initValueScale);
 Wcf = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"WCF%d", iLayer), outputDim, 1);
-m_net->InitLearnableParameters(Wcf, m_uniformInit, randomSeed++, m_initValueScale);
+m_net->RandomInitLearnableParameters(Wcf, m_uniformInit, randomSeed++, m_initValueScale);

 Who = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"WHO%d", iLayer), outputDim, outputDim);
-m_net->InitLearnableParameters(Who, m_uniformInit, randomSeed++, m_initValueScale);
+m_net->RandomInitLearnableParameters(Who, m_uniformInit, randomSeed++, m_initValueScale);
 Wco = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"WCO%d", iLayer), outputDim, 1);
-m_net->InitLearnableParameters(Wco, m_uniformInit, randomSeed++, m_initValueScale);
+m_net->RandomInitLearnableParameters(Wco, m_uniformInit, randomSeed++, m_initValueScale);

 Whc = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"WHC%d", iLayer), outputDim, outputDim);
-m_net->InitLearnableParameters(Whc, m_uniformInit, randomSeed++, m_initValueScale);
+m_net->RandomInitLearnableParameters(Whc, m_uniformInit, randomSeed++, m_initValueScale);

 size_t layer1 = outputDim;

@@ -848,8 +852,8 @@ shared_ptr<ComputationNode<ElemType>> /*ComputationNodePtr*/ SimpleNetworkBuilde
 if (m_constInputGateValue)
 {
 // it = builder.CreateLearnableParameter(msra::strfun::wstrprintf (L"CONSTIT%d", iLayer), outputDim);
+// m_net->InitLearnableParameters(it, L"fixedValue", m_constInputGateValue);
 // it->SetLearningRateMultiplier(0);
-// it->Value().SetValue(m_constInputGateValue);
 it = nullptr;
 }
 else
@@ -988,7 +992,7 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildCRFLSTMNetworkFromDes
 if (m_lookupTableOrder > 0)
 {
 e = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"E%d", 0), m_layerSizes[1], m_layerSizes[0] / m_lookupTableOrder);
-m_net->InitLearnableParameters(e, m_uniformInit, randomSeed++, m_initValueScale);
+m_net->RandomInitLearnableParameters(e, m_uniformInit, randomSeed++, m_initValueScale);
 output = builder.LookupTable(e, input, L"LookupTable");

 if (m_addDropoutNodes)

@@ -1017,8 +1021,9 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildCRFLSTMNetworkFromDes
 else
 {
 u = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"U%d", i), m_layerSizes[i + 1], m_layerSizes[i] * (offset ? m_lookupTableOrder : 1));
-m_net->InitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
+m_net->RandomInitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
 b = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"B%d", i), m_layerSizes[i + 1], 1);
+m_net->InitLearnableParameters(b, L"fixedValue", 0);
 output = ApplyNonlinearFunction(builder.Plus(builder.Times(u, input), b), i);
 }

@@ -1030,13 +1035,13 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildCRFLSTMNetworkFromDes
 }

 w = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"TimesBeforeSoftMax%d", numHiddenLayers), m_layerSizes[numHiddenLayers + 1], m_layerSizes[numHiddenLayers]);
-m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
+m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);

 output = builder.Times(w, input, L"outputsBeforeSoftmax");

 trans = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"TransProb%d", numHiddenLayers), m_layerSizes[numHiddenLayers + 1], m_layerSizes[numHiddenLayers + 1]);
-trans->Value().SetValue((ElemType) 1.0 / m_layerSizes[numHiddenLayers + 1]);
-// m_net->InitLearnableParameters(trans, m_uniformInit, randomSeed++, m_initValueScale);
+m_net->InitLearnableParameters(trans, L"fixedValue", (ElemType) 1.0 / m_layerSizes[numHiddenLayers + 1]);
+// m_net->RandomInitLearnableParameters(trans, m_uniformInit, randomSeed++, m_initValueScale);
 trans->SetLearningRateMultiplier(1.0f);
 label = builder.CreateInputNode(L"labels", m_layerSizes[numHiddenLayers + 1]);
 AddTrainAndEvalCriterionNodes(output, label, nullptr, L"CRFTrainCriterion", L"CRFEvalCriterion", nullptr, trans);

@@ -1085,7 +1090,7 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildClassLSTMNetworkFromD
 if (m_lookupTableOrder > 0)
 {
 e = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"E%d", 0), m_layerSizes[1], m_layerSizes[0] / m_lookupTableOrder);
-m_net->InitLearnableParameters(e, m_uniformInit, randomSeed++, m_initValueScale);
+m_net->RandomInitLearnableParameters(e, m_uniformInit, randomSeed++, m_initValueScale);
 output = builder.LookupTable(e, input, L"LookupTable");

 if (m_addDropoutNodes)

@@ -1122,13 +1127,13 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildClassLSTMNetworkFromD
 // e.g., [200 x 10000], where 10000 is the vocabulary size
 // this is for speed-up issue as per word matrix can be simply obtained using column slice
 w = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"W%d", numHiddenLayers), m_layerSizes[numHiddenLayers], m_layerSizes[numHiddenLayers + 1]);
-m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
+m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);

 // the label is a dense matrix. each element is the word index
 label = builder.CreateInputNode(L"labels", 4);

 clsweight = builder.CreateLearnableParameter(L"WeightForClassPostProb", m_nbrCls, m_layerSizes[numHiddenLayers]);
-m_net->InitLearnableParameters(clsweight, m_uniformInit, randomSeed++, m_initValueScale);
+m_net->RandomInitLearnableParameters(clsweight, m_uniformInit, randomSeed++, m_initValueScale);
 clslogpostprob = builder.Times(clsweight, input, 1, L"ClassPostProb");

 output = AddTrainAndEvalCriterionNodes(input, label, w, L"TrainNodeClassBasedCrossEntropy", L"EvalNodeClassBasedCrossEntrpy",

@@ -1164,16 +1169,16 @@ shared_ptr<ComputationNode<ElemType>> /*ComputationNodePtr*/ SimpleNetworkBuilde
 input = inputObs;
 size_t nDim = inputDim + outputDim + 2;
 wInputGate = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"WINPUTGATE%d", iLayer), outputDim, nDim);
-m_net->InitLearnableParameters(wInputGate, m_uniformInit, randomSeed++, m_initValueScale);
+m_net->RandomInitLearnableParameters(wInputGate, m_uniformInit, randomSeed++, m_initValueScale);
 wInputGate->Value().ColumnSlice(0, 1).SetValue(m_inputGateInitVal); // init to input gate bias
 wForgetGate = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"WFORGETGATE%d", iLayer), outputDim, nDim);
-m_net->InitLearnableParameters(wForgetGate, m_uniformInit, randomSeed++, m_initValueScale);
+m_net->RandomInitLearnableParameters(wForgetGate, m_uniformInit, randomSeed++, m_initValueScale);
 wForgetGate->Value().ColumnSlice(0, 1).SetValue(m_forgetGateInitVal); // init to forget gate bias
 wOutputGate = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"WOUTPUTGATE%d", iLayer), outputDim, nDim);
-m_net->InitLearnableParameters(wOutputGate, m_uniformInit, randomSeed++, m_initValueScale);
+m_net->RandomInitLearnableParameters(wOutputGate, m_uniformInit, randomSeed++, m_initValueScale);
 wOutputGate->Value().ColumnSlice(0, 1).SetValue(m_outputGateInitVal); // init to output gate bias
 wMemoryCellMatrix = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"WMEMORYCELLWEIGHT%d", iLayer), outputDim, inputDim + outputDim + 1);
-m_net->InitLearnableParameters(wMemoryCellMatrix, m_uniformInit, randomSeed++, m_initValueScale);
+m_net->RandomInitLearnableParameters(wMemoryCellMatrix, m_uniformInit, randomSeed++, m_initValueScale);
 wMemoryCellMatrix->Value().ColumnSlice(0, 1).SetValue(0); // init to memory cell bias

 output = builder.LSTM(inputObs, wInputGate, wForgetGate, wOutputGate, wMemoryCellMatrix, msra::strfun::wstrprintf(L"LSTM%d", iLayer));

@@ -1234,7 +1239,7 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildLSTMNetworkFromDescri
 if (m_lookupTableOrder > 0)
 {
 e = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"E%d", 0), m_layerSizes[1], m_layerSizes[0] / m_lookupTableOrder);
-m_net->InitLearnableParameters(e, m_uniformInit, randomSeed++, m_initValueScale);
+m_net->RandomInitLearnableParameters(e, m_uniformInit, randomSeed++, m_initValueScale);
 output = builder.LookupTable(e, input, L"LookupTable");
 #ifdef DEBUG_DECODER
 e->Value().SetValue((ElemType) 0.01);

@@ -1275,8 +1280,9 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildLSTMNetworkFromDescri
 else
 {
 u = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"U%d", i), m_layerSizes[i + 1], m_layerSizes[i]);
-m_net->InitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
+m_net->RandomInitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
 b = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"B%d", i), m_layerSizes[i + 1], 1);
+m_net->InitLearnableParameters(b, L"fixedValue", 0);
 output = ApplyNonlinearFunction(builder.Plus(builder.Times(u, input), b), i);
 }

@@ -1290,7 +1296,7 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildLSTMNetworkFromDescri
 }

 w = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"W%d", numHiddenLayers), m_layerSizes[numHiddenLayers + 1], m_layerSizes[numHiddenLayers]);
-m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
+m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
 #ifdef DEBUG_DECODER
 w->Value().SetValue((ElemType) 0.01);
 #endif

@@ -1349,7 +1355,7 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildNCELSTMNetworkFromDes
 if (m_lookupTableOrder > 0)
 {
 e = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"E%d", 0), m_layerSizes[1], m_layerSizes[0] / m_lookupTableOrder);
-m_net->InitLearnableParameters(e, m_uniformInit, randomSeed++, m_initValueScale);
+m_net->RandomInitLearnableParameters(e, m_uniformInit, randomSeed++, m_initValueScale);
 output = builder.LookupTable(e, input, L"LookupTable");

 if (m_addDropoutNodes)

@@ -1381,8 +1387,9 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildNCELSTMNetworkFromDes
 else
 {
 u = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"U%d", i), m_layerSizes[i + 1], m_layerSizes[i]);
-m_net->InitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
+m_net->RandomInitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
 b = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"B%d", i), m_layerSizes[i + 1], 1);
+m_net->InitLearnableParameters(b, L"fixedValue", 0);
 output = ApplyNonlinearFunction(builder.Plus(builder.Times(u, input), b), i);
 }

@@ -1407,14 +1414,14 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildNCELSTMNetworkFromDes
 // e.g., [200 x 10000], where 10000 is the vocabulary size
 // this is for speed-up issue as per word matrix can be simply obtained using column slice
 w = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"W%d", numHiddenLayers), m_layerSizes[numHiddenLayers], m_layerSizes[numHiddenLayers + 1]);
-m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
+m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);

 // the label is a dense matrix. each element is the word index
 label = builder.CreateInputNode(L"labels", 2 * (this->nce_noises + 1));

 bias = builder.CreateLearnableParameter(L"BiasVector", 1, m_layerSizes[m_layerSizes.size() - 1]);
-bias->Value().SetValue((ElemType) -std::log(m_layerSizes[m_layerSizes.size() - 1]));
-// m_net->InitLearnableParameters(bias, m_uniformInit, randomSeed++, std::log(m_layerSizes[m_layerSizes.size() - 1])* m_initValueScale);
+m_net->InitLearnableParameters(bias, L"fixedValue", (ElemType) -std::log(m_layerSizes[m_layerSizes.size() - 1]));
+// m_net->RandomInitLearnableParameters(bias, m_uniformInit, randomSeed++, std::log(m_layerSizes[m_layerSizes.size() - 1])* m_initValueScale);
 // clslogpostprob = builder.Times(clsweight, input, 1, L"ClassPostProb");

 output = AddTrainAndEvalCriterionNodes(input, label, w, L"TrainNodeNCEBasedCrossEntropy", L"EvalNodeNCEBasedCrossEntrpy", bias);
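Note on the bias initialization above: setting the output bias to -log(layerSize) makes the initial distribution over the output vocabulary uniform, which is the natural starting prior for NCE training. The arithmetic, as a sketch (illustrative value for V):

    #include <cmath>
    double V = 10000.0;        // output layer size (vocabulary), example value
    double b = -std::log(V);   // initial bias, as set in this hunk
    // std::exp(b) == 1.0 / V, i.e. every word starts at probability 1/V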
@@ -1525,10 +1532,12 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildNetworkFromDbnFile(co
 wstring nameOfH = msra::strfun::wstrprintf(L"H%d", i + 1);

 w = builder.CreateLearnableParameter(nameOfW, wts.GetNumRows(), wts.GetNumCols());
-w->Value().SetValue(wts);
+m_net->InitLearnableParameters(w, L"fixedValue", 0); // follow protocol
+w->Value().SetValue(wts); // and overwrite

 b = builder.CreateLearnableParameter(nameOfB, bias.GetNumRows(), 1);
-b->Value().SetValue(bias);
+m_net->InitLearnableParameters(b, L"fixedValue", 0); // follow protocol
+b->Value().SetValue(bias); // and overwrite

 if (layerType == "perceptron")
 {
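Note on the "follow protocol / and overwrite" pattern above: when loading pretrained DBN weights, each parameter is first passed through InitLearnableParameters(..., L"fixedValue", 0) so that it is registered as initialized like every other parameter, and the matrix loaded from the DBN file then overwrites the value. Condensed sketch (wts and bias are the matrices read from the file, as in this hunk):

    w = builder.CreateLearnableParameter(nameOfW, wts.GetNumRows(), wts.GetNumCols());
    m_net->InitLearnableParameters(w, L"fixedValue", 0); // follow protocol
    w->Value().SetValue(wts);                            // and overwrite with pretrained weights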
@@ -1588,8 +1597,9 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildNetworkFromDbnFile(co
 wstring nameOfH = msra::strfun::wstrprintf(L"H%d", i + 1);

 w = builder.CreateLearnableParameter(nameOfW, outputLayerSize, penultimateSize);
-m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
+m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
 b = builder.CreateLearnableParameter(nameOfB, outputLayerSize, 1);
+m_net->InitLearnableParameters(b, L"fixedValue", 0);
 output = builder.Plus(builder.Times(w, input, 1, nameOfTimes), b, nameOfPlus);
 m_net->RenameNode(output, L"HLast");
@@ -53,7 +53,6 @@ public:

 __declspec_noreturn static inline void EvaluationError(const wstring &msg, TextLocation where)
 {
-//Microsoft::MSR::CNTK::DebugUtil::PrintCallStack();
 throw EvaluationException(msg, where);
 }

@@ -536,8 +535,13 @@ static ConfigValuePtr Evaluate(const ExpressionPtr &e, const IConfigRecordPtr &s
 }
 return ConfigValuePtr(make_shared<ConfigLambda>(move(paramNames), move(namedParams), f), MakeFailFn(e->location), exprPath);
 }
-else if (e->op == L"(") // === apply a function to its arguments
+else if (e->op == L"(" || e->op == L"{") // === apply a function to its arguments
 {
+// Note: "{" is experimental and currently ignored as a distinction. To do it more completely, we need
+// - remember how a function was declared (currently not possible for lambdas)
+// - make sure the invocation matches declaration
+// - disallow calling Parameter() or any other creating functions as "()"
+// - disallow calling "{}"-declared functions from inside a "()"
 let &lambdaExpr = e->args[0]; // [0] = function
 let &argsExpr = e->args[1]; // [1] = arguments passed to the function ("()" expression of expressions)
 let lambda = AsPtr<ConfigLambda>(Evaluate(lambdaExpr, scope, exprPath, L"" /*macros are not visible in expression names*/), lambdaExpr, L"function");
@@ -848,8 +852,8 @@ static wstring FormatConfigValue(ConfigValuePtr arg, const wstring &how)
 {
 let arr = arg.AsPtr<ConfigArray>();
 wstring result;
-let range = arr->GetIndexRange();
-for (int i = range.first; i <= range.second; i++)
+let range = arr->GetIndexBeginEnd();
+for (int i = range.first; i < range.second; i++)
 {
 if (i > range.first)
 result.append(L"\n");
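Note on GetIndexRange vs. GetIndexBeginEnd: the old accessor returned an inclusive pair [first, second], forcing `<=` loops and `+ 1` size arithmetic; the new one returns a half-open pair [begin, end) in the usual C++ style. The same five-element array under both conventions, as a sketch:

    #include <utility>
    std::pair<int, int> incl  = {0, 4}; // old GetIndexRange(): indices 0..4, inclusive
    std::pair<int, int> range = {0, 5}; // new GetIndexBeginEnd(): half-open [0, 5)
    int n1 = incl.second + 1 - incl.first; // 5, old size arithmetic
    int n2 = range.second - range.first;   // 5, new size arithmetic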
@@ -890,20 +894,20 @@ public:
 else // otherwise expect an array
 {
 let & arr = arg.AsRef<ConfigArray>();
-let range = arr.GetIndexRange();
-us = (double)(range.second + 1 - range.first);
+let range = arr.GetSize(arg.GetFailFn());
+us = (double)range;
 }
 }
 }
-else if (what == L"Mod" || what == L"IntDiv") //two-arg int functions
+else if (what == L"Mod" || what == L"IntDiv") // two-arg int functions
 {
 let argsArg = config[L"args"];
 let& args = argsArg.AsRef<ConfigArray>();
-auto range = args.GetIndexRange();
-if (range.second != range.first + 1)
+auto range = args.GetIndexBeginEnd();
+if (range.second != range.first + 2)
 argsArg.Fail(L"Mod/IntDiv expects two arguments");
 let arg1 = (int)args.At(range.first);
-let arg2 = (int)args.At(range.second);
+let arg2 = (int)args.At(range.first + 1);

 if (what == L"Mod")
 us = (int)(arg1 % arg2);
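Note on the two-argument checks above: with half-open indices, "exactly two arguments" becomes end == begin + 2, and the second argument lives at begin + 1; under the old inclusive convention the check was end == begin + 1, and args.At(range.second) happened to address the last element. Restated as a sketch:

    auto range = args.GetIndexBeginEnd();     // half-open [begin, end)
    if (range.second != range.first + 2)      // exactly two arguments
        argsArg.Fail(L"Mod/IntDiv expects two arguments");
    let arg1 = (int)args.At(range.first);
    let arg2 = (int)args.At(range.first + 1); // was args.At(range.second) with inclusive ranges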
@ -918,6 +922,7 @@ public:
|
|||
|
||||
// CompareFunctions
|
||||
// - IsSameObject()
|
||||
// - IsArray()
|
||||
class CompareFunction : public BoxOf<Bool>
|
||||
{
|
||||
public:
|
||||
|
@@ -932,13 +937,17 @@ public:
 if (what == L"IsSameObject")
 {
 let& args = argsArg.AsRef<ConfigArray>();
-auto range = args.GetIndexRange();
-if (range.second != range.first+1)
+auto range = args.GetIndexBeginEnd();
+if (range.second != range.first + 2)
 argsArg.Fail(L"IsSameObject expects two arguments");
 let arg1 = args.At(range.first ).AsPtr<Object>();
-let arg2 = args.At(range.second).AsPtr<Object>();
+let arg2 = args.At(range.first + 1).AsPtr<Object>();
 us = arg1.get() == arg2.get();
 }
+else if (what == L"IsArray")
+{
+us = argsArg.Is<ConfigArray>();
+}
 else
 whatArg.Fail(L"Unknown 'what' value to CompareFunction: " + what);
 }
@@ -22,6 +22,4 @@ ConfigValuePtr Evaluate(ExpressionPtr); // evaluat
 void Do(ExpressionPtr e); // evaluate e.do
 shared_ptr<Object> EvaluateField(ExpressionPtr e, const wstring& id); // for experimental CNTK integration

-// some simple tests
-void SomeTests();
-} } } // end namespaces
+}}} // end namespaces
@@ -13,6 +13,7 @@
 #include <set>
 #include <stdexcept>
 #include <algorithm>
+#include <iomanip>

 #ifndef let
 #define let const auto
@@ -89,9 +90,18 @@ struct Issue
 // Because it is often hard to recognize an issue only from the point where it occurred, we also report the history in compact visual form.
 // Since often multiple contexts are on the same source line, we only print each source line once in a consecutive row of contexts.
 /*static*/ void TextLocation::PrintIssue(const vector<TextLocation>& locations, const wchar_t* errorKind, const wchar_t* kind, const wchar_t* what)
 {
+wstring error = CreateIssueMessage(locations, errorKind, kind, what);
+fprintf(stderr, "%ls", error.c_str());
+fflush(stderr);
+}
+
+/*static*/ wstring TextLocation::CreateIssueMessage(const vector<TextLocation>& locations, const wchar_t* errorKind, const wchar_t* kind, const wchar_t* what)
+{
 vector<Issue> issues; // tracing the error backwards
 size_t symbolIndex = 0;
+wstring message;
+
 for (size_t n = 0; n < locations.size(); n++)
 {
 let& location = locations[n];
@@ -125,20 +135,23 @@ struct Issue
 if (!locations.empty()) // (be resilient to some throwers not having a TextLocation; to be avoided)
 {
 let& firstLoc = issues.front().location;
-fprintf(stderr, "[CALL STACK]\n");
+message += wstrprintf(L"[CALL STACK]\n");
 for (auto i = issues.rbegin(); i != issues.rend(); i++)
 {
 let& issue = *i;
 auto& where = issue.location;
 const auto& lines = where.GetSourceFile().lines;
 const auto line = (where.lineNo == lines.size()) ? L"(end)" : lines[where.lineNo].c_str();
-fprintf(stderr, " %ls\n %ls\n", line, issue.markup.c_str());
+message += wstrprintf(L" %ls\n %ls\n", line, issue.markup.c_str());
 }
-fprintf(stderr, "%ls while %ls: %ls(%d)", errorKind, kind, firstLoc.GetSourceFile().path.c_str(), (int)firstLoc.lineNo + 1 /*report 1-based*/);
+message += wstrprintf(L"%ls while %ls: %ls(%d)", errorKind, kind, firstLoc.GetSourceFile().path.c_str(), (int)firstLoc.lineNo + 1 /*report 1-based*/);
 }
 else
-fprintf(stderr, "%ls while %ls", errorKind, kind);
-fprintf(stderr, ": %ls\n", what), fflush(stderr);
+{
+message += wstrprintf(L"%ls while %ls", errorKind, kind);
+}
+message += wstrprintf(L": %ls\n", what);
+return message;
 }
 /*static*/ vector<SourceFile> TextLocation::sourceFileMap;

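Note on the refactoring above: formatting now accumulates into a wstring via CreateIssueMessage, and PrintIssue shrinks to a wrapper that prints that string. This makes the formatted call stack and error text available to callers (see the new ScriptingException::GetError further down) instead of existing only on stderr. The resulting build-once, consume-anywhere pattern:

    wstring error = TextLocation::CreateIssueMessage(locations, errorKind, kind, what);
    fprintf(stderr, "%ls", error.c_str()); // print it ...
    fflush(stderr);                        // ... or hand the wstring to a caller instead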
@@ -286,7 +299,7 @@ public:
 };
 punctuations = set<wstring>{
 L"=", L";", L",", L"\n",
-L"[", L"]", L"(", L")",
+L"[", L"]", L"(", L")", L"{", L"}", L"[|", L"|]",
 L"+", L"-", L"*", L"/", L"**", L".*", L"%", L"||", L"&&", L"^",
 L"!",
 L"==", L"!=", L"<", L"<=", L">", L">=",
@@ -557,37 +570,43 @@ public:
 // ---------------------------------------------------------------------------

 // diagnostics helper: print the content
-void Expression::Dump(int indent) const
+void Expression::DumpToStream(wstringstream & treeStream, int indent)
 {
-fprintf(stderr, "%*s", indent, "");
+treeStream << std::setfill(L' ') << std::setw(indent) << L" ";
+treeStream << std::setw(0);

 if (op == L"s")
-fprintf(stderr, "'%ls' ", s.c_str());
+treeStream << "'" << s.c_str() << "'";
 else if (op == L"d")
-fprintf(stderr, "%.f ", d);
+treeStream << std::fixed << std::setprecision(0) << d;
 else if (op == L"b")
-fprintf(stderr, "%s ", b ? "true" : "false");
+treeStream << (b ? "true" : "false"); // parenthesized: '<<' binds tighter than '?:'
 else if (op == L"id")
-fprintf(stderr, "%ls ", id.c_str());
+treeStream << id.c_str();
 else if (op == L"new" || op == L"array" || op == L".")
-fprintf(stderr, "%ls %ls ", op.c_str(), id.c_str());
+treeStream << op.c_str() << " " << id.c_str();
 else
-fprintf(stderr, "%ls ", op.c_str());
+treeStream << op.c_str();

 if (!args.empty())
 {
-fprintf(stderr, "\n");
+treeStream << std::endl;
 for (const auto& arg : args)
-arg->Dump(indent + 2);
+{
+arg->DumpToStream(treeStream, indent + 1);
+}
 }
 if (!namedArgs.empty())
 {
-fprintf(stderr, "\n");
+treeStream << std::endl;
 for (const auto& arg : namedArgs)
 {
-fprintf(stderr, "%*s%ls =\n", indent + 2, "", arg.first.c_str());
-arg.second.second->Dump(indent + 4);
+treeStream << std::setfill(L' ') << std::setw(indent + 1) << L"";
+treeStream << arg.first.c_str() << L" =" << std::endl;
+arg.second.second->DumpToStream(treeStream, indent + 2);
 }
 }
-fprintf(stderr, "\n");
+treeStream << std::endl;
 }

 class Parser : public Lexer
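Note on DumpToStream: writing the expression tree into a caller-supplied wstringstream (instead of fprintf to stderr) lets the dump be captured, compared in tests, or redirected. A minimal driver, assuming only the signature introduced in this hunk:

    #include <sstream>
    #include <iostream>
    std::wstringstream treeStream;
    expr->DumpToStream(treeStream); // expr is some ExpressionPtr
    std::wcerr << treeStream.str(); // print, or assert on the string in a test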
@@ -647,13 +666,15 @@ class Parser : public Lexer
 return id;
 }

 map<wstring, int> infixPrecedence; // precedence level of infix operators
+static const int unaryPrecedence = 90; // for unary "-" and "!". 90 is below x., x[, x(, and x{, but above all others
+// TODO: Would be more direct to fold this into the table below as well.
 public:
 Parser(SourceFile&& sourceFile, vector<wstring>&& includePaths)
 : Lexer(move(includePaths))
 {
 infixPrecedence = map<wstring, int>{
-{L".", 99}, {L"[", 99}, {L"(", 99}, // also sort-of infix operands...
+{L".", 99}, {L"[", 99}, {L"(", 99}, {L"{", 99}, // (with LHS) these are also sort-of infix operands...
 {L"*", 10}, {L"/", 10}, {L".*", 10}, {L"**", 10}, {L"%", 10},
 {L"+", 9}, {L"-", 9}, {L"with", 9}, {L"==", 8},
 {L"!=", 8}, {L"<", 8}, {L"<=", 8}, {L">", 8}, {L">=", 8},
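Note on unaryPrecedence = 90: postfix-like operators (x., x[, x(, and now x{) sit at precedence 99 and therefore still bind tighter than a unary operator, while every binary operator sits at 10 or below and binds looser. Spelled out on two examples:

    // postfix binds tighter than unary:  -a.b  parses as  -(a.b)
    // unary binds tighter than binary:   -a*b  parses as  (-a)*b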
@@ -700,7 +721,7 @@ public:
 {
 operand = make_shared<Expression>(tok.beginLocation, tok.symbol + L"("); // encoded as +( -( !(
 ConsumeToken();
-operand->args.push_back(ParseExpression(100, stopAtNewline));
+operand->args.push_back(ParseExpression(unaryPrecedence, stopAtNewline));
 }
 else if (tok.symbol == L"new") // === new class instance
 {
@@ -723,13 +744,34 @@ public:
 operand = ParseExpression(0, false /*go across newlines*/); // just return the content of the parens (they do not become part of the expression tree)
 ConsumePunctuation(L")");
 }
-else if (tok.symbol == L"[") // === dictionary constructor
+else if (tok.symbol == L"{" || tok.symbol == L"["/*soon to be deprecated*/) // === record constructor
 {
+let* closeSymbol = tok.symbol == L"{" ? L"}" : L"]";
 operand = make_shared<Expression>(tok.beginLocation, L"[]");
 ConsumeToken();
 operand->namedArgs = ParseRecordMembers();
-ConsumePunctuation(L"]");
+ConsumePunctuation(closeSymbol);
 }
+#if 1 // the F# syntax is a stop-gap and meant for experimentation, and we will not recommend to use it
+// Rather, we must find a way to parse both Python-like array literals and BS dictionaries jointly,
+// and eventually deprecate [] for records.
+else if (tok.symbol == L"[|") // === array literal using F# syntax [| a; b; c |] (same as a:b:c, but also allows for 0- and 1-element arrays)
+{
+operand = make_shared<Expression>(tok.beginLocation, L":");
+ConsumeToken();
+if (GotToken().symbol != L"|]") // [| |] defines an empty array
+{
+for (;;)
+{
+operand->args.push_back(ParseExpression(0, false)); // item. Precedence 0 means go until comma or closing parenthesis.
+if (GotToken().symbol != L";")
+break;
+ConsumeToken();
+}
+}
+ConsumePunctuation(L"|]");
+}
+#endif
 else if (tok.symbol == L"array") // === array constructor
 {
 operand = OperandFromTokenSymbol(tok);
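Note on the [| ... |] literal above: it builds the same ':' expression node as the infix a:b:c syntax, but its explicit delimiters also allow the empty and one-element arrays that ':' cannot express. A hedged sketch of exercising it through the parser entry point used by the (deleted) test file below:

    // parse a 0-, 1-, and 3-element array literal
    let expr = ParseConfigDictFromString(L"e = [| |] ; one = [| 42 |] ; a = [| 1; 2; 3 |]",
                                         L"Test", vector<wstring>());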
@@ -780,18 +822,18 @@ public:
 if (left->op != L"id") // currently only allow for a single argument
 Expected(L"identifier");
 ConsumeToken();
-let macroArgs = make_shared<Expression>(left->location, L"()", left); // wrap identifier in a '()' macro-args expression
+let macroArgs = make_shared<Expression>(left->location, L"()", left); // wrap identifier in a "()" macro-args expression
 // TODO: test parsing of i => j => i*j
 let body = ParseExpression(opPrecedence, stopAtNewline); // pass same precedence; this makes '=>' right-associative e.g.i=>j=>i*j
 operation->args[0] = macroArgs; // [0]: parameter list
 operation->args.push_back(body); // [1]: right operand
 }
-else if (op == L"(") // === macro application
+else if (op == L"(" || op == L"{") // === macro application
 {
-// op = "(" means 'apply'
+// op = "(" and "{" mean 'apply', where {} refers to experimental constructor syntax
 // args[0] = lambda expression (lambda: op="=>", args[0] = param list, args[1] = expression with unbound vars)
-// args[1] = arguments (arguments: op="(), args=vector of expressions, one per arg; and namedArgs)
-operation->args.push_back(ParseMacroArgs(false)); // [1]: all arguments
+// args[1] = arguments (arguments: op="()", args=vector of expressions, one per arg; and namedArgs)
+operation->args.push_back(ParseMacroArgs(false, op)); // [1]: all arguments
 }
 else if (op == L"[") // === array index
 {
@@ -829,11 +871,12 @@ public:
 // In case of macro definition, all arguments must be of type "id". Pass 'defining' to check for that.
 // namedArgs = dictionary of optional args
 // In case of macro definition, dictionary values are default values that are used if the argument is not given
-ExpressionPtr ParseMacroArgs(bool defining)
+ExpressionPtr ParseMacroArgs(bool defining, wstring openSymbol)
 {
-ConsumePunctuation(L"(");
+ConsumePunctuation(openSymbol.c_str());
 auto macroArgs = make_shared<Expression>(GotToken().beginLocation, L"()");
-if (GotToken().symbol != L")") // x() defines an empty argument list
+let* closeSymbol = openSymbol == L"(" ? L")" : L"}";
+if (GotToken().symbol != closeSymbol) // x() defines an empty argument list
 {
 for (;;)
 {

@@ -856,7 +899,7 @@ public:
 ConsumeToken();
 }
 }
-ConsumePunctuation(L")");
+ConsumePunctuation(closeSymbol);
 return macroArgs;
 }
 map<wstring, pair<TextLocation, ExpressionPtr>> ParseRecordMembers()
@@ -865,7 +908,7 @@ public:
 // member identifier -> expression
 // Macro declarations are translated into lambdas, e.g.
 //  F(A,B) = expr(A,B)
-// gets represented in the dictionary as
+// (and likewise F{A,B}) gets represented in the dictionary as
 //  F = (A,B) => expr(A,B)
 // where a lambda expression has this structure:
 //  op="=>"
@@ -897,7 +940,8 @@ public:
 ConsumePunctuation(L"]");
 }
 // optional macro args
-let parameters = (GotToken().symbol == L"(") ? ParseMacroArgs(true /*defining*/) : ExpressionPtr(); // optionally, macro arguments
+let& openParen = GotToken().symbol;
+let parameters = (openParen == L"(" || openParen == L"{") ? ParseMacroArgs(true /*defining*/, openParen) : ExpressionPtr(); // optionally, macro arguments
 ConsumePunctuation(L"=");
 auto rhs = ParseExpression(0, true /*can end at newline*/); // and the right-hand side
 // if macro then rewrite it as an assignment of a lambda expression
@@ -907,7 +951,8 @@ public:
 if (arrayIndexExpr)
 {
 // create a lambda expression over the index variable
-let macroArgs = make_shared<Expression>(arrayIndexExpr->location, L"()", arrayIndexExpr); // wrap identifier in a '()' macro-args expression
+// BUGBUG: For {} constructor functions--we cannot declare constructor lambdas for now.
+let macroArgs = make_shared<Expression>(arrayIndexExpr->location, L"()", arrayIndexExpr); // wrap identifier in a "()" macro-args expression
 let initLambdaExpr = make_shared<Expression>(arrayIndexExpr->location, L"=>", macroArgs, rhs); // [0] is id, [1] is body
 rhs = make_shared<Expression>(location, L"array");
 rhs->args.push_back(fromExpr); // [0] first index
@@ -939,12 +984,6 @@ public:
 topDict->namedArgs = topMembers;
 return topDict;
 }
-// simple test function for use during development
-static void Test()
-{
-let parserTest = L"a=1\na1_=13;b=2 // cmt\ndo = (print\n:train:eval) ; x = array[1..13] (i=>1+i*print.message==13*42) ; print = new PrintAction [ message = 'Hello World' ]";
-ParseConfigDictFromString(parserTest, L"Test", vector<wstring>())->Dump();
-}
 };

 // globally exported functions to execute the parser
@@ -37,6 +37,7 @@ struct TextLocation // position in the text. Lightweight value struct that we ca

 // helpers for pretty-printing errors: Show source-code line with ...^ under it to mark up the point of error
 static void PrintIssue(const vector<TextLocation>& locations, const wchar_t* errorKind, const wchar_t* kind, const wchar_t* what);
+static std::wstring CreateIssueMessage(const vector<TextLocation>& locations, const wchar_t* errorKind, const wchar_t* kind, const wchar_t* what);
 static void Trace(TextLocation, const wchar_t* traceKind, const wchar_t* op, const wchar_t* exprPath);

 // construction
@@ -77,8 +78,12 @@ public:
 } // where the error happened
 virtual const wchar_t* kind() const = 0; // e.g. "warning" or "error"

+wstring GetError(const std::wstring& linePrefix) const override
+{
+return TextLocation::CreateIssueMessage(locations, linePrefix.c_str(), kind(), msra::strfun::utf16(what()).c_str());
+}
 // pretty-print this as an error message
-void /*ScriptingException::*/ PrintError(const std::wstring& linePrefix) const
+void /*ScriptingException::*/ PrintError(const std::wstring& linePrefix) const override
 {
 TextLocation::PrintIssue(locations, linePrefix.c_str(), kind(), msra::strfun::utf16(what()).c_str());
 }
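Note on GetError: it returns the same pretty-printed issue text that PrintError writes to stderr, so callers can capture the message programmatically. A sketch of a consumer, modeled on the catch block in the test file deleted below (assuming ConfigException exposes this ScriptingException interface):

    try
    {
        Do(expr);
    }
    catch (const ConfigException& err)
    {
        err.PrintError(L"error");              // print to stderr, as before
        wstring text = err.GetError(L"error"); // or capture the same message
    }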
@@ -129,7 +134,7 @@ struct Expression
 args.push_back(arg2);
 }
 // diagnostics helper: print the content
-void Dump(int indent = 0) const;
+void DumpToStream(wstringstream & treeStream, int indent = 0);
 };
 typedef Expression::ExpressionPtr ExpressionPtr; // circumvent some circular definition problem

@@ -1,194 +0,0 @@
-// BrainScriptTest.cpp -- some tests
-
-#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
-
-#include "Basics.h"
-#include "BrainScriptEvaluator.h"
-#include "BrainScriptParser.h"
-
-#ifndef let
-#define let const auto
-#endif
-
-namespace Microsoft { namespace MSR { namespace BS {
-
-using namespace std;
-using namespace msra::strfun;
-
-// Note: currently this seems to be the master copy; got to check whether the other one was also changed
-
-//extern wstring standardFunctions, computationNodes, commonMacros;
-
-#if 1 // TODO: these may be newer, merge into Experimentalthingy
-
-static wstring standardFunctions =
-L"Print(value, format='') = new PrintAction [ what = value /*; how = format*/ ] \n"
-L"Fail(msg) = new FailAction [ what = msg ] \n"
-L"RequiredParameter(message) = Fail('RequiredParameter: ' + message) \n"
-L"Format(value, format) = new StringFunction [ what = 'Format' ; arg = value ; how = format ] \n"
-L"Replace(s, from, to) = new StringFunction [ what = 'Replace' ; arg = s ; replacewhat = from ; withwhat = to ] \n"
-L"Substr(s, begin, num) = new StringFunction [ what = 'Substr' ; arg = s ; pos = begin ; chars = num ] \n"
-L"Chr(c) = new StringFunction [ what = 'Chr' ; arg = c ] \n"
-L"Floor(x) = new NumericFunction [ what = 'Floor' ; arg = x ] \n"
-L"Length(x) = new NumericFunction [ what = 'Length' ; arg = x ] \n"
-L"Ceil(x) = -Floor(-x) \n"
-L"Round(x) = Floor(x+0.5) \n"
-L"Abs(x) = if x >= 0 then x else -x \n"
-L"Sign(x) = if x > 0 then 1 else if x < 0 then -1 else 0 \n"
-L"Min(a,b) = if a < b then a else b \n"
-L"Max(a,b) = if a > b then a else b \n"
-L"Fac(n) = if n > 1 then Fac(n-1) * n else 1 \n";
-
-static wstring computationNodes = // BUGBUG: optional args not working yet, some scope problem causing a circular reference
-L"Mean(z, tag='') = new ComputationNode [ class = 'MeanNode' ; inputs = z /* ; tag = tag */ ]\n"
-L"InvStdDev(z, tag='') = new ComputationNode [ class = 'InvStdDevNode' ; inputs = z /* ; tag = tag */ ]\n"
-L"PerDimMeanVarNormalization(feat,mean,invStdDev, tag='') = new ComputationNode [ class = 'PerDimMeanVarNormalizationNode' ; inputs = feat:mean:invStdDev /* ; tag = tag */ ]\n"
-L"Parameter(outD, inD, tag='parameter') = new ComputationNode [ class = 'LearnableParameterNode' ; outDim = outD ; inDim = inD /*; tag = tag*/ ]\n"
-L"Input(dim,tag='features') = Parameter(dim,1,tag=tag) // TODO: for now \n"
-L"RowSlice(firstRow, rows, features, tag='') = new ComputationNode [ class = 'RowSliceNode' ; inputs = features ; first = firstRow ; num = rows /* ; tag = tag */ ]\n"
-L"Delay(in, delay, tag='') = new RecurrentComputationNode [ class = 'DelayNode' ; inputs = in ; deltaT = -delay /* ; tag = tag */ ]\n"
-L"Sigmoid(z, tag='') = new ComputationNode [ class = 'SigmoidNode' ; inputs = z /* ; tag = tag */ ]\n"
-L"Log(z, tag='') = new ComputationNode [ class = 'LogNode' ; inputs = z /* ; tag = tag */ ]\n"
-L"CrossEntropyWithSoftmax(labels, outZ, tag='') = new ComputationNode [ class = 'CrossEntropyWithSoftmaxNode' ; inputs = labels:outZ /* ; tag = tag */ ]\n"
-L"ErrorPrediction(labels, outZ, tag='') = new ComputationNode [ class = 'ErrorPredictionNode' ; inputs = labels:outZ /* ; tag = tag */ ]\n";
-
-static wstring commonMacros = // TODO: rename rows and cols to inDim and outDim or vice versa, whichever it is
-L"BFF(in, rows, cols) = [ B = Parameter(rows, 1/*init = fixedvalue, value = 0*/) ; W = Parameter(rows, cols) ; z = W*in+B ] \n"
-L"SBFF(in, rows, cols) = [ Eh = Sigmoid(BFF(in, rows, cols).z) ] \n "
-L"MeanVarNorm(feat) = PerDimMeanVarNormalization(feat, Mean(feat), InvStdDev(feat)) \n"
-L"LogPrior(labels) = Log(Mean(labels)) \n";
-
-#endif
-
-void SomeTests()
-{
-try
-{
-// collecting all sorts of test cases here
-const wchar_t* parserTests[] =
-{
-L"do = Parameter(13,42) * Input(42) + Parameter(13,1)",
-L"do = Print(array [1..10] (i=>i*i))",
-L"do = new PrintAction [ what = 'abc' ]",
-L"do = Print(new StringFunction [ x = 13 ; y = 42 ; what = 'Format' ; how = '.2' ; arg = x*y ])",
-L"do = Print(\"new StringFunction [ what = 'Format' ; how = '.2' ; arg = '13 > 42' ]\")",
-L"do = new PrintAction [ what = if 13 > 42 || 12 > 1 then 'Hello World' + \"!\" else 'Oops?']",
-L"i2s(i) = new StringFunction [ what = 'Format' ; arg = i ; how = '.2' ] ; do = Print('result=' + i2s((( [ v = (i => i + delta) ].v(5)))+13)) ; delta = 42 ",
-L"do = Print(1+2*3) : Print('hello'+' world')",
-L"do = Print(Format( (13:(fortytwo:1):100), '')) ; fortytwo=42 ",
-L"do = Print(val) ; val=if !false then 42 else -+-++-13:[a='a';b=42]:+14; arr = array [1..10] (i => 2*i)",
-L"do = Print(arg) ; N = 5 ; arr = array [1..N] (i => if i < N then arr[i+1]*i else N) ; arg = arr ",
-L"do = Print(val) ; val = [ v = (i => i + offset) ].v(42) ; offset = 13 ",
-// #12: DNN with recursion
-L"do = Print(val) \n"
-L"val = new NDLComputationNetwork [\n"
-L" featDim=40*31 ; labelDim=9000 ; hiddenDim=2048 ; numHiddenLayers = 3 \n"
-L" myFeatures = Input(featDim) ; myLabels = Input(labelDim) \n"
-L" featNorm = MeanVarNorm(myFeatures) \n"
-L" HiddenStack(layer) = if layer > 1 then SBFF(HiddenStack(layer - 1).Eh, hiddenDim, hiddenDim) else SBFF(featNorm, hiddenDim, featDim) \n"
-L" outLayer = BFF(HiddenStack(numHiddenLayers).Eh, labelDim, hiddenDim) \n"
-L" outZ = outLayer.z \n"
-L" CE = CrossEntropyWithSoftmax(myLabels, outZ) \n"
-L" Err = ErrorPrediction(myLabels, outZ) \n"
-L" logPrior = LogPrior(myLabels) \n"
-L" ScaledLogLikelihood = outZ - logPrior \n"
-L"]\n",
-// #13: factorial
-L"do = Print(fac(5)) ; fac(i) = if i > 1 then fac(i-1)*i else 1 ",
-// #14: Fibonacci sequence with memoization
-L"do = Print(fibs(10)) ; fibs(n) = [ vals = array[1..n] (i => if i < 3 then i-1 else vals[i-1]+vals[i-2]) ].vals[n] ",
-// #15: DNN with array
-L"do = Print(val) \n"
-L"val = new NDLComputationNetwork [\n"
-L" featDim=40*31 ; labelDim=9000 ; hiddenDim=2048 ; numHiddenLayers = 3 \n"
-L" myFeatures = Input(featDim, tag='features') ; myLabels = Input(labelDim, tag='labels') \n"
-L" featNorm = MeanVarNorm(myFeatures) \n"
-L" layers[layer:1..numHiddenLayers] = if layer > 1 then SBFF(layers[layer-1].Eh, hiddenDim, hiddenDim) else SBFF(featNorm, hiddenDim, featDim) \n"
-L" outLayer = BFF(layers[numHiddenLayers].Eh, labelDim, hiddenDim) \n"
-L" outZ = outLayer.z + Delay(outZ, 1) \n"
-L" CE = CrossEntropyWithSoftmax(myLabels, outZ) \n"
-L" Err = ErrorPrediction(myLabels, outZ) \n"
-L" logPrior = LogPrior(myLabels) \n"
-L" ScaledLogLikelihood = outZ - logPrior \n"
-L"]\n",
-// #16: windowed RNN
-L"do = Print(val) \n"
-L"val = new NDLComputationNetwork [ \n"
-L" hiddenDim = 512 \n"
-L" numHiddenLayers = 2 \n"
-L" T = 3 // total context window \n"
-L" \n"
-L" // data sources \n"
-L" featDim = 40 ; labelDim = 9000 \n"
-L" myFeatures = Input(featDim) ; myLabels = Input(labelDim) \n"
-L" \n"
-L" // split the augmented input vector into individual frame vectors \n"
-L" subframes[t:0..T - 1] = RowSlice(t * featDim, featDim, myFeatures) \n"
-L" \n"
-L" // hidden layers \n"
-L" layers[layer:1..numHiddenLayers] = [ // each layer stores a dict that stores its hidden fwd and bwd state vectors \n"
-L" // model parameters \n"
-L" W_fwd = Parameter(hiddenDim, featDim) // Parameter(outdim, indim) \n"
-L" W_bwd = if layer > 1 then Parameter(hiddenDim, hiddenDim) else Fail('no W_bwd') // input-to-hidden \n"
-L" H_fwd = Parameter(hiddenDim, hiddenDim) // hidden-to-hidden \n"
-L" H_bwd = Parameter(hiddenDim, hiddenDim) \n"
-L" b = Parameter(hiddenDim, 1) // bias \n"
-L" // shared part of activations (input connections and bias) \n"
-L" z_shared[t:0..T-1] = (if layer > 1 \n"
-L" then W_fwd * layers[layer - 1].h_fwd[t] + W_bwd * layers[layer - 1].h_bwd[t] \n"
-L" else W_fwd * subframes[t] \n"
-L" ) + b \n"
-L" // recurrent part and non-linearity \n"
-L" step(H, h, dt, t) = Sigmoid(if (t + dt >= 0 && t + dt < T) \n"
-L" then z_shared[t] + H * h[t + dt] \n"
-L" else z_shared[t]) \n"
-L" h_fwd[t:0..T-1] = step(H_fwd, h_fwd, -1, t) \n"
-L" h_bwd[t:0..T-1] = step(H_bwd, h_bwd, 1, t) \n"
-L" ] \n"
-L" // output layer --linear only at this point; Softmax is applied later \n"
-L" outLayer = [ \n"
-L" // model parameters \n"
-L" W_fwd = Parameter(labelDim, hiddenDim) \n"
-L" W_bwd = Parameter(labelDim, hiddenDim) \n"
-L" b = Parameter(labelDim, 1) \n"
-L" // output \n"
-L" topHiddenLayer = layers[numHiddenLayers] \n"
-L" centerT = Floor(T/2) \n"
-L" z = W_fwd * topHiddenLayer.h_fwd[centerT] + W_bwd * topHiddenLayer.h_bwd[centerT] + b \n"
-L" ] \n"
-L" outZ = outLayer.z // we only want this one & don't care about the rest of this dictionary \n"
-L" \n"
-L" // define criterion nodes \n"
-L" CE = CrossEntropyWithSoftmax(myLabels, outZ) \n"
-L" Err = ErrorPrediction(myLabels, outZ) \n"
-L" \n"
-L" // define output node for decoding \n"
-L" logPrior = LogPrior(myLabels) \n"
-L" ScaledLogLikelihood = outZ - logPrior // before: Minus(CE.BFF.FF.P,logPrior,tag=Output) \n"
-L"]\n",
-L" \n" // this fails because dict is outside val; expression name is not local to it
-L"do = Print(val) \n"
-L"dict = [ outY = Input(13) ] ; val = new NDLComputationNetwork [ outZ = dict.outY \n"
-L"]\n",
-L"f(x,option='default') = Print(option); do = f(42,option='value')",
-NULL};
-let first = 0; // 0 for all
-bool oneOnly = first > 0;
-for (size_t i = first; parserTests[i]; i++)
-{
-fprintf(stderr, "\n### Test %d ###\n\n", (int) i), fflush(stderr);
-let parserTest = parserTests[i];
-let expr = ParseConfigDictFromString(standardFunctions + computationNodes + commonMacros + parserTest, L"Test", vector<wstring>());
-//expr->Dump();
-Do(expr);
-if (oneOnly)
-break;
-}
-}
-catch (const ConfigException& err)
-{
-err.PrintError(L"error");
-}
-}
-
-}}} // namespaces
@ -6,24 +6,175 @@
//

##############################################################################
# standard functions
# Layer constructors
#
# A layer constructor is a stateful function that creates and returns an instance
# of a 'learnable function'. A learnable function is a function object that has
# learnable parameters baked into it, which get trained by SGD.
# Calling a layer constructor twice creates two instances with independent parameters.
#
# Learnable function instances can be applied to data or composed directly into
# more complex models. For example:
#   // immediate usage:
#   z = LinearLayer{9000}(h)  # LinearLayer{9000} returns a new function object
#   // composing multiple layers into a model
#   model = Sequential ( DenseLayer{2048, activation=Sigmoid} : LinearLayer {9000} )
#   z = model (features)
#   // applying the same model to two inputs, with shared, jointly updated parameters
#   f = DenseLayer{2048, activation=ReLU}
#   z1 = f (feat1) ; z2 = f (feat2)
# The names are intentionally kept similar to other toolkits.
#
# Note that functions without parameters can be used as layers directly, e.g. Sigmoid.
##############################################################################

Print(value, format='') = new PrintAction [ what = value /*; how = format*/ ]
Fail(what) = new FailAction [ /*what*/ ]
Format(value, format) = new StringFunction [ what = 'Format' ; arg = value ; how = format ]
Replace(s, from, to) = new StringFunction [ what = 'Replace' ; arg = s ; replacewhat = from ; withwhat = to ]
Substr(s, begin, num) = new StringFunction [ what = 'Substr' ; arg = s ; pos = begin ; chars = num ]
Chr(c) = new StringFunction [ what = 'Chr' ; arg = c ]
Length(x) = new NumericFunction [ what = 'Length' ; arg = x ]
Sign(x) = if x > 0 then 1 else if x < 0 then -1 else 0
Min(a,b) = if a < b then a else b
Max(a,b) = if a > b then a else b
Fac(n) = if n > 1 then Fac(n-1) * n else 1
IsSameObject(a,b) = new CompareFunction [ what = 'IsSameObject' ; args = (a : b) ]
Mod(x, y) = new NumericFunction [ what = 'Mod' ; args = (x:y) ]
IntDiv(x, y) = new NumericFunction [ what = 'IntDiv' ; args = (x:y) ]

# LinearLayer -- create a fully-connected linear projection layer
# Note: outDim may describe a tensor as well.
LinearLayer {outDim} =
{
    W = ParameterTensor {_ConcatArrays (outDim, 0), init='uniform'}
    b = ParameterTensor {outDim, initValue=0}
    outRank = Length (_AsArray (outDim)) # support outputs with tensor layouts
    f(x) = Times (W, x, outputRank = outRank) + b
}.f
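# Example (a hedged sketch; 'h' and the dimensions are illustrative):
#   p = LinearLayer{9000}(h)      # vector output: W gets shape [9000 x (inferred dim of h)], b gets [9000]
#   q = LinearLayer{(16:10)}(h)   # tensor output: outRank = 2, so Times() produces a [16 x 10] layout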

# DenseLayer -- create a fully-connected layer with optional non-linearity
DenseLayer{outDim, activation=(x=>x)} = Sequential ( LinearLayer{outDim} : activation )

# EmbeddingLayer -- create a linear embedding layer
EmbeddingLayer {outDim,                                  # dimension of embedding
                embeddingPath = '', transpose = false} = # load a fixed embedding from a path instead
{
    shape = if transpose then (0 : outDim) else (outDim : 0)
    E = if embeddingPath == ''
        then ParameterTensor {shape, init='uniform'} # learnable
        else ParameterTensor {shape, initFromFilePath = embeddingPath, learningRateMultiplier = 0} # fixed from file
    TimesOp = if transpose then TransposeTimes else Times
    f(x) = TimesOp (E, x) # x is expected to be sparse one-hot
}.f
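# Example (a hedged sketch; the path and dimensions are illustrative):
#   E = EmbeddingLayer{300}                             # learnable [300 x vocab] embedding
#   F = EmbeddingLayer{300, embeddingPath = 'emb.txt'}  # fixed, read from file, excluded from SGD updates
#   e = E (w)                                           # w is expected to be a sparse one-hot input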

# ConvolutionalLayer -- create a convolution layer with optional non-linearity
#             [ (shifting dims) | (reduction dim) | (output dim) | (sample dims) ]
#    in     : [ (shifting dims) | (reduction dim) |              | (sample dims) ]
#    kernel : [ (filter dims)   | (reduction dim) | (output dim) |               ]
#    out    : [ (shifting dims) |                 | (output dim) | (sample dims) ]
ConvolutionalLayer {numOutputChannels, # e.g. (1) or BS.Constants.None
                    filterShape,       # e.g. (3:3)
                    init = "uniform",
                    #reductionRank = 1, # TODO: support this
                    stride = 1, autoPadding = true,
                    #lowerPad = 0, upperPad = 0, # TODO: support this
                    #transpose = false, # TODO: support this
                    maxTempMemSizeInSamples = 0} =
{
    reductionRank = 1 # TODO: shall become an optional parameter
    outputChannelsShape = Repeat (1, numOutputChannels) # Repeat(1) turns a scalar into a 1-element array
    outputRank = Length (outputChannelsShape)
    kernelShape = _ConcatArrays (filterShape, Repeat (reductionRank, 0)) # append reduction dims to filter dims
    W = ParameterTensor{_ConcatArrays (kernelShape, outputChannelsShape), init=init}
    autoPaddingPadded = _ConcatArrays (_ForceResizeArray (Length (kernelShape), autoPadding), Repeat (reductionRank, false)) # set padding flags for reduction dims to false
    sharing = false # TODO: support this
    f(x) = Convolution (W, x, kernelShape, mapDims = numOutputChannels, stride = stride, sharing = sharing, autoPadding = autoPaddingPadded, lowerPad = 0, upperPad = 0, transpose = false, maxTempMemSizeInSamples = maxTempMemSizeInSamples) # lowerPad/upperPad/transpose not yet exposed as parameters, see TODOs above
}.f
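# Example (a hedged sketch; dimensions are illustrative):
#   C = ConvolutionalLayer{32, (5:5), stride = 2}   # 32 output channels, 5x5 filter
#   c = C (img)   # the kernel shape becomes (5:5:0), with the 0 inferred as the input's channel dimension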

# MaxPoolingLayer, AveragePoolingLayer -- create a max- or average-pooling layer
_PoolingLayer {poolKind,    # "max" or "average"
               filterShape, # e.g. (3:3)
               stride = 1, autoPadding = true,
               lowerPad = 0, upperPad = 0} = # TODO: support lowerPad/upperPad
{
    f(x) = Pooling (x, poolKind, filterShape, stride = stride, autoPadding = autoPadding, lowerPad = lowerPad, upperPad = upperPad)
}.f
MaxPoolingLayer {filterShape, stride = 1, autoPadding = true /*, lowerPad = 0, upperPad = 0*/} =
    _PoolingLayer {"max", filterShape, stride = stride, autoPadding = autoPadding}
AveragePoolingLayer {filterShape, stride = 1, autoPadding = true /*, lowerPad = 0, upperPad = 0*/} =
    _PoolingLayer {"average", filterShape, stride = stride, autoPadding = autoPadding}

# RecurrentLSTMLayer -- create an LSTM layer
RecurrentLSTMLayer {outDim,
                    cellDim = BS.Constants.None, # if set then use a projection
                    goBackwards = false,
                    enableSelfStabilization = false} =
{
    cellShape = if BS.Constants.IsNone (cellDim) then outDim else cellDim
    # BUGBUG: Calling f(x) twice will create a second set of parameters. The LSTM needs to be refactored for this.
    f(x) = BS.RNNs.RecurrentLSTMP (outDim, cellDim = cellShape,
                                   x, inputDim = 0,
                                   previousHook = if goBackwards then BS.RNNs.NextHC else BS.RNNs.PreviousHC,
                                   enableSelfStabilization = enableSelfStabilization).h
}.f
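# Example (a hedged sketch; dimensions are illustrative):
#   L  = RecurrentLSTMLayer{512}                  # cell dim = output dim (no projection)
#   Lp = RecurrentLSTMLayer{512, cellDim = 1024}  # projected: 1024-dim cell, 512-dim output
#   h  = L (x)   # per the BUGBUG above, apply a given instance only once for now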

# DropoutLayer -- create a drop-out layer
DropoutLayer {prob = BS.Constants.None} = if !BS.Constants.IsNone (prob) then Fail ("DropoutLayer: Dropout probability can currently not be specified per-layer.") else
{
    f(x) = Dropout (x)
}.f

# BatchNormalizationLayer -- create a batch-normalization layer
BatchNormalizationLayer {spatialRank = 0, # reduce over these dims. E.g. 2 to reduce over (w,h) in a [W x H x C]-shaped input
                         initialScale = 1,
                         normalizationTimeConstant = 0, blendTimeConstant = 0,
                         epsilon = 0.00001, useCntkEngine = true} =
{
    normShape = _ConcatArrays (Repeat (spatialRank, 1), 0) # spatial dims get a dimension of 1 (broadcasting), while all others are inferred from the input
    scale        = ParameterTensor{normShape, initValue = initialScale}
    bias         = ParameterTensor{normShape, initValue = 0}
    runMean      = ParameterTensor{normShape, initValue = 0, learningRateMultiplier = 0} # note: disable learning since these are updated differently
    runInvStdDev = ParameterTensor{normShape, initValue = 0, learningRateMultiplier = 0}
    f(x) = BatchNormalization (x, scale, bias, runMean, runInvStdDev, spatialRank > 0, normalizationTimeConstant = normalizationTimeConstant, blendTimeConstant = blendTimeConstant, epsilon = epsilon, useCntkEngine = useCntkEngine)
}.f
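# Example (a hedged sketch): batch-normalizing a convolution's [W x H x C] output per channel,
# i.e. pooling statistics over the two spatial dims:
#   model = Sequential ( ConvolutionalLayer{32, (5:5)} : BatchNormalizationLayer{spatialRank = 2} : ReLU )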

# LayerNormalizationLayer -- create a layer-normalization layer
LayerNormalizationLayer {dim = BS.Constants.None, initScale = 1, initBias = 0} = if BS.Constants.IsNone (dim) then Fail ("LayerNormalizationLayer: 'dim' parameter is currently required.") else
{
    gain = ParameterTensor{(1), initValue = initScale}
    bias = ParameterTensor{(1), initValue = initBias}

    f(x) = {
        div = Constant (1.0 / dim)

        # normalize w.r.t. actual sample statistics
        mean = div .* ReduceSum (x)
        x0 = x - mean
        std = Sqrt (div .* ReduceSum (x0 .* x0))
        xHat = ElementDivide (x0, std)

        # denormalize with learned parameters
        val = xHat .* gain + bias
    }.val
}.f

# StabilizerLayer -- create a scalar stabilizer [J. Droppo, 2014 -- TODO: get the reference]
StabilizerLayer {} =
{
    # BUGBUG: Calling f(x) twice will create a second set of parameters. Stabilize() needs to be refactored for this.
    f(x) = Stabilize (x)
}.f

# FeatureMVNLayer -- create a corpus-level feature-normalization layer
# This can only be applied to features. Statistics are not shared across invocations,
# which is semantically OK because the values are the same. However, it is not efficient.
FeatureMVNLayer {} = MeanVarNorm

# Layers that exist in other tools that we will not have:
# FlattenLayer{}: Not needed since DenseLayer() can handle tensors just fine.
# Activation{}:   Not needed since functions can be used directly.

##############################################################################
# Composing layers or models into more complex models
##############################################################################

# Sequential -- composite that applies a sequence of functions onto an input
Sequential (arrayOfFunctions) =
{
    fs = _AsArray (arrayOfFunctions) # make sure it works with a single function that is not an array
    Apply (x, N) = if N == 0 then x else fs[N-1](Apply (x, N-1)) # we do that recursively
    f(x) = Apply (x, Length (fs))
}.f
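# Note how the recursion unrolls: Sequential (f : g : h) (x) == h(g(f(x))),
# since Apply (x, 3) = fs[2](Apply (x, 2)) = h(g(f(x))) -- the first function in the array is applied first.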
Merge (arrayOfFunctions, combineFunction) =
    if Length (arrayOfFunctions) != 2 then Fail ("Merge() is currently limited to binary functions.") else
    {
        f(x,y) = combineFunction (arrayOfFunctions[0](x), arrayOfFunctions[1](y))
    }.f

##############################################################################
# aliases
@ -51,9 +202,13 @@ Log = CNTK2.Log
Minus = CNTK2.Minus
Pass = CNTK2.Identity
Plus = CNTK2.Plus
RectifiedLinear = CNTK2.Relu
RectifiedLinear = CNTK2.ReLU # deprecated
ReLU = CNTK2.ReLU
ReduceSum = CNTK2.ReduceSum
ReduceLogSum = CNTK2.ReduceLogSum
ReduceMin = CNTK2.ReduceMin
ReduceMax = CNTK2.ReduceMax

Round = CNTK2.Round
Sigmoid = CNTK2.Sigmoid


@ -89,7 +244,7 @@ CNTK2 = [
    // TODO: The API for Parameter is different in the current 2.0 design, getting a constant as input for the initial values.
    // This needs to be fixed to follow the way Constant() is exposed in Python.
    // Making this an internal node with "_" until we agree on the final interface:
    _Parameter(shape, value = 0, learningRateMultiplier = 1.0, init = 'uniform'/*|fixedValue|gaussian|fromFile|fromLiteral*/, initValueScale = 1, initFromFilePath = '', initFromLiteral = '', initOnCPUOnly=true, randomSeed=-1, tag='') = new ComputationNode [ operation = 'LearnableParameter' ; shape = new TensorShape [ /*shape */ ] /*plus the function args*/ ]
    _Parameter(shape, value = 0, initValue = '', learningRateMultiplier = 1.0, init = ''/*|uniform|fixedValue|gaussian|fromFile|fromLiteral*/, initValueScale = 1, initFromFilePath = '', initFromLiteral = '', initOnCPUOnly=true, randomSeed=-1, tag='') = new ComputationNode [ operation = 'LearnableParameter' ; shape = new TensorShape [ /*shape */ ] /*plus the function args*/ ]

    // 3. Shape operations
    // Changes: NewReshape -> Reshape, input -> _, dims -> shape

@ -142,10 +297,12 @@ CNTK2 = [
    Tanh(_, tag='') = new ComputationNode [ operation = 'Tanh' ; inputs = _ /*plus the function args*/ ]

    // 6. Reductions
    # the following is a temporary workaround until we have the C++ version
    ReduceLogSum (_, axis=0, tag='') = if axis != 0 then Fail("ReduceLogSum for now only supports axis=0.")
                                       else [ tag1=tag ; axis1=axis ; out = RowSlice (0, 1, _ - LogSoftmax (_), tag=tag1) ].out
    ReduceSum   (_, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = _ ; reductionOp = "Sum" /*plus the function args*/ ]
    ReduceLogSum(_, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = _ ; reductionOp = "LogSum" /*plus the function args*/ ]
    ReduceMin   (_, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = _ ; reductionOp = "Min" /*plus the function args*/ ]
    ReduceMax   (_, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = _ ; reductionOp = "Max" /*plus the function args*/ ]
    #ReduceMean (_, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = _ ; reductionOp = "Mean" /*plus the function args*/ ]
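    # Note on the former workaround above: it relied on the identity
    #   log (sum_j exp (z_j)) = z_i - LogSoftmax (z)_i   for any fixed i,
    # which is why slicing out row 0 of (_ - LogSoftmax (_)) recovered the log-sum-exp.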

    // 7. Control flow (if, composite etc.)
    // None so far


@ -158,8 +315,9 @@ CNTK2 = [
    PastValue(_, shape, timeStep = 1, defaultHiddenActivation = 0.1, tag='') = new ComputationNode [ operation = 'PastValue' ; inputs = _ ; shape = new TensorShape [ /*shape*/ ] /*plus the function args*/ ]

    // 10. NN-specific operations
    // Changes: input -> _, RectifiedLinear -> Relu. [Use Relu to arrive at relu() in snake_case]
    Relu(_, tag='') = new ComputationNode [ operation = 'RectifiedLinear' ; inputs = _ /*plus the function args*/ ]
    // Changes: input -> _, RectifiedLinear -> ReLU
    ReLU(_, tag='') = new ComputationNode [ operation = 'RectifiedLinear' ; inputs = _ /*plus the function args*/ ]
    Relu = ReLU // [Use Relu to arrive at relu() in snake_case]
    Sigmoid(_, tag='') = new ComputationNode [ operation = 'Sigmoid' ; inputs = _ /*plus the function args*/ ]
    Softmax(_, tag='') = new ComputationNode [ operation = 'Softmax' ; inputs = _ /*plus the function args*/ ]
    Dropout(_, tag='') = new ComputationNode [ operation = 'Dropout' ; inputs = _ /*plus the function args*/ ]

@ -169,6 +327,10 @@ CNTK2 = [
    // empirical sequence is compared to. Keeping this for now.
    CrossEntropyWithSoftmax(_, outProbVectorSequence, tag='') = new ComputationNode [ operation = 'CrossEntropyWithSoftmax' ; inputs = (_ : outProbVectorSequence) /*plus the function args*/ ]
    ErrorPrediction(_, outVectorSequence, topN=1, tag='') = new ComputationNode [ operation = 'ErrorPrediction' ; inputs = if topN == 1 then (_ : outVectorSequence) else (_ : outVectorSequence : Constant (topN)) /*plus the function args*/ ]
    # TODO: replace with this (need to deal with the topN thing):
    # (_new will be removed once the change is made)
    CrossEntropyWithSoftmax_new (L, z, tag='') = Minus (ReduceLogSum (z), TransposeTimes (L, z), tag=tag)
    ErrorPrediction_new (L, z, tag='') = Minus (BS.Constants.One, TransposeTimes (L, Hardmax (z)), tag=tag)

    // 12. Comparison nodes
    Less(_, y, tag='') = new ComputationNode [ operation = 'Less' ; inputs = (_ : y) /*plus the function args*/ ]

@ -182,11 +344,21 @@ CNTK2 = [
    Identity(_, tag='') = new ComputationNode [ operation = 'Pass' ; inputs = _ /*plus the function args*/ ]
]

LearnableParameter (outputDim, inputDim, learningRateMultiplier = 1.0, init = 'uniform'/*|fixedValue|gaussian|fromFile|fromLiteral*/, initValueScale = 1, value = 0, initFromFilePath = '', initFromLiteral = '', initOnCPUOnly=true, randomSeed=-1, tag='') = new ComputationNode [ operation = 'LearnableParameter' ; shape = new TensorShape [ dims = (outputDim : inputDim) ] /*plus the function args*/ ]
Parameter = LearnableParameter // deprecated
# Parameter{} can do several forms of initialization. It is no longer required to say 'init="kind"', so we can clean these up a bit.
#  - initValue=scalar, value=array --> initialize from this value --array form not implemented yet
#  - initFromFilePath="..." --> read from a data file
#  - init="uniform|gaussian" (random init scaled by initValueScale). Warning: This has magic scaling factors. TODO: document them here.
#  - init="zero"
# deprecated:
#  - initFromLiteral="..." (deprecated) --> parse a string literal (obsolete with the value=array form)
#  - init="fixedValue", value from 'value'
# Warning: The current config will behave unexpectedly if the user mistypes 'initValue' as 'value' (which will be ignored, defaulting to "uniform" init).
Parameter {outputDim, inputDim, learningRateMultiplier = 1.0, init = ''/*|uniform|fixedValue|gaussian|fromFile|fromLiteral*/, initValueScale = 1, value = 0/*deprecated*/, initValue = '', initFromFilePath = '', initFromLiteral = ''/*deprecated*/, initOnCPUOnly=true, randomSeed=-1, tag=''} = new ComputationNode [ operation = 'LearnableParameter' ; shape = new TensorShape [ dims = (outputDim : inputDim) ] /*plus the function args*/ ]
LearnableParameter = Parameter // deprecated
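# Examples of the initialization forms listed above (a hedged sketch; dimensions and the file name are illustrative):
#   W = Parameter {512, 784, init = 'uniform'}   # random, scaled by initValueScale
#   b = Parameter {512, 1, initValue = 0}        # from a scalar value
#   E = Parameter {10000, 300, initFromFilePath = 'emb.txt', learningRateMultiplier = 0}  # fixed, read from a data file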
# TODO: make Parameter take tensor dims?
ParameterTensor(dims, learningRateMultiplier = 1.0, init = 'uniform'/*|fixedValue|gaussian|fromFile|fromLiteral*/, initValueScale = 1, value = 0, initFromFilePath = '', initFromLiteral = '', initOnCPUOnly=true, randomSeed=-1, tag='') = new ComputationNode [ operation = 'LearnableParameter' ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ]
ConstantFromString(literal, tag='') = ParameterTensor((0)/*dim, will be inferred*/, init = 'fromLiteral', initFromLiteral = literal, learningRateMultiplier = 0.0)
ParameterTensor {dims, learningRateMultiplier = 1.0, init = ''/*|uniform|fixedValue|gaussian|fromFile|fromLiteral*/, initValueScale = 1, value = 0, initValue = '', initFromFilePath = '', initFromLiteral = '', initOnCPUOnly=true, randomSeed=-1, tag=''} = new ComputationNode [ operation = 'LearnableParameter' ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ]
ConstantFromString(literal, tag='') = ParameterTensor((0)/*dim, will be inferred*/, initFromLiteral = literal, learningRateMultiplier = 0.0)
# TODO: Deprecate ConstantFromString() in favor of Constant(array expression)
DynamicAxis(tag='') = new ComputationNode [ operation = 'DynamicAxis' ; /*plus the function args*/ ]
Input(dims, dynamicAxis='', tag='feature') = new ComputationNode [ operation = 'InputValue' ; shape = new TensorShape [ /*dims*/ ] ; isImage = false /*plus the function args*/ ]
# TODO: change from dynamicAxis by name to dynamicAxis being an actual object

@ -195,8 +367,8 @@ ImageInput(imageWidth, imageHeight, imageChannels, imageLayout='CHW', dynamicAxi
SparseImageInput(imageWidth, imageHeight, imageChannels, imageLayout='CHW', dynamicAxis='', tag='feature') = new ComputationNode [ operation = 'SparseInputValue' ; isImage = true /*plus the function args*/ ]
EnvironmentInput(propertyName, tag='') = new ComputationNode [ operation = 'EnvironmentInput' /*plus the function args*/ ]
# TODO: make 'dims' the first parameter, think ConstantTensor<dims> (val)
ConstantTensor(val, dims, tag='') = ParameterTensor(dims, learningRateMultiplier = 0, init = 'fixedValue', value = val)
Constant(val, rows = 1, cols = 1, tag='') = Parameter(rows, cols, learningRateMultiplier = 0, init = 'fixedValue', value = val)
ConstantTensor(val, dims, tag='') = ParameterTensor(dims, learningRateMultiplier = 0, initValue = val)
Constant(val, rows = 1, cols = 1, tag='') = Parameter(rows, cols, learningRateMultiplier = 0, initValue = val)
PastValue(dims, input, timeStep = 1, defaultHiddenActivation = 0.1, tag='') = new ComputationNode [ operation = 'PastValue' ; inputs = input ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ]
FutureValue(dims, input, timeStep = 1, defaultHiddenActivation = 0.1, tag='') = new ComputationNode [ operation = 'FutureValue' ; inputs = input ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ]
Shift(input, fromOffset, boundaryValue, boundaryMode=-1/*context*/, dim=-1, tag='') = new ComputationNode [ operation = 'Shift' ; inputs = (input : boundaryValue) /*plus the function args*/ ]

@ -227,7 +399,7 @@ WeightedLogistic(label, probability, instanceWeight, tag='') = new ComputationNo
ReconcileDynamicAxis(dataInput, layoutInput, tag='') = new ComputationNode [ operation = 'ReconcileDynamicAxis' ; inputs = (dataInput : layoutInput) /*plus the function args*/ ]
ReconcileMBLayout = ReconcileDynamicAxis # back compat
CastAs (type, data) = ReconcileDynamicAxis (data, type) # read as CastAs<type>(data) where the cast may consist of rearranging the data w.r.t. MBLayout or broadcasting across sequence items
Convolution(weightNode, inputValueNode, kernelDims, mapDims = 1, stride = 1, sharing = true, autoPadding = true, lowerPad = 0, upperPad = 0, transpose=false, imageLayout='CHW', maxTempMemSizeInSamples = 0, tag='') = new ComputationNode [ operation = 'Convolution' ; inputs = (weightNode : inputValueNode); kernelShape = new TensorShape [ dims = kernelDims ] ; mapCount = new TensorShape [ dims = mapDims ] ; strideShape = new TensorShape [ dims = stride ] ; dimSharing = new BoolVector [ items = sharing ] ; dimPadding = new BoolVector [ items = autoPadding ] ; dimPadLower = new TensorShape [ dims = lowerPad ] ; dimPadUpper = new TensorShape [ dims = upperPad ] /*plus the function args*/ ]
Convolution(weightNode, inputValueNode, kernelDims, mapDims = 0, stride = 1, sharing = true, autoPadding = true, lowerPad = 0, upperPad = 0, transpose=false, imageLayout='CHW', maxTempMemSizeInSamples = 0, tag='') = new ComputationNode [ operation = 'Convolution' ; inputs = (weightNode : inputValueNode); kernelShape = new TensorShape [ dims = kernelDims ] ; mapCount = new TensorShape [ dims = mapDims ] ; strideShape = new TensorShape [ dims = stride ] ; dimSharing = new BoolVector [ items = sharing ] ; dimPadding = new BoolVector [ items = autoPadding ] ; dimPadLower = new TensorShape [ dims = lowerPad ] ; dimPadUpper = new TensorShape [ dims = upperPad ] /*plus the function args*/ ]
# ND pooling/unpooling
Pooling(input, poolKind/*'max'|'average'*/, kernelDims, stride=1, autoPadding = true, lowerPad = 0, upperPad = 0, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'Pooling' ; inputs = (input); pool = poolKind ; kernelShape = new TensorShape [ dims = kernelDims ] ; strideShape = new TensorShape [ dims = stride ] ; dimPadding = new BoolVector [ items = autoPadding ] ; dimPadLower = new TensorShape [ dims = lowerPad ] ; dimPadUpper = new TensorShape [ dims = upperPad ] /*plus the function args*/ ]
MaxUnpooling(unpoolInput, poolInput, kernelDims, stride=1, autoPadding = true, lowerPad = 0, upperPad = 0, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'MaxUnpooling' ; inputs = (unpoolInput : poolInput); kernelShape = new TensorShape [ dims = kernelDims ] ; strideShape = new TensorShape [ dims = stride ] ; dimPadding = new BoolVector [ items = autoPadding ] ; dimPadLower = new TensorShape [ dims = lowerPad ] ; dimPadUpper = new TensorShape [ dims = upperPad ] /*plus the function args*/ ]

@ -264,13 +436,10 @@ Mean(dataVectorSequence, tag='') = new ComputationNode [ operation = 'Mean' ; in
Negate(input, tag='') = new ComputationNode [ operation = 'Negate' ; inputs = input /*plus the function args*/ ]
PackedIndex(targetObject, indexSequence, tag='') = new ComputationNode [ operation = 'PackedIndex' ; inputs = (targetObject : indexSequence) /*plus the function args*/ ]
PerDimMeanVarDeNormalization(dataVectorSequence, meanVector, invStdDevVector, tag='') = new ComputationNode [ operation = 'PerDimMeanVarDeNormalization' ; inputs = (dataVectorSequence : meanVector : invStdDevVector) /*plus the function args*/ ]
PerDimMeanVarNormalization(dataVectorSequence, meanVector, invStdDevVector, tag='') = new ComputationNode [ operation = 'PerDimMeanVarNormalization' ; inputs = (dataVectorSequence : meanVector : invStdDevVector) /*plus the function args*/ ]
#PerDimMeanVarNormalization(dataVectorSequence, meanVector, invStdDevVector, tag='') = new ComputationNode [ operation = 'PerDimMeanVarNormalization' ; inputs = (dataVectorSequence : meanVector : invStdDevVector) /*plus the function args*/ ]
PerDimMeanVarNormalization (x, mean, invStdDev) = (x - mean) .* invStdDev
Reciprocal(z, tag='') = new ComputationNode [ operation = 'Reciprocal' ; inputs = z /*plus the function args*/ ]
//# the following is a temporary workaround until we have the C++ version
#ReduceLogSum (z, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = z ; reductionOp = "LogSum" /*plus the function args*/ ]
#ReduceMean (z, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = z ; reductionOp = "Mean" /*plus the function args*/ ]
#ReduceMax (z, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = z ; reductionOp = "Max" /*plus the function args*/ ]
#ReduceMin (z, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = z ; reductionOp = "Min" /*plus the function args*/ ]
Scale(scalarScalingFactor, matrix, tag='') = new ComputationNode [ operation = 'Scale' ; inputs = (scalarScalingFactor : matrix) /*plus the function args*/ ]
# TODO: Scale = ElementTimes
ScatterPacked(cond, indexSequence, sourceData, tag='') = new ComputationNode [ operation = 'ScatterPacked' ; inputs = (cond : indexSequence : sourceData) /*plus the function args*/ ]
@ -300,15 +469,52 @@ TransposeTimes(leftMatrix, rightMatrix, tag='') = new ComputationNode [ operatio
Where(cond, tag='') = new ComputationNode [ operation = 'Where' ; inputs = cond /*plus the function args*/ ]

##############################################################################
# common macros
# non-neural-network functions
##############################################################################

BFF(in, rows, cols) = [ B = Parameter(rows, 1, init = 'fixedValue', value = 0) ; W = Parameter(rows, cols) ; z = W*in+B ]
Print(value, format='') = new PrintAction [ what = value /*; how = format*/ ]
Fail(what) = new FailAction [ /*what*/ ]
Format(value, format) = new StringFunction [ what = 'Format' ; arg = value ; how = format ]
Replace(s, from, to) = new StringFunction [ what = 'Replace' ; arg = s ; replacewhat = from ; withwhat = to ]
Substr(s, begin, num) = new StringFunction [ what = 'Substr' ; arg = s ; pos = begin ; chars = num ]
Chr(c) = new StringFunction [ what = 'Chr' ; arg = c ]
Length(x) = new NumericFunction [ what = 'Length' ; arg = x ]
Repeat (N, what) = if N <= 0 then BS.Constants.None else (Repeat (N-1, what) : what) # can also be used to turn a scalar into a 1-element array
_ForceResizeArray (N, arrayOrScalar) = { # bring an array to a given length, either by chopping or by duplicating its last value
    arr = _AsArray (arrayOrScalar)
    L = Length (arr)
    res = if N < L then array[0..N-1] (i => arr[i]) # chop to length
          else if L == 0 then Fail ("_ForceResizeArray(): needs at least one element to expand.")
          else _ConcatArrays (arr, Repeat (N-L, arr[L-1])) # append copies of the last value
}.res
_AsArray (x) = if IsArray (x) then x else [| x |] # helper to allow dimensions to describe scalars (42) or tensors (13:42)
_ConcatArrays (aOrScalar, bOrScalar) = {
    a = _AsArray (aOrScalar) ; b = _AsArray (bOrScalar)
    newLen = Length (a) + Length (b)
    res = if newLen == 0 then BS.Constants.None else array[0..newLen-1] (i => if i < Length (a) then a[i] else b[i-Length (a)])
}.res
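# Examples (following directly from the definitions above):
#   _ForceResizeArray (4, (3:3))  # -> (3:3:3:3), the last value is duplicated
#   _ForceResizeArray (1, (3:5))  # -> (3), chopped to length
#   _ConcatArrays ((5:5), 0)      # -> (5:5:0), scalars are promoted to arrays via _AsArray()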
Sign(x) = if x > 0 then 1 else if x < 0 then -1 else 0
Min(a,b) = if a < b then a else b
Max(a,b) = if a > b then a else b
Fac(n) = if n > 1 then Fac(n-1) * n else 1
IsSameObject(a,b) = new CompareFunction [ what = 'IsSameObject' ; args = (a : b) ]
IsArray(a) = new CompareFunction [ what = 'IsArray' ; args = a ]
Mod(x, y) = new NumericFunction [ what = 'Mod' ; args = (x:y) ]
IntDiv(x, y) = new NumericFunction [ what = 'IntDiv' ; args = (x:y) ]

##############################################################################
# macros from NDL book
##############################################################################

BFF(in, rows, cols) = [ B = Parameter(rows, 1, initValue = 0) ; W = Parameter(rows, cols) ; z = W*in+B ]
SBFF(in, rows, cols) = [ Eh = Sigmoid(BFF(in, rows, cols).z) ]
MeanVarNorm(feat) = PerDimMeanVarNormalization(feat, Mean(feat), InvStdDev(feat))
LogPrior(labels) = Log(Mean(labels))

Embedding (embeddingDim, input, inputDim=input.dim, initFrom='fromFile'/*|gaussian|uniform*/, embeddingPath = '', sparseInput = false, learningRateWeight = 0.0) = [
# specify one of these two for initialization:
#  - init = "uniform"|"gaussian"
#  - embeddingFile = PATHNAME
Embedding (embeddingDim, input, inputDim=input.dim, initFrom=''/*|fromFile|gaussian|uniform*/, embeddingPath = '', sparseInput = false, learningRateWeight = 0.0) = [
    embedding = Transpose (LearnableParameter (inputDim, embeddingDim, learningRateMultiplier = learningRateWeight, init = initFrom, initFromFilePath = embeddingPath))
    lookup = if sparseInput then embedding * input
             else GatherPacked (input, embedding)
@ -341,7 +547,7 @@ Constants = [
    # is this like Sequences.Repeat?
    True = 1
    False = 0
    None = ConstantTensor (42, (1))
    None = [| |] # doubles up as an empty array. Note: only use [| |] syntax inside here, as it may change in the future
    IsNone (x) = IsSameObject (x, None)
]

@ -553,7 +759,7 @@ Parameters =
[
    WeightParam (outputDim, inputDim) = Parameter (outputDim, inputDim, init='uniform', initValueScale=1, initOnCPUOnly=true, randomSeed=1)
    DiagWeightParam (outputDim) = ParameterTensor ((outputDim), init='uniform', initValueScale=1, initOnCPUOnly=true, randomSeed=1) # meant to be applied elementwise
    BiasParam (dim) = ParameterTensor ((dim), init='fixedValue', value=0.0)
    BiasParam (dim) = ParameterTensor ((dim), initValue=0.0)
    ScalarParam() = BiasParam (1)

    # route input through an extra weight, for stabilization

@ -561,16 +767,16 @@ Parameters =
    if enabled
    then [
        #beta = Exp (BiasParam ((inputDim))) # init value is 0
        #beta = ParameterTensor ((inputDim), init='fixedValue', value=1.0) # init value is 1
        #beta = ParameterTensor ((inputDim), initValue=1.0) # init value is 1
        # or SoftPlus: ln(1+e^beta)
        #beta = Log (Constants.One + Exp (ParameterTensor ((inputDim), init='fixedValue', value=0.54132485/*ln (e-1)*/))) # init value is 1
        #beta = Log (Constants.One + Exp (ParameterTensor ((inputDim), initValue=0.54132485/*ln (e-1)*/))) # init value is 1

        # sharpened Softplus: 1/f ln(1+e^{f*beta})
        # this behaves linearly for weights around 1, yet guarantees positiveness

        f = ConstantTensor (4, (1))
        fInv = Reciprocal (f)
        beta = fInv .* Log (Constants.One + Exp (f .* ParameterTensor ((inputDim), init='fixedValue', value=0.99537863/* 1/f*ln (e^f-1) */))) # init value is 1
        beta = fInv .* Log (Constants.One + Exp (f .* ParameterTensor ((inputDim), initValue=0.99537863/* 1/f*ln (e^f-1) */))) # init value is 1

        TraceDense (h, what) = h # replace 'h' here with the Trace below to trace the beta values. They are a valuable indicator.
        //Trace (h, say=what, logFirst=10, logFrequency=100, logGradientToo=false, onlyUpToRow=9, onlyUpToT=25, format=[ type = "real" ; transpose = false ; precisionFormat = ".6" ])

@ -1033,6 +1239,7 @@ Seq2Seq =

Network = [
    Load(pathName) = new ComputationNetworkFromFile [ /*pathName; also needs 'precision' somewhere*/ ]
    CloneFunction (inputNodes, outputNodes, parameters="learnable" /*|"constant"|"shared"*/) = new CloneFunctionConfigLambda [ /*args*/ ]
    Edit(inputModel, editFunctions, additionalRoots) = new ComputationNetworkWithEdits [ /*inputModel, editFunctions, additionalRoots*/ ]
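    # A hedged usage sketch (model path and node names are illustrative): load a trained
    # network and clone a frozen sub-graph out of it for reuse in a new model:
    #   net = BS.Network.Load ("pretrained.dnn")
    #   featureExtractor = BS.Network.CloneFunction (net.features, net.h2, parameters = "constant")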

Editing = [


@ -8,6 +8,10 @@
#define _CRT_NONSTDC_NO_DEPRECATE // make VS accept POSIX functions without _

#include "stdafx.h"
#ifdef _WIN32
#include <crtdbg.h>
#endif

#include "Basics.h"
#include "Actions.h"
#include "ComputationNetwork.h"

@ -18,6 +22,7 @@
#include "NDLNetworkBuilder.h"
#include "ModelEditLanguage.h"
#include "CPUMatrix.h" // used for SetNumThreads()
#include "GPUMatrix.h" // used for SyncGuard::EnableSync()
#include "CommonMatrix.h"
#include "SGD.h"
#include "MPIWrapper.h"
@ -440,11 +445,6 @@ static wstring PathToBSStringLiteral(const wstring& path) // quote a pathname fo
    return L'"' + path + L'"';
}

// TODO: decide where these should go. Also, do we need three variables?
//extern wstring standardFunctions;
//extern wstring commonMacros;
//extern wstring computationNodes;

int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapper that catches & reports Win32 exceptions
{
    vector<wstring> args(argv, argv + argc);

@ -488,7 +488,6 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
    bs += L"include \'cntk.core.bs\'"; // start with including the standard macros
    // Note: Using lowercase ^^ here to match the Linux name of the CNTK exe.
    //bs += standardFunctions + computationNodes + commonMacros + L"\n";
    for (const auto& sourceFile : sourceFiles)
        bs += L"include " + PathToBSStringLiteral(sourceFile) + L"\n";
    bs += L"\n]\n";

@ -538,6 +537,10 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp

    TracingGPUMemoryAllocator::SetTraceLevel(config(L"traceGPUMemoryAllocations", 0));

    bool synchronizeCUDAKernelExecutions = config(L"synchronizeCUDAKernelExecutions", false);
    if (synchronizeCUDAKernelExecutions)
        SyncGuard::EnableSync();

    // logging
    wstring logpath = config(L"stderr", L"");
    if (logpath != L"")

@ -581,13 +584,11 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
    if (actionsVal.Is<ScriptableObjects::ConfigArray>())
    {
        const ScriptableObjects::ConfigArray& actions = actionsVal;
        for (int i = actions.GetIndexRange().first; i <= actions.GetIndexRange().second; i++)
        for (int i = actions.GetIndexBeginEnd().first; i < actions.GetIndexBeginEnd().second; i++)
        {
            // TODO: When running in parallel with MPI, only commands in 'commandstoRunOnAllRanks' should
            // be run in parallel across multiple ranks. Others should only run on rank 0.
            actions.At(i, [](const wstring&)
            {
            }); // this will evaluate and thus execute the action
            actions.At(i, [](const wstring&){}); // this will evaluate and thus execute the action
        }
    }
    // else the action has already been executed, see comment above
@ -823,15 +824,38 @@ static void LogDelayLoadError(PEXCEPTION_POINTERS pExcPointers)
    }
}

#if _DEBUG
// in case of asserts in debug mode, print the message into stderr and throw an exception
int HandleDebugAssert(int,              // reportType  - ignoring reportType, printing message and aborting for all reportTypes
                      char* message,    // message     - fully assembled debug user message
                      int* returnValue) // returnValue - retVal value of zero continues execution
{
    fprintf(stderr, "C-Runtime: %s\n", message);

    if (returnValue) {
        *returnValue = 0; // return value of 0 will continue operation and NOT start the debugger
    }

    return TRUE; // returning TRUE will make sure no message box is displayed
}
#endif

int wmain(int argc, wchar_t* argv[]) // wmain wrapper that reports Win32 exceptions
{
    set_terminate(TerminateThis);    // insert a termination handler to ensure stderr gets flushed before actually terminating
    _set_error_mode(_OUT_TO_STDERR); // make sure there are no CRT prompts when CNTK is executing

    // Note: this does not seem to work--processes with this seem to just hang instead of terminating
    __try
    {
        return wmain1(argc, argv);
        // in case of asserts in debug mode, print the message into stderr and throw an exception
        if (_CrtSetReportHook2(_CRT_RPTHOOK_INSTALL, HandleDebugAssert) == -1) {
            LOGPRINTF(stderr, "CNTK: _CrtSetReportHook2 failed.\n");
            return -1;
        }

        int mainReturn = wmain1(argc, argv);
        _CrtSetReportHook2(_CRT_RPTHOOK_REMOVE, HandleDebugAssert);

        return mainReturn;
    }
    __except (LogDelayLoadError(GetExceptionInformation()), EXCEPTION_EXECUTE_HANDLER)
    {
@ -81,7 +81,7 @@
      <StackReserveSize>100000000</StackReserveSize>
    </Link>
    <PreBuildEvent>
      <Command>prebuild.bat "$(Configuration)" "$(CudaPath)"</Command>
      <Command>prebuild.bat "$(Configuration)" "$(CNTK_MKL)" "$(CNTK_MKL_SEQUENTIAL)" "$(CNTK_ENABLE_1BitSGD)" "$(CudaPath)" "$(CUDNN_PATH)" "$(CUB_PATH)"</Command>
    </PreBuildEvent>
  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="$(ReleaseBuild)">

@ -109,7 +109,7 @@
      <StackReserveSize>100000000</StackReserveSize>
    </Link>
    <PreBuildEvent>
      <Command>prebuild.bat "$(Configuration)" "$(CudaPath)"</Command>
      <Command>prebuild.bat "$(Configuration)" "$(CNTK_MKL)" "$(CNTK_MKL_SEQUENTIAL)" "$(CNTK_ENABLE_1BitSGD)" "$(CudaPath)" "$(CUDNN_PATH)" "$(CUB_PATH)"</Command>
    </PreBuildEvent>
  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="$(CpuOnlyBuild)">

@ -144,6 +144,7 @@
  </ItemGroup>
  <ItemGroup>
    <ClInclude Include="..\Common\CrossProcessMutex.h" />
    <ClInclude Include="..\Common\Include\basetypes.h" />
    <ClInclude Include="..\Common\Include\Basics.h" />
    <ClInclude Include="..\Common\Include\BestGpu.h" />
    <ClInclude Include="..\Common\Include\DataReader.h" />

@ -199,7 +200,6 @@
  <ItemGroup>
    <ClCompile Include="BrainScript\BrainScriptEvaluator.cpp" />
    <ClCompile Include="BrainScript\BrainScriptParser.cpp" />
    <ClCompile Include="BrainScript\BrainScriptTest.cpp" />
    <ClCompile Include="CNTK.cpp" />
    <ClCompile Include="ModelEditLanguage.cpp" />
    <ClCompile Include="stdafx.cpp" />

@ -222,4 +222,4 @@
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets" />
</Project>
@ -1,18 +1,6 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup>
    <ClCompile Include="..\Common\DataReader.cpp">
      <Filter>Common</Filter>
    </ClCompile>
    <ClCompile Include="..\Common\DataWriter.cpp">
      <Filter>Common</Filter>
    </ClCompile>
    <ClCompile Include="..\Common\File.cpp">
      <Filter>Common</Filter>
    </ClCompile>
    <ClCompile Include="..\Common\fileutil.cpp">
      <Filter>Common</Filter>
    </ClCompile>
    <ClCompile Include="ModelEditLanguage.cpp">
      <Filter>Model Editing</Filter>
    </ClCompile>

@ -22,34 +10,13 @@
    <ClCompile Include="tests.cpp">
      <Filter>Misc</Filter>
    </ClCompile>
    <ClCompile Include="..\Common\TimerUtility.cpp">
      <Filter>Common</Filter>
    </ClCompile>
    <ClCompile Include="CNTK.cpp" />
    <ClCompile Include="..\Common\MPIWrapper.cpp">
      <Filter>MPI Interfacing</Filter>
    </ClCompile>
    <ClCompile Include="..\Common\Include\ConcStack.h">
      <Filter>Common\Include</Filter>
    </ClCompile>
    <ClCompile Include="..\Common\Config.cpp">
      <Filter>Common</Filter>
    </ClCompile>
    <ClCompile Include="BrainScript\BrainScriptEvaluator.cpp">
      <Filter>BrainScript</Filter>
    </ClCompile>
    <ClCompile Include="BrainScript\BrainScriptParser.cpp">
      <Filter>BrainScript</Filter>
    </ClCompile>
    <ClCompile Include="BrainScript\BrainScriptTest.cpp">
      <Filter>BrainScript</Filter>
    </ClCompile>
    <ClCompile Include="..\Common\ExceptionWithCallStack.cpp">
      <Filter>Common</Filter>
    </ClCompile>
    <ClCompile Include="..\Common\CompositeDataReader.cpp">
      <Filter>Common</Filter>
    </ClCompile>
  </ItemGroup>
  <ItemGroup>
    <ClInclude Include="..\Common\Include\fileutil.h">

@ -205,9 +172,8 @@
    <ClInclude Include="..\Readers\ReaderLib\Transformer.h">
      <Filter>from ReaderLib</Filter>
    </ClInclude>
    <ClInclude Include="..\Common\Include\CompositeDataReader.h">
      <Filter>Common\Include</Filter>
    </ClInclude>
    <ClInclude Include="..\Common\Include\basetypes.h" />
    <ClInclude Include="..\Readers\CompositeDataReader\CompositeDataReader.h" />
  </ItemGroup>
  <ItemGroup>
    <Text Include="modelEditor.txt">
@ -591,7 +591,7 @@ void MELScript<ElemType>::CallFunction(const std::string& p_name, const ConfigPa
    std::string paramPath = params[1];

    NetNdl<ElemType>* netNdl;
    vector<ComputationNodeBasePtr> nodes = FindSymbols(params[0], netNdl);
    vector<ComputationNodeBasePtr> nodes = FindSymbols(nodeName, netNdl);

    for (auto& pNodes : nodes)
    {


@ -180,7 +180,7 @@ public:
    auto nodePtr = builder.CreateLearnableParameter(name, 1, 1);
    ndlNode->SetEvalValue(nodePtr.get());
    ElemType val = ndlNode->GetScalar();
    nodePtr->Value().SetValue(val);
    cn->InitLearnableParameters(nodePtr, L"fixedValue", val);
}
}
}
@ -7,10 +7,23 @@ setlocal enableDelayedexpansion
::: for full license information.
::: ==============================================================================
:::
::: This is called as a pre-build step for the CNTK executable.
::: It receives the build's configuration, $(Configuration), as its first parameter.
::: This is called as a pre-build step for the CNTK executable, taking the parameters below.
::: It creates buildinfo.h, which makes version information available to the executable itself.

:: Grab the parameters
::
:: Note: don't rely on environment variables, since properties may have been
:: overridden at msbuild invocation. By convention, we let parameters start with p_, locals with l_.
:: A Vim search for [%!]\([lp]_\)\@!\w\+[%!:] should only match
:: well-known (non-CNTK-specific) environment variables.
set p_Configuration=%~1
set p_CNTK_MKL=%~2
set p_CNTK_MKL_SEQUENTIAL=%~3
set p_CNTK_ENABLE_1BitSGD=%~4
set p_CudaPath=%~5
set p_CUDNN_PATH=%~6
set p_CUB_PATH=%~7

echo #ifndef _BUILDINFO_H > buildinfo.h$$
echo #define _BUILDINFO_H >> buildinfo.h$$
@ -23,19 +36,19 @@ if not errorlevel 1 (
call git --version > NUL 2>&1
if not errorlevel 1 (
    echo #define _GIT_EXIST >> buildinfo.h$$
    FOR /F %%i IN ('call git rev-parse --abbrev-ref HEAD') DO SET BRANCH=%%i
    FOR /F %%i IN ('call git rev-parse HEAD') DO SET COMMIT=%%i
    set STATUS=
    FOR /F %%i IN ('call git rev-parse --abbrev-ref HEAD') DO SET l_BRANCH=%%i
    FOR /F %%i IN ('call git rev-parse HEAD') DO SET l_COMMIT=%%i
    set l_STATUS=
    call git diff --quiet --cached
    if not errorlevel 1 call git diff --quiet
    if errorlevel 1 set STATUS= ^(modified^)
    echo #define _BUILDBRANCH_ "!BRANCH!" >> buildinfo.h$$
    echo #define _BUILDSHA1_ "!COMMIT!!STATUS!">> buildinfo.h$$
    if errorlevel 1 set l_STATUS= ^(modified^)
    echo #define _BUILDBRANCH_ "!l_BRANCH!" >> buildinfo.h$$
    echo #define _BUILDSHA1_ "!l_COMMIT!!l_STATUS!">> buildinfo.h$$
  )
)

if "%CNTK_MKL%" == "1" (
  if "%CNTK_MKL_SEQUENTIAL%" == "1" (
if "%p_CNTK_MKL%" == "1" (
  if "%p_CNTK_MKL_SEQUENTIAL%" == "1" (
    echo #define _MATHLIB_ "mkl-sequential">> buildinfo.h$$
  ) else (
    echo #define _MATHLIB_ "mkl">> buildinfo.h$$

@ -49,42 +62,40 @@ echo #define _BUILDER_ "%USERNAME%" >> buildinfo.h$$
echo #define _BUILDER_ "%USERNAME%" >> buildinfo.h$$
echo #define _BUILDMACHINE_ "%HOST%" >> buildinfo.h$$

set scriptpath=%~dp0
set buildpath="%scriptpath:\=\\%"
echo #define _BUILDPATH_ %buildpath% >> buildinfo.h$$
set l_scriptpath=%~dp0
set l_buildpath="%l_scriptpath:\=\\%"
echo #define _BUILDPATH_ %l_buildpath% >> buildinfo.h$$

set build_type=Unknown
set build_target=Unknown
set l_build_type=Unknown
set l_build_target=Unknown
:: Configuration property provided by CNTK.vcxproj
if /i "%~1" == "Debug" set build_type=Debug&set build_target=GPU
if /i "%~1" == "Debug_CpuOnly" set build_type=Debug&set build_target=CPU-only
if /i "%~1" == "Release" set build_type=Release&set build_target=GPU
if /i "%~1" == "Release_CpuOnly" set build_type=Release&set build_target=CPU-only
if /i "%p_Configuration%" == "Debug" set l_build_type=Debug&set l_build_target=GPU
if /i "%p_Configuration%" == "Debug_CpuOnly" set l_build_type=Debug&set l_build_target=CPU-only
if /i "%p_Configuration%" == "Release" set l_build_type=Release&set l_build_target=GPU
if /i "%p_Configuration%" == "Release_CpuOnly" set l_build_type=Release&set l_build_target=CPU-only

echo #define _BUILDTYPE_ "%build_type%">> buildinfo.h$$
echo #define _BUILDTARGET_ "%build_target%">> buildinfo.h$$
echo #define _BUILDTYPE_ "%l_build_type%">> buildinfo.h$$
echo #define _BUILDTARGET_ "%l_build_target%">> buildinfo.h$$

if "%CNTK_ENABLE_1BitSGD%" == "true" (
if "%p_CNTK_ENABLE_1BitSGD%" == "true" (
  echo #define _WITH_1BITSGD_ "yes">>buildinfo.h$$
) else (
  echo #define _WITH_1BITSGD_ "no">>buildinfo.h$$
)

if not %build_target% == CPU-only (
  :: CudaPath property provided by CNTK.vcxproj
  if "%~2%" == "" (
if not %l_build_target% == CPU-only (
  if "%p_CudaPath%" == "" (
    echo #define _CUDA_PATH_ "NOT_DEFINED" >> buildinfo.h$$
  ) else (
    set cudaPathTemp=%~2
    echo #define _CUDA_PATH_ "!cudaPathTemp:\=\\!" >> buildinfo.h$$
    echo #define _CUDA_PATH_ "!p_CudaPath:\=\\!" >> buildinfo.h$$
  )

  if not "%cudnn_path%" == "" (
    echo #define _CUDNN_PATH_ "%cudnn_path:\=\\%" >> buildinfo.h$$
  if not "%p_CUDNN_PATH%" == "" (
    echo #define _CUDNN_PATH_ "%p_CUDNN_PATH:\=\\%" >> buildinfo.h$$
  )

  if not "%cub_path%" == "" (
    echo #define _CUB_PATH_ "%cub_path:\=\\%" >> buildinfo.h$$
  if not "%p_CUB_PATH%" == "" (
    echo #define _CUB_PATH_ "%p_CUB_PATH:\=\\%" >> buildinfo.h$$
  )
)
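
:: For orientation, a hedged sample of the header this script generates (all values are illustrative):
::   #ifndef _BUILDINFO_H
::   #define _BUILDINFO_H
::   #define _GIT_EXIST
::   #define _BUILDBRANCH_ "master"
::   #define _BUILDSHA1_ "0a1b2c3d4e5f (modified)"
::   #define _MATHLIB_ "mkl"
::   #define _BUILDER_ "builduser"
::   #define _BUILDTYPE_ "Release"
::   #define _BUILDTARGET_ "GPU"
::   #define _WITH_1BITSGD_ "no"
::   #define _CUDA_PATH_ "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v7.5"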
@ -153,6 +153,11 @@ namespace CNTK
    static const size_t InferredDimension = (size_t)-1;

public:
    ///
    /// Construct an NDShape with 0 axes, which denotes a scalar.
    ///
    NDShape() {}

    ///
    /// Construct an NDShape instance with the specified number of axes and dimensionality in each axis.
    ///

@ -285,6 +290,7 @@ namespace CNTK
    class NDArrayView final : public std::enable_shared_from_this<NDArrayView>
    {
        friend class CompositeFunction;
        friend class LearnerBase;

        template <typename T, typename ...CtorArgTypes>
        friend inline std::shared_ptr<T> MakeSharedObject(CtorArgTypes&& ...ctorArgs);

@ -429,6 +435,16 @@ namespace CNTK
    ///
    bool IsReadOnly() const { return m_isReadOnly; }

    // TODO: The set methods should be offered in template form
    ///
    /// Fill 'this' NDArrayView with the specified value. The underlying DataType of 'this' view should be DataType::Float.
    ///
    CNTK_API void SetValue(float value);

    ///
    /// Fill 'this' NDArrayView with the specified value. The underlying DataType of 'this' view should be DataType::Double.
    ///
    CNTK_API void SetValue(double value);
    ///
    /// Creates a new NDArrayView with newly allocated storage on the same device as 'this' view and copies 'this' view's contents into the newly allocated view.
    ///

@ -467,8 +483,6 @@ namespace CNTK
private:
    CNTK_API NDArrayView(CNTK::DataType dataType, const DeviceDescriptor& device, CNTK::StorageFormat storageType, const NDShape& viewShape, bool readOnly, void* tensorView);

    CNTK_API void SetValue(float value);
    CNTK_API void SetValue(double value);

    template <typename ElementType>
    static std::shared_ptr<Microsoft::MSR::CNTK::Matrix<ElementType>> GetMatrixImpl(const Microsoft::MSR::CNTK::TensorView<ElementType>* tensorView, size_t rowColSplitPoint);

@ -526,6 +540,11 @@ namespace CNTK
    ///
    CNTK_API void Clear();

    ///
    /// Returns the number of masked/invalid values.
    ///
    CNTK_API size_t MaskedCount() const;

    ///
    /// Returns the descriptor of the device that 'this' mask resides on.
    ///

@ -536,6 +555,11 @@ namespace CNTK
    ///
    const NDShape& Shape() const { return m_maskShape; }

    ///
    /// Returns a read-only pointer to the data buffer underlying 'this' Mask object.
    ///
    CNTK_API const char* DataBuffer() const;

    ///
    /// Creates a new NDMask with newly allocated storage on the same device as 'this' mask and copies 'this' mask's contents into the newly allocated mask.
    ///

@ -760,7 +784,21 @@ namespace CNTK
    ///
    /// Create an 'Input' Variable.
    ///
    Variable(const NDShape& shape, CNTK::DataType dataType, const std::wstring& name = L"")
    Variable(const NDShape& shape, CNTK::DataType dataType)
        : Variable(shape, dataType, L"")
    {}

    ///
    /// Create an 'Input' Variable.
    ///
    Variable(const NDShape& shape, CNTK::DataType dataType, const wchar_t* name)
        : Variable(shape, dataType, std::wstring(name))
    {}

    ///
    /// Create an 'Input' Variable.
    ///
    Variable(const NDShape& shape, CNTK::DataType dataType, const std::wstring& name)
        : Variable(shape, VariableKind::Input, dataType, nullptr, nullptr, false, { Axis::DefaultDynamicAxis() }, false, name)
    {}
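    // For orientation, the three overloads above make these equivalent ways to create an
    // 'Input' Variable (a hedged sketch; the shape and names are illustrative):
    //
    //   NDShape shape(1, 784);  // one axis of dimension 784, per the NDShape constructor documented earlier
    //   Variable a(shape, DataType::Float);
    //   Variable b(shape, DataType::Float, L"features");
    //   Variable c(shape, DataType::Float, std::wstring(L"features"));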

@ -919,6 +957,10 @@ namespace CNTK
        return first.m_dataFields == second.m_dataFields;
    }

    inline bool operator!=(const Variable& first, const Variable& second)
    {
        return !(first == second);
    }
    ///
    /// Denotes Parameter inputs of a Function.
    ///

@ -1146,7 +1188,7 @@ namespace CNTK
    /// and the user is responsible for ensuring that the contents of the inputs and outputs are unchanged until after any uses of the BackPropState instance
    /// for backpropagating gradients through this function.
    ///
    CNTK_API virtual BackPropStatePtr Forward(const std::unordered_map<Variable, const ValuePtr>& arguments,
    CNTK_API virtual BackPropStatePtr Forward(const std::unordered_map<Variable, ValuePtr>& arguments,
                                              std::unordered_map<Variable, ValuePtr>& outputs,
                                              const DeviceDescriptor& computeDevice = DeviceDescriptor::DefaultDevice(),
                                              const std::unordered_set<Variable>& outputsToRetainBackwardStateFor = {}) = 0;

@ -1161,7 +1203,7 @@ namespace CNTK
    /// computation that this gradient backpropagation corresponds to.
    ///
    CNTK_API virtual void Backward(const BackPropStatePtr& state,
                                   const std::unordered_map<Variable, const ValuePtr>& rootGradientValues,
                                   const std::unordered_map<Variable, ValuePtr>& rootGradientValues,
                                   std::unordered_map<Variable, ValuePtr>& backPropagatedGradientValuesForInputs) = 0;

public:

@ -1330,10 +1372,74 @@ namespace CNTK
    };

    ///
    /// Create an instance of the CNTK built-in matrix multiplication operation with the specified input operands.
    /// TODO: Specify the constraints on the shapes of the operands.
    /// Create an instance of the CNTK built-in elementwise negate operation with the specified input operand.
    ///
    CNTK_API FunctionPtr Times(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name = L"");
    CNTK_API FunctionPtr Negate(const Variable& operand, const std::wstring& name = L"");

    ///
    /// Create an instance of the CNTK built-in elementwise sigmoid operation with the specified input operand.
    ///
    CNTK_API FunctionPtr Sigmoid(const Variable& operand, const std::wstring& name = L"");

    ///
    /// Create an instance of the CNTK built-in elementwise tanh operation with the specified input operand.
    ///
    CNTK_API FunctionPtr Tanh(const Variable& operand, const std::wstring& name = L"");

    ///
    /// Create an instance of the CNTK built-in elementwise linear rectifier operation with the specified input operand.
    ///
    CNTK_API FunctionPtr ReLU(const Variable& operand, const std::wstring& name = L"");

    ///
    /// Create an instance of the CNTK built-in elementwise exp operation with the specified input operand.
    ///
    CNTK_API FunctionPtr Exp(const Variable& operand, const std::wstring& name = L"");

    ///
    /// Create an instance of the CNTK built-in elementwise log operation with the specified input operand.
    ///
    CNTK_API FunctionPtr Log(const Variable& operand, const std::wstring& name = L"");

    ///
    /// Create an instance of the CNTK built-in elementwise square operation with the specified input operand.
    ///
    CNTK_API FunctionPtr Square(const Variable& operand, const std::wstring& name = L"");

    ///
    /// Create an instance of the CNTK built-in elementwise square-root operation with the specified input operand.
    ///
    CNTK_API FunctionPtr Sqrt(const Variable& operand, const std::wstring& name = L"");

    ///
    /// Create an instance of the CNTK built-in elementwise round operation with the specified input operand.
    ///
    CNTK_API FunctionPtr Round(const Variable& operand, const std::wstring& name = L"");

    ///
    /// Create an instance of the CNTK built-in elementwise floor operation with the specified input operand.
    ///
    CNTK_API FunctionPtr Floor(const Variable& operand, const std::wstring& name = L"");

    ///
    /// Create an instance of the CNTK built-in elementwise ceil operation with the specified input operand.
    ///
    CNTK_API FunctionPtr Ceil(const Variable& operand, const std::wstring& name = L"");

    ///
    /// Create an instance of the CNTK built-in elementwise abs operation with the specified input operand.
    ///
    CNTK_API FunctionPtr Abs(const Variable& operand, const std::wstring& name = L"");

    ///
    /// Create an instance of the CNTK built-in elementwise reciprocal operation with the specified input operand.
    ///
    CNTK_API FunctionPtr Reciprocal(const Variable& operand, const std::wstring& name = L"");

    ///
    /// Create an instance of the CNTK built-in softmax operation on specified tensor input operand
    ///
    CNTK_API FunctionPtr Softmax(const Variable& operand, const std::wstring& name = L"");

    ///
    /// Create an instance of the CNTK built-in elementwise tensor addition operation with the specified input operands.

@ -1341,30 +1447,71 @@ namespace CNTK
    CNTK_API FunctionPtr Plus(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name = L"");

    ///
    /// Create an instance of the CNTK built-in elementwise sigmoid operation with the specified input operand.
    /// Create an instance of the CNTK built-in elementwise tensor subtraction operation with the specified input operands.
    ///
    CNTK_API FunctionPtr Sigmoid(const Variable& operand, const std::wstring& name = L"");
    CNTK_API FunctionPtr Minus(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name = L"");

    ///
    /// Create an instance of the CNTK built-in elementwise tanh operation with the specified input operand.
    /// Create an instance of the CNTK built-in elementwise multiplication operation on specified tensor input operands.
    ///
    CNTK_API FunctionPtr Tanh(const Variable& operand, const std::wstring& name = L"");
    CNTK_API FunctionPtr ElementTimes(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name = L"");

    ///
    /// Create an instance of the CNTK built-in elementwise division operation on specified tensor input operands.
    ///
    CNTK_API FunctionPtr ElementDivide(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name = L"");

    ///
    /// Create an instance of the CNTK built-in elementwise equality comparison operation on specified tensor input operands.
    ///
    CNTK_API FunctionPtr Equal(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name = L"");

    ///
    /// Create an instance of the CNTK built-in elementwise not-equal comparison operation on specified tensor input operands.
    ///
    CNTK_API FunctionPtr NotEqual(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name = L"");

    ///
    /// Create an instance of the CNTK built-in elementwise less than comparison operation on specified tensor input operands.
    ///
    CNTK_API FunctionPtr Less(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name = L"");

    ///
    /// Create an instance of the CNTK built-in elementwise less than or equal to comparison operation on specified tensor input operands.
    ///
    CNTK_API FunctionPtr LessEqual(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name = L"");

    ///
    /// Create an instance of the CNTK built-in elementwise greater than comparison operation on specified tensor input operands.
    ///
    CNTK_API FunctionPtr Greater(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name = L"");

    ///
    /// Create an instance of the CNTK built-in elementwise greater than or equal to comparison operation on specified tensor input operands.
    ///
    CNTK_API FunctionPtr GreaterEqual(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name = L"");

    ///
    /// Create an instance of the CNTK built-in matrix multiplication operation with the specified input operands.
    /// TODO: Specify the constraints on the shapes of the operands.
    ///
    CNTK_API FunctionPtr Times(const Variable& leftOperand, const Variable& rightOperand, size_t numOutputAxes = 1, const std::wstring& name = L"");

    ///
    /// Create an instance of the CNTK built-in operation to compute squared-error for specified input operands.
    ///
    CNTK_API FunctionPtr SquaredError(const Variable& prediction, const Variable& targets, const std::wstring& name = L"");

    ///
    /// Create an instance of the CNTK built-in operation to compute cross-entropy with softmax for specified input operands.
    ///
    CNTK_API FunctionPtr CrossEntropyWithSoftmax(const Variable& output, const Variable& labels, const std::wstring& name = L"");
    CNTK_API FunctionPtr CrossEntropyWithSoftmax(const Variable& prediction, const Variable& labels, const std::wstring& name = L"");

    ///
    /// Create an instance of the CNTK built-in operation for computing the classification prediction error for specified operands.
    ///
    CNTK_API FunctionPtr ClassificationError(const Variable& prediction, const Variable& labels, const std::wstring& name = L"");
|
||||
|
||||
///
|
||||
/// Create an instance of the CNTK built-in elementwise exp operation with the specified input operand.
|
||||
///
|
||||
CNTK_API FunctionPtr Exp(const Variable& operand, const std::wstring& name = L"");
|
||||
|
||||
///
|
||||
/// Create an instance of the CNTK built-in operation for getting the past value along the lone dynamic axis of the specified operand.
|
||||
/// Throws an exception of the operand has more than one dynamic axis.
|
||||
|
@@ -1379,21 +1526,582 @@ namespace CNTK
///
CNTK_API FunctionPtr FutureValue(const Variable& initialState, const Variable& operand, size_t stepSize, const std::wstring& name = L"");

///
/// Create an instance of the CNTK built-in elementwise multiplication operation on specified tensor input operands.
///
CNTK_API FunctionPtr ElementTimes(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name = L"");

///
/// Create an instance of the CNTK built-in sum reduction operation on the specified tensor input operand along all the axes.
///
CNTK_API FunctionPtr ReduceSum(const Variable& operand, const std::wstring& name = L"");

///
/// Per dimension mean-variance normalization of the specified input operand.
///
CNTK_API FunctionPtr PerDimMeanVarianceNormalize(const Variable& operand, const NDArrayViewPtr& mean, const NDArrayViewPtr& invStdDev, const std::wstring& name = L"");

///
/// TODO:
///
CNTK_API FunctionPtr Convolution(const Variable& convolutionMap,
                                 const Variable& operand,
                                 const NDShape& strides = {1},
                                 const std::vector<bool>& sharing = {true},
                                 const std::vector<bool>& autoPadding = {true},
                                 const NDShape& lowerPad = {0},
                                 const NDShape& upperPad = {0},
                                 bool transpose = false,
                                 size_t maxTempMemSizeInSamples = 0,
                                 const std::wstring& name = L"");

///
/// TODO:
///
enum class PoolingType
{
    Max,
    Average,
};

///
/// TODO:
///
CNTK_API FunctionPtr Pooling(const Variable& operand,
                             PoolingType poolingType,
                             const NDShape& poolingWindowShape,
                             const NDShape& strides = {1},
                             const std::vector<bool>& autoPadding = {false},
                             const NDShape& lowerPad = {0},
                             const NDShape& upperPad = {0},
                             const std::wstring& name = L"");

///
/// TODO:
///
CNTK_API FunctionPtr BatchNormalization(const Variable& operand,
                                        const Variable& scale,
                                        const Variable& bias,
                                        const Variable& runningMean,
                                        const Variable& runningInvStd,
                                        bool spacial,
                                        double normalizationTimeConstant = 0,
                                        double blendTimeConstant = 0,
                                        double epsilon = 0.00001,
                                        bool useCuDNNEngine = false,
                                        const std::wstring& name = L"");

///
/// Create a new Function instance which just combines the outputs of the specified list of 'operands' Functions such that the 'Outputs' of the
/// new 'Function' are the union of the 'Outputs' of each of the specified 'operands' Functions.
/// E.g. When creating a classification model, typically the CrossEntropy loss Function and the ClassificationError Function comprise the two roots
/// of the computation graph which can be "Combine"d to create a single Function with 2 outputs; viz. CrossEntropy loss and ClassificationError output.
///
CNTK_API FunctionPtr Combine(const std::initializer_list<FunctionPtr>& operands, const std::wstring& name = L"");
CNTK_API FunctionPtr Combine(const std::vector<FunctionPtr>& operands, const std::wstring& name = L"");
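
// A minimal usage sketch of Combine; the 'prediction' and 'labels' Variables are hypothetical
// stand-ins for an existing model graph (illustrative only, not part of the header):
//
//     FunctionPtr loss  = CrossEntropyWithSoftmax(prediction, labels, L"lossFunction");
//     FunctionPtr error = ClassificationError(prediction, labels, L"classificationError");
//     FunctionPtr model = Combine({ loss, error }); // model's Outputs() is the union of both roots' outputs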

///
/// Load a legacy CNTK v1 format model
///
template <typename ElementType>
CNTK_API FunctionPtr LoadLegacyModel(const std::wstring& modelFile, const DeviceDescriptor& computeDevice = DeviceDescriptor::DefaultDevice());

///
/// Save a Composite Function instance to a file in CNTK legacy model format
///
template <typename ElementType>
CNTK_API void SaveAsLegacyModel(const FunctionPtr& rootFunction, const std::wstring& modelFile);
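
// A hedged round-trip sketch; the file name and the float element type are assumptions chosen
// purely for illustration (the element type cannot be deduced and must be given explicitly):
//
//     SaveAsLegacyModel<float>(rootFunction, L"model.v1.dnn");
//     FunctionPtr reloaded = LoadLegacyModel<float>(L"model.v1.dnn");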

///
/// A serializable value represents one of:
/// a) Boolean
/// b) Signed long integer
/// c) Single and double precision floating point values
/// d) NDShape
/// e) vector<DictionaryValue>
///
/// TODO: We need to have native support for DictionaryValue<vector> and DictionaryValue<NDArrayView>.
class DictionaryValue final
{
public:
    enum class Type : unsigned int
    {
        None,
        Bool,
        SizeT,
        Float,
        Double,
        String,
        NDShape,
        Vector,
        Dictionary,
    };

    static const char* TypeName(Type type)
    {
        switch (type)
        {
        case Type::None:
            return "None";
        case Type::Bool:
            return "Bool";
        case Type::SizeT:
            return "SizeT";
        case Type::Float:
            return "Float";
        case Type::Double:
            return "Double";
        case Type::String:
            return "String";
        case Type::NDShape:
            return "NDShape";
        case Type::Vector:
            return "Vector";
        case Type::Dictionary:
            return "Dictionary";
        default:
            LogicError("Unknown DictionaryValue::Type");
        }
    }

public:
    DictionaryValue() : m_valueType(Type::None)
    {
    }

    DictionaryValue(bool value) : m_valueType(GetValueType<bool>())
    {
        m_data.m_boolean = value;
    }

    DictionaryValue(size_t value) : m_valueType(GetValueType<size_t>())
    {
        m_data.m_sizeT = value;
    }

    DictionaryValue(float value) : m_valueType(GetValueType<float>())
    {
        m_data.m_float = value;
    }

    DictionaryValue(double value) : m_valueType(GetValueType<double>())
    {
        m_data.m_double = value;
    }

    DictionaryValue(const wchar_t* value)
        : DictionaryValue(std::wstring(value))
    {}
    template <typename T>
    DictionaryValue(const T& value) : m_valueType(GetValueType<T>())
    {
        static_assert(std::is_same<T, NDShape>::value ||
                      std::is_same<T, std::wstring>::value ||
                      std::is_same<T, std::vector<DictionaryValue>>::value ||
                      std::is_same<T, Dictionary>::value,
                      "Unsupported ValueType");

        AllocateDataPtr(value);
    }

    DictionaryValue(const DictionaryValue& other) : m_valueType(Type::Bool)
    {
        // The m_valueType must have been set to a non-ptr type to prevent an attempt to interpret
        // the underlying uninitialized value as a ptr and free it.
        *this = other;
    }

    DictionaryValue& operator=(const DictionaryValue& other)
    {
        if (this != &other)
        {
            FreeDataPtr();

            m_valueType = other.m_valueType;
            m_data = other.m_data;

            if (other.m_valueType == Type::String)
                AllocateDataPtr(other.GetValue<std::wstring>());
            else if (other.m_valueType == Type::NDShape)
                AllocateDataPtr(other.GetValue<NDShape>());
            else if (other.m_valueType == Type::Vector)
                AllocateDataPtr(other.GetValue<std::vector<DictionaryValue>>());
            else if (other.m_valueType == Type::Dictionary)
                AllocateDataPtr(other.GetValue<Dictionary>());
        }

        return *this;
    }

    ~DictionaryValue()
    {
        FreeDataPtr();
    }

    template <typename T, typename std::enable_if<std::is_same<T, bool>::value>::type* = nullptr>
    const T& GetValue() const
    {
        VerifyType<T>();
        return m_data.m_boolean;
    }

    template <typename T, typename std::enable_if<std::is_same<T, size_t>::value>::type* = nullptr>
    const T& GetValue() const
    {
        VerifyType<T>();
        return m_data.m_sizeT;
    }

    template <typename T, typename std::enable_if<std::is_same<T, float>::value>::type* = nullptr>
    const T& GetValue() const
    {
        VerifyType<T>();
        return m_data.m_float;
    }

    template <typename T, typename std::enable_if<std::is_same<T, double>::value>::type* = nullptr>
    const T& GetValue() const
    {
        VerifyType<T>();
        return m_data.m_double;
    }

    template <typename T, typename std::enable_if<std::is_same<T, NDShape>::value ||
                                                  std::is_same<T, std::wstring>::value ||
                                                  std::is_same<T, std::vector<DictionaryValue>>::value ||
                                                  std::is_same<T, Dictionary>::value>::type* = nullptr>
    const T& GetValue() const
    {
        VerifyType<T>();
        return *(reinterpret_cast<T*>(m_data.m_ptr));
    }

    bool HasValue() const
    {
        return m_valueType != Type::None;
    }

    Type ValueType() const
    {
        return m_valueType;
    }

    friend CNTK_API Microsoft::MSR::CNTK::File& operator>>(Microsoft::MSR::CNTK::File& stream, DictionaryValue& us);
    friend CNTK_API Microsoft::MSR::CNTK::File& operator<<(Microsoft::MSR::CNTK::File& stream, const DictionaryValue& us);

private:
    template <typename T>
    static Type GetValueType()
    {
        static_assert(std::is_same<T, bool>::value ||
                      std::is_same<T, size_t>::value ||
                      std::is_same<T, float>::value ||
                      std::is_same<T, double>::value ||
                      std::is_same<T, std::wstring>::value ||
                      std::is_same<T, NDShape>::value ||
                      std::is_same<T, std::vector<DictionaryValue>>::value ||
                      std::is_same<T, Dictionary>::value,
                      "Unsupported ValueType");

        if (std::is_same<T, bool>::value) return Type::Bool;
        if (std::is_same<T, size_t>::value) return Type::SizeT;
        if (std::is_same<T, float>::value) return Type::Float;
        if (std::is_same<T, double>::value) return Type::Double;
        if (std::is_same<T, std::wstring>::value) return Type::String;
        if (std::is_same<T, NDShape>::value) return Type::NDShape;
        if (std::is_same<T, std::vector<DictionaryValue>>::value) return Type::Vector;
        if (std::is_same<T, Dictionary>::value) return Type::Dictionary;
    }

    template <typename T>
    void VerifyType() const
    {
        if (GetValueType<T>() != m_valueType)
            RuntimeError("Reading a DictionaryValue as the wrong type; Reading as type %s when actual type is %s", typeid(T).name(), DictionaryValue::TypeName(m_valueType));
    }

    template <typename T>
    CNTK_API void AllocateDataPtr(const T& value);

    template <typename T>
    CNTK_API void FreePtrAsType();

    CNTK_API void FreeDataPtr()
    {
        if (m_valueType == Type::String)
            FreePtrAsType<std::wstring>();
        else if (m_valueType == Type::NDShape)
            FreePtrAsType<NDShape>();
        else if (m_valueType == Type::Vector)
            FreePtrAsType<std::vector<DictionaryValue>>();
        else if (m_valueType == Type::Dictionary)
            FreePtrAsType<Dictionary>();
    }

    Type m_valueType;

    union ValueData
    {
        bool m_boolean;
        size_t m_sizeT;
        float m_float;
        double m_double;
        void* m_ptr;
    } m_data;

    const size_t version = 1;
};
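
// A brief sketch of the tagged-union semantics above (illustrative only):
//
//     DictionaryValue v((size_t)10);
//     size_t n = v.GetValue<size_t>(); // OK: matches the stored Type::SizeT
//     v.GetValue<double>();            // throws via VerifyType: stored type is SizeT, not Double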

///
/// A type denoting a dictionary (keyed by Unicode strings) of serializable values (dynamically typed).
///
class Dictionary final
{
    friend inline void AddConfigString(std::wstringstream& s, const DictionaryValue& value, size_t numIndentationSpaces);
    friend class CompositeMinibatchSource;
public:
    CNTK_API Dictionary();
    CNTK_API ~Dictionary();

    CNTK_API Dictionary(const Dictionary&);
    CNTK_API Dictionary& operator=(const Dictionary&);

    CNTK_API Dictionary(Dictionary&& other);
    CNTK_API Dictionary& operator=(Dictionary&& other);

    CNTK_API DictionaryValue& operator[](const wchar_t* key);
    DictionaryValue& operator[](const std::wstring& key)
    {
        return operator[](key.c_str());
    }

    CNTK_API DictionaryValue operator[](const wchar_t* key) const;

    DictionaryValue operator[](const std::wstring& key) const
    {
        return operator[](key.c_str());
    }

    CNTK_API bool Contains(const wchar_t* key) const;

    bool Contains(const std::wstring& key) const
    {
        return Contains(key.c_str());
    }

    friend CNTK_API Microsoft::MSR::CNTK::File& operator>>(Microsoft::MSR::CNTK::File& stream, Dictionary& us);
    friend CNTK_API Microsoft::MSR::CNTK::File& operator<<(Microsoft::MSR::CNTK::File& stream, const Dictionary& us);

private:
    std::shared_ptr<std::unordered_map<std::wstring, DictionaryValue>> m_dictionaryData;
    const size_t version = 1;
};
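
// A short illustrative sketch of Dictionary usage (the key name is an arbitrary example):
//
//     Dictionary config;
//     config[L"stepSize"] = DictionaryValue((size_t)1);
//     size_t step = config.Contains(L"stepSize") ? config[L"stepSize"].GetValue<size_t>() : 1;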

///
/// Abstraction for learning a subset of parameters of a learnable function using first order gradient values.
/// E.g. momentum, AdaGrad, RMSProp, etc. are different types of learners with their own algorithms for
/// learning parameter values using first order gradients.
///
class Learner : public std::enable_shared_from_this<Learner>
{
public:
    //
    // Method to update the parameters associated with this learner. By returning false, this method indicates that
    // learning has stopped for all of the parameters associated with this learner.
    //
    CNTK_API virtual bool Update(const std::unordered_map<Parameter, NDArrayViewPtr>& gradientValues, size_t trainingSampleCount) = 0;

    ///
    /// Returns the set of parameters associated with this learner.
    ///
    const std::unordered_set<Parameter>& Parameters() const { return m_parameters; }

    ///
    /// Optionally overridable method to checkpoint the learner's state.
    ///
    // TODO: move the following two methods into ISerializable interface, make
    // Learner (and all other entities that need checkpointing capability) implement it.
    CNTK_API virtual Dictionary GetCheckpointState() const { return Dictionary(); }

    ///
    /// Optionally overridable method to restore the learner's state from a previous checkpoint.
    ///
    CNTK_API virtual void RestoreFromCheckpoint(const Dictionary& /*checkpoint*/) {}

    virtual ~Learner() {}

protected:
    Learner(const std::unordered_set<Parameter>& parameters)
        : m_parameters(parameters)
    {}

    std::unordered_set<Parameter> m_parameters;

};

///
/// Create an instance of the CNTK built-in SGD learner.
///
/// TODO: add additional SGD parameters here (a collection of learning rate values)
CNTK_API LearnerPtr SGDLearner(const std::unordered_set<Parameter>& parameters, double learningRatePerSample);
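
// Illustrative only; the parameter set and the learning rate below are placeholder assumptions:
//
//     std::unordered_set<Parameter> parameters = ...; // the model's learnable parameters
//     LearnerPtr sgd = SGDLearner(parameters, 0.003 /*learningRatePerSample*/);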

///
/// Create an instance of the CNTK built-in Momentum SGD learner.
///
/// TODO: add additional Momentum parameters here (a collection of momentum rate values)
CNTK_API LearnerPtr MomentumSGDLearner(const std::unordered_set<Parameter>& parameters);

///
/// Create an instance of the CNTK built-in Nesterov's accelerated SGD learner.
///
CNTK_API LearnerPtr NesterovLearner(const std::unordered_set<Parameter>& parameters);

///
/// Create an instance of the CNTK built-in AdaGrad learner.
///
CNTK_API LearnerPtr AdaGradLearner(const std::unordered_set<Parameter>& parameters, bool needAveMultiplier = true);

///
/// Create an instance of the CNTK built-in FSAdaGrad (improved AdaGrad) learner.
///
CNTK_API LearnerPtr FSAdaGradLearner(const std::unordered_set<Parameter>& parameters);

///
/// Create an instance of the CNTK built-in RMSProp learner.
///
CNTK_API LearnerPtr RMSPropLearner(const std::unordered_set<Parameter>& parameters,
                                   double gamma,
                                   double inc,
                                   double dec,
                                   double max,
                                   double min,
                                   bool needAveMultiplier = true);
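
// RMSProp takes its hyperparameters positionally; naming them at the call site keeps the intent
// readable. All values below are placeholder assumptions, not recommended settings:
//
//     LearnerPtr rmsProp = RMSPropLearner(parameters,
//                                         /*gamma*/ 0.95, /*inc*/ 1.2, /*dec*/ 0.7,
//                                         /*max*/ 10.0, /*min*/ 0.1);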

///
/// Trainer is the top-level abstraction responsible for the orchestration of the training of a model
/// using the specified learners and training data either explicitly supplied as Value objects or from
/// a MinibatchSource object.
///
class Trainer
{
public:
    ///
    /// Construct a Trainer to train the specified 'model' with the specified 'trainingLoss' Variable as the training criterion
    /// and using the specified set of 'parameterLearners' for updating the model's parameters using computed gradients.
    ///
    CNTK_API Trainer(const FunctionPtr& model, const Variable& trainingLoss, const std::unordered_set<LearnerPtr>& parameterLearners);

    ///
    /// Optimize model parameters using the specified 'arguments' minibatch of training samples.
    /// Returns false if all parameter learners indicate end of learning (through their Update method's return value).
    ///
    CNTK_API bool TrainMinibatch(const std::unordered_map<Variable, ValuePtr>& arguments, const DeviceDescriptor& computeDevice = DeviceDescriptor::DefaultDevice());

    ///
    /// Model being trained by 'this' Trainer.
    ///
    FunctionPtr Model() const { return m_model; }

    ///
    /// Variable of the Trainer's model representing the training loss that is used as the optimization
    /// criterion for learning the model's parameters.
    ///
    Variable TrainingLossVariable() const { return m_trainingLossVar; }

    ///
    /// Returns the Value of the training loss variable of the model corresponding to the last minibatch trained with.
    ///
    ValuePtr PreviousMinibatchTrainingLossValue() const { return m_prevMinibatchTrainingLossValue; }

    ///
    /// Learners associated with this Trainer for updating the model's parameters using computed gradients.
    ///
    const std::unordered_set<LearnerPtr>& ParameterLearners() const { return m_parameterLearners; }

private:
    FunctionPtr m_model;
    Variable m_trainingLossVar;
    ValuePtr m_prevMinibatchTrainingLossValue;
    std::unordered_set<LearnerPtr> m_parameterLearners;
};
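
// A minimal training-step sketch; the input/label Variables and Values are illustrative
// assumptions built elsewhere:
//
//     Trainer trainer(model, trainingLossVar, { sgd });
//     std::unordered_map<Variable, ValuePtr> arguments = { { inputVar, inputValue }, { labelsVar, labelsValue } };
//     bool learning = trainer.TrainMinibatch(arguments); // false once all learners report end of learning
//     ValuePtr lastLoss = trainer.PreviousMinibatchTrainingLossValue();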

///
/// Describes an input stream: its name, element type, storage, etc.
///
struct StreamInfo
{
    std::wstring m_name;           // Unique name of the stream
    size_t m_id;                   // Unique identifier of the stream
    StorageFormat m_storageFormat; // Storage format of the stream
    DataType m_elementType;        // Element type of the stream
    NDShape m_sampleLayout;        // Layout of the sample for the stream
};

inline bool operator==(const StreamInfo& left, const StreamInfo& right)
{
    return ((left.m_id == right.m_id) &&
            (left.m_name == right.m_name) &&
            (left.m_storageFormat == right.m_storageFormat) &&
            (left.m_elementType == right.m_elementType) &&
            (left.m_sampleLayout == right.m_sampleLayout));
}
}

namespace std {
    template <> struct hash<CNTK::StreamInfo>
    {
        size_t operator()(const CNTK::StreamInfo& x) const
        {
            return std::hash<size_t>()(x.m_id);
        }
    };
}

namespace CNTK
{
struct MinibatchData
{
    size_t m_numSequences;
    size_t m_numSamples;
    ValuePtr m_data;
};

///
/// Abstraction for generating minibatches of samples for training/evaluation.
///
class MinibatchSource : public std::enable_shared_from_this<MinibatchSource>
{
public:
    ///
    /// Describes the streams 'this' MinibatchSource produces.
    ///
    virtual const std::unordered_set<StreamInfo>& StreamInfos() = 0;

    ///
    /// Reads a minibatch that contains data across all input streams.
    /// The minibatchData argument specifies the desired minibatch size for each stream of the reader either in terms of #sequences or
    /// #samples or both. In case the size is specified in terms of both #sequences and #samples, the smaller of the 2 is taken. The actual
    /// returned size of the minibatch is the min across all streams. Also the requested MB size fields in the maps are updated by the
    /// MinibatchSource to contain the actual #sequences and #samples in the returned minibatch for the corresponding stream.
    /// The return value indicates if the MinibatchSource will return any further data in subsequent calls of this function.
    ///
    virtual std::unordered_map<StreamInfo, MinibatchData> GetNextMinibatch(const std::unordered_map<StreamInfo, std::pair<size_t, size_t>>& perStreamMBSizeLimits,
                                                                           const DeviceDescriptor& device = DeviceDescriptor::DefaultDevice()) = 0;

    // TODO: Methods to save and restore from checkpoints

    // Disallow copy and move construction and assignment
    MinibatchSource(const MinibatchSource&) = delete; MinibatchSource(MinibatchSource&&) = delete; MinibatchSource& operator=(const MinibatchSource&) = delete; MinibatchSource& operator=(MinibatchSource&&) = delete;

protected:
    MinibatchSource() {}
};
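
// A hedged read sketch; 'featureStreamInfo' is assumed to come from StreamInfos(), and the
// interpretation of the per-stream pair as (#sequences, #samples) limits is an assumption based
// on the comment on GetNextMinibatch above:
//
//     std::unordered_map<StreamInfo, std::pair<size_t, size_t>> limits = { { featureStreamInfo, { 1, 256 } } };
//     auto minibatch = source->GetNextMinibatch(limits);
//     const MinibatchData& featureData = minibatch[featureStreamInfo]; // m_numSequences, m_numSamples, m_data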

///
/// Instantiate the CNTK built-in composite minibatch source.
///
CNTK_API MinibatchSourcePtr CreateCompositeMinibatchSource(const Dictionary& configuration);

///
/// Compute the per dimension means and variances for each of the specified streams using data from the specified minibatchSource.
///
CNTK_API void ComputeInputPerDimMeansAndInvStdDevs(const MinibatchSourcePtr& minibatchSource,
                                                   std::unordered_map<StreamInfo, std::pair<NDArrayViewPtr, NDArrayViewPtr>>& computedMeanAndVariances,
                                                   const DeviceDescriptor& device = DeviceDescriptor::CPUDevice());
}

@@ -47,6 +47,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {

template <typename ElementType>
class ComputationNode;

class File;
}}}

// TODO: The following should be reconciled with the equivalent code in the CNTK implementation

@@ -100,9 +102,15 @@ namespace CNTK

// RuntimeError - throw a std::runtime_error with a formatted error string
#ifndef _MSC_VER // gcc __attribute__((format(printf())) does not percolate through variadic templates; so must go the macro route
#ifndef RuntimeError
#define RuntimeError ThrowFormatted<std::runtime_error>
#endif
#ifndef LogicError
#define LogicError ThrowFormatted<std::logic_error>
#endif
#ifndef InvalidArgument
#define InvalidArgument ThrowFormatted<std::invalid_argument>
#endif
#else
template <class... _Types>
__declspec_noreturn inline void RuntimeError(const char* format, _Types&&... _Args)
@@ -158,4 +166,12 @@ namespace CNTK

class Function;
typedef std::shared_ptr<Function> FunctionPtr;

class Learner;
typedef std::shared_ptr<Learner> LearnerPtr;

class Dictionary;

class MinibatchSource;
typedef std::shared_ptr<MinibatchSource> MinibatchSourcePtr;
}

@@ -0,0 +1,274 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//

#include "stdafx.h"
#include "CNTKLibrary.h"
#include "Function.h"
#include "ComputationNetworkBuilder.h"
#include "Utils.h"
#include "ComputationNode.h"
#include "InputAndParamNodes.h"
#include "NonlinearityNodes.h"
#include "LinearAlgebraNodes.h"
#include "RecurrentNodes.h"
#include "EvaluationNodes.h"
#include "TrainingNodes.h"

using namespace Microsoft::MSR::CNTK;

namespace CNTK
{
template <typename ElementType>
Variable GetVariable(const ComputationNodeBasePtr& node,
                     std::unordered_map<ComputationNodeBasePtr, Variable>& nodeToVariableMap,
                     std::unordered_map<Placeholder, Variable>& placeholderReplacements,
                     std::unordered_set<FunctionPtr>& allPrimitiveFunctions)
{
    auto iter = nodeToVariableMap.find(node);
    if (iter != nodeToVariableMap.end())
        return iter->second;

    Variable var;
    NDShape varShape = AsNDShape(node->GetSampleLayout());
    // The CNTK sample layouts may have trailing axes with dimension size of 1 which are automatically
    // added when converting from NDShape to CNTK internal TensorShapes and are not present in the original
    // shapes specified by the user. These should be truncated.
    if (varShape.NumAxes() <= 2)
    {
        size_t numTrailingDimsToRemove = 0;
        for (int i = varShape.NumAxes() - 1; i >= 0; --i)
        {
            if (varShape[i] == 1)
                numTrailingDimsToRemove++;
            else
                break;
        }
        varShape = varShape.SubShape(0, varShape.NumAxes() - numTrailingDimsToRemove);
    }
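
    // E.g. a Variable created with shape [5] may round-trip through the internal TensorShape as
    // [5 x 1]; dropping trailing singleton axes recovers the user-specified shape. (Illustrative
    // comment; the concrete shapes are an assumed example.)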

    if (node->IsLeaf())
    {
        if (node->Is<InputValueBase<ElementType>>())
        {
            auto inputNode = node->As<InputValueBase<ElementType>>();
            bool isSparse = node->Is<SparseInputValue<ElementType>>();
            if (node->HasMBLayout())
            {
                // TODO: Currently only default dynamic axis is supported
                const std::wstring defaultCNTKDynamicAxisName = L"";
                if (inputNode->GetRequestedDynamicAxis() != defaultCNTKDynamicAxisName)
                    LogicError("Found dynamic axis named '%S' while currently only default dynamic axis named '%S' is supported!", node->GetMBLayout()->GetAxisName(), defaultCNTKDynamicAxisName.c_str());

                var = Variable(varShape, isSparse, AsDataType<ElementType>(), node->GetLearningRateMultiplier() != 0, node->GetName());
            }
            else
            {
                // TODO: Allow creating inputs without a dynamic axis
                LogicError("Found InputNode with no dynamic axis which is currently unsupported");
            }
        }
        else if (node->Is<LearnableParameter<ElementType>>())
        {
            bool isConstant = (node->GetLearningRateMultiplier() == 0);
            auto& matrix = node->As<ComputationNode<ElementType>>()->Value();
            auto tensorView = new TensorView<ElementType>(std::make_shared<Matrix<ElementType>>(matrix.AsReference()), node->GetSampleLayout());
            NDArrayViewPtr parameterValue = MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), AsDeviceDescriptor(matrix.GetDeviceId()), AsStorageFormat(matrix.GetFormat()), varShape, false, tensorView);
            if (isConstant)
                var = Constant(parameterValue, node->GetName());
            else
                var = Parameter(parameterValue, node->GetName());
        }
        else
            LogicError("CNTK::LoadLegacyModel: Unsupported legacy CNTK node named '%S'", node->NodeName().c_str());
    }
    else
    {
        // This is a non-leaf node and maps to a primitive Function
        auto placeholderVar = Placeholder(varShape);
        nodeToVariableMap[node] = placeholderVar;

        std::vector<Variable> inputVars(node->GetNumInputs());
        for (size_t i = 0; i < inputVars.size(); ++i)
        {
            inputVars[i] = GetVariable<ElementType>(node->Input(i), nodeToVariableMap, placeholderReplacements, allPrimitiveFunctions);
            if (inputVars[i].IsPlaceholder())
                placeholderReplacements[Placeholder(inputVars[i])] = Variable();
        }

        PrimitiveOpType opType;
        Dictionary primitiveFunctionConfigParameters;
        if (node->OperationName() == OperationNameOf(NegateNode))
            opType = PrimitiveOpType::Negate;
        else if (node->OperationName() == OperationNameOf(SigmoidNode))
            opType = PrimitiveOpType::Sigmoid;
        else if (node->OperationName() == OperationNameOf(TanhNode))
            opType = PrimitiveOpType::Tanh;
        else if (node->OperationName() == OperationNameOf(RectifiedLinearNode))
            opType = PrimitiveOpType::ReLU;
        else if (node->OperationName() == OperationNameOf(ExpNode))
            opType = PrimitiveOpType::Exp;
        else if (node->OperationName() == OperationNameOf(LogNode))
            opType = PrimitiveOpType::Log;
        else if (node->OperationName() == OperationNameOf(SqrtNode))
            opType = PrimitiveOpType::Sqrt;
        else if (node->OperationName() == OperationNameOf(FloorNode))
            opType = PrimitiveOpType::Floor;
        else if (node->OperationName() == OperationNameOf(AbsNode))
            opType = PrimitiveOpType::Abs;
        else if (node->OperationName() == OperationNameOf(ReciprocalNode))
            opType = PrimitiveOpType::Reciprocal;
        else if (node->OperationName() == OperationNameOf(SoftmaxNode))
            opType = PrimitiveOpType::Softmax;
        else if (node->OperationName() == OperationNameOf(PlusNode))
            opType = PrimitiveOpType::Plus;
        else if (node->OperationName() == OperationNameOf(MinusNode))
            opType = PrimitiveOpType::Minus;
        else if (node->OperationName() == OperationNameOf(ElementTimesNode))
            opType = PrimitiveOpType::ElementTimes;
        else if (node->OperationName() == OperationNameOf(EqualNode))
            opType = PrimitiveOpType::Equal;
        else if (node->OperationName() == OperationNameOf(NotEqualNode))
            opType = PrimitiveOpType::NotEqual;
        else if (node->OperationName() == OperationNameOf(LessNode))
            opType = PrimitiveOpType::Less;
        else if (node->OperationName() == OperationNameOf(LessEqualNode))
            opType = PrimitiveOpType::LessEqual;
        else if (node->OperationName() == OperationNameOf(GreaterNode))
            opType = PrimitiveOpType::Greater;
        else if (node->OperationName() == OperationNameOf(GreaterEqualNode))
            opType = PrimitiveOpType::GreaterEqual;
        else if (node->OperationName() == OperationNameOf(TimesNode))
        {
            primitiveFunctionConfigParameters[L"numOutputAxes"] = DictionaryValue((size_t)node->As<TimesNode<ElementType>>()->OutputRank());
            opType = PrimitiveOpType::Times;
        }
        else if (node->OperationName() == OperationNameOf(PastValueNode))
        {
            if (inputVars.size() == 1)
            {
                auto initialStateVar = Constant({}, node->As<PastValueNode<ElementType>>()->InitialActivationValue(), AsDeviceDescriptor(node->GetDeviceId()));
                inputVars.insert(inputVars.begin(), initialStateVar);
            }
            primitiveFunctionConfigParameters[L"stepSize"] = DictionaryValue((size_t)node->As<PastValueNode<ElementType>>()->TimeStep());
            opType = PrimitiveOpType::PastValue;
        }
        else if (node->OperationName() == OperationNameOf(FutureValueNode))
        {
            if (inputVars.size() == 1)
            {
                auto initialStateVar = Constant({}, node->As<FutureValueNode<ElementType>>()->InitialActivationValue(), AsDeviceDescriptor(node->GetDeviceId()));
                inputVars.insert(inputVars.begin(), initialStateVar);
            }
            primitiveFunctionConfigParameters[L"stepSize"] = DictionaryValue((size_t)node->As<FutureValueNode<ElementType>>()->TimeStep());
            opType = PrimitiveOpType::FutureValue;
        }
        else if (node->OperationName() == OperationNameOf(SquareErrorNode))
            opType = PrimitiveOpType::SquaredError;
        else if (node->OperationName() == OperationNameOf(CrossEntropyWithSoftmaxNode))
        {
            std::swap(inputVars[0], inputVars[1]);
            opType = PrimitiveOpType::CrossEntropyWithSoftmax;
        }
        else if (node->OperationName() == OperationNameOf(ErrorPredictionNode))
        {
            std::swap(inputVars[0], inputVars[1]);
            opType = PrimitiveOpType::ClassificationError;
        }
        else if (node->OperationName() == OperationNameOf(SumElementsNode))
            opType = PrimitiveOpType::ReduceSum;
        else if (node->OperationName() == OperationNameOf(ConvolutionNode))
        {
            auto convolutionNode = node->As<ConvolutionNode<ElementType>>();
            primitiveFunctionConfigParameters[L"strides"] = AsNDShape(convolutionNode->Strides());
            primitiveFunctionConfigParameters[L"sharing"] = AsDictionaryValueVector(convolutionNode->Sharing());
            primitiveFunctionConfigParameters[L"autoPadding"] = AsDictionaryValueVector(convolutionNode->AutoPad());
            primitiveFunctionConfigParameters[L"lowerPad"] = AsNDShape(convolutionNode->LowerPad());
            primitiveFunctionConfigParameters[L"upperPad"] = AsNDShape(convolutionNode->UpperPad());
            primitiveFunctionConfigParameters[L"transpose"] = convolutionNode->Transpose();
            primitiveFunctionConfigParameters[L"maxTempMemSizeInSamples"] = convolutionNode->MaxTempMemSizeInSamples();

            opType = PrimitiveOpType::Convolution;
        }
        else if (node->OperationName() == OperationNameOf(PoolingNode))
        {
            auto poolingNode = node->As<PoolingNode<ElementType>>();
            primitiveFunctionConfigParameters[L"poolingType"] = (size_t)(AsPoolingType(poolingNode->PoolingKind()));
            primitiveFunctionConfigParameters[L"poolingWindowShape"] = AsNDShape(poolingNode->KernelShape());
            primitiveFunctionConfigParameters[L"strides"] = AsNDShape(poolingNode->Strides());
            primitiveFunctionConfigParameters[L"autoPadding"] = AsDictionaryValueVector(poolingNode->AutoPad());
            primitiveFunctionConfigParameters[L"lowerPad"] = AsNDShape(poolingNode->LowerPad());
            primitiveFunctionConfigParameters[L"upperPad"] = AsNDShape(poolingNode->UpperPad());

            opType = PrimitiveOpType::Pooling;
        }
        else if (node->OperationName() == OperationNameOf(BatchNormalizationNode))
        {
            auto batchNormalizationNode = node->As<BatchNormalizationNode<ElementType>>();
            primitiveFunctionConfigParameters[L"spacial"] = batchNormalizationNode->Spatial();
            primitiveFunctionConfigParameters[L"normalizationTimeConstant"] = batchNormalizationNode->NormalizationTimeConstant();
            primitiveFunctionConfigParameters[L"blendTimeConstant"] = batchNormalizationNode->BlendTimeConstant();
            primitiveFunctionConfigParameters[L"epsilon"] = batchNormalizationNode->Epsilon();
            primitiveFunctionConfigParameters[L"useCuDNNEngine"] = !batchNormalizationNode->UseCNTKEngine();

            opType = PrimitiveOpType::BatchNormalization;
        }
        else
            LogicError("Unsupported ComputationNode with OperationName='%S' found when loading legacy CNTK model", node->OperationName().c_str());

        FunctionPtr primitiveFunction = MakeSharedObject<PrimitiveFunction>(opType, inputVars, std::move(primitiveFunctionConfigParameters), node->GetName());
        allPrimitiveFunctions.insert(primitiveFunction);
        var = primitiveFunction->Output();
        if (placeholderReplacements.find(placeholderVar) != placeholderReplacements.end())
            placeholderReplacements[placeholderVar] = var;
    }

    nodeToVariableMap[node] = var;
    return var;
}

template <typename ElementType>
FunctionPtr LoadLegacyModel(const std::wstring& modelFile, const DeviceDescriptor& computeDevice /*= DeviceDescriptor::DefaultDevice()*/)
{
    ComputationNetworkPtr net = make_shared<ComputationNetwork>(AsCNTKImplDeviceId(computeDevice));
    net->Load<ElementType>(modelFile);

    // Now traverse the model and construct the Function graph
    std::unordered_map<ComputationNodeBasePtr, Variable> nodeToVariableMap;
    std::unordered_map<Placeholder, Variable> placeholderReplacements;
    std::unordered_set<FunctionPtr> allPrimitiveFunctions;
    std::vector<FunctionPtr> rootFunctions;
    auto& networkRoots = net->RootNodes();
    for (auto& rootNode : networkRoots)
    {
        if (rootNode->IsLeaf())
            continue;

        rootFunctions.push_back(GetVariable<ElementType>(rootNode, nodeToVariableMap, placeholderReplacements, allPrimitiveFunctions).Owner());
    }

    auto rootComposite = Combine(rootFunctions);
    rootComposite->ReplacePlaceholders(placeholderReplacements);

    return rootComposite;
}

template <typename ElementType>
void SaveAsLegacyModel(const FunctionPtr& rootFunction, const std::wstring& modelFile)
{
    CompositeFunction* compositeFunction = dynamic_cast<CompositeFunction*>(rootFunction.get());
    if (compositeFunction == nullptr)
        InvalidArgument("Primitive (aka non-composite) Function instances cannot be saved");

    auto computationNetwork = compositeFunction->GetComputationNetwork<ElementType>(DeviceDescriptor::CPUDevice(), {});
    computationNetwork->Save(modelFile);
}

// Template instantiations
template CNTK_API FunctionPtr LoadLegacyModel<float>(const std::wstring& modelFile, const DeviceDescriptor& computeDevice /*= DeviceDescriptor::DefaultDevice()*/);
template CNTK_API FunctionPtr LoadLegacyModel<double>(const std::wstring& modelFile, const DeviceDescriptor& computeDevice /*= DeviceDescriptor::DefaultDevice()*/);

template CNTK_API void SaveAsLegacyModel<float>(const FunctionPtr& rootFunction, const std::wstring& modelFile);
template CNTK_API void SaveAsLegacyModel<double>(const FunctionPtr& rootFunction, const std::wstring& modelFile);
}

@@ -56,7 +56,7 @@
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<AdditionalIncludeDirectories>.\API;$(SolutionDir)Source\SGDLib;$(SolutionDir)Source\ComputationNetworkLib;$(SolutionDir)Source\SequenceTrainingLib;$(SolutionDir)Source\Math;$(SolutionDir)Source\Common\Include;$(SolutionDir)Source\CNTK\BrainScript;$(SolutionDir)Source\ActionsLib;$(MSMPI_INC);$(NvmlInclude)</AdditionalIncludeDirectories>
<AdditionalIncludeDirectories>.\API;$(SolutionDir)Source\SGDLib;$(SolutionDir)Source\Readers\ReaderLib;$(SolutionDir)Source\ComputationNetworkLib;$(SolutionDir)Source\SequenceTrainingLib;$(SolutionDir)Source\Math;$(SolutionDir)Source\Common\Include;$(SolutionDir)Source\CNTK\BrainScript;$(SolutionDir)Source\ActionsLib;$(MSMPI_INC);$(NvmlInclude)</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<AdditionalLibraryDirectories>$(SolutionDir)Source\ComputationNetworkLib;$(SolutionDir)Source\Math;$(MSMPI_LIB64);$(SolutionDir)$(Platform)\$(Configuration);$(NvmlLibPath)</AdditionalLibraryDirectories>

@@ -75,7 +75,7 @@
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>ComputationNetworkLib.lib; Math.lib; Common.lib; SequenceTrainingLib.lib; kernel32.lib; user32.lib; shell32.lib; %(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>ComputationNetworkLib.lib; Math.lib; Common.lib; SequenceTrainingLib.lib; ReaderLib.lib; kernel32.lib; user32.lib; shell32.lib; %(AdditionalDependencies)</AdditionalDependencies>
<DelayLoadDLLs>Math.dll; nvml.dll; $(CudaRuntimeDll)</DelayLoadDLLs>
</Link>
</ItemDefinitionGroup>

@@ -99,7 +99,7 @@
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<AdditionalDependencies>ComputationNetworkLib.lib; Math.lib; Common.lib; kernel32.lib; user32.lib; shell32.lib; SequenceTrainingLib.lib; %(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>ComputationNetworkLib.lib; Math.lib; Common.lib; ReaderLib.lib; kernel32.lib; user32.lib; shell32.lib; SequenceTrainingLib.lib; %(AdditionalDependencies)</AdditionalDependencies>
<Profile>true</Profile>
<DelayLoadDLLs>Math.dll; nvml.dll; $(CudaRuntimeDll)</DelayLoadDLLs>
</Link>

@@ -128,11 +128,14 @@
<ClInclude Include="API\CNTKLibrary.h" />
<ClInclude Include="API\CNTKLibraryInternals.h" />
<ClInclude Include="Function.h" />
<ClInclude Include="Learner.h" />
<ClInclude Include="MinibatchSource.h" />
<ClInclude Include="Utils.h" />
<ClInclude Include="stdafx.h" />
<ClInclude Include="targetver.h" />
</ItemGroup>
<ItemGroup>
<ClCompile Include="BackCompat.cpp" />
<ClCompile Include="Common.cpp" />
<ClCompile Include="dllmain.cpp">
<CompileAsManaged>false</CompileAsManaged>

@@ -140,11 +143,14 @@
</PrecompiledHeader>
</ClCompile>
<ClCompile Include="Function.cpp" />
<ClCompile Include="Learner.cpp" />
<ClCompile Include="MinibatchSource.cpp" />
<ClCompile Include="NDArrayView.cpp" />
<ClCompile Include="NDMask.cpp" />
<ClCompile Include="stdafx.cpp">
<PrecompiledHeader>Create</PrecompiledHeader>
</ClCompile>
<ClCompile Include="Trainer.cpp" />
<ClCompile Include="Utils.cpp" />
<ClCompile Include="Value.cpp" />
<ClCompile Include="Variable.cpp" />

@@ -10,6 +10,10 @@
<ClCompile Include="Variable.cpp" />
<ClCompile Include="Utils.cpp" />
<ClCompile Include="NDMask.cpp" />
<ClCompile Include="Learner.cpp" />
<ClCompile Include="BackCompat.cpp" />
<ClCompile Include="Trainer.cpp" />
<ClCompile Include="MinibatchSource.cpp" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="stdafx.h" />

@@ -22,6 +26,8 @@
<Filter>API</Filter>
</ClInclude>
<ClInclude Include="Function.h" />
<ClInclude Include="Learner.h" />
<ClInclude Include="MinibatchSource.h" />
</ItemGroup>
<ItemGroup>
<Filter Include="API">

@@ -117,6 +117,7 @@ namespace CNTK
if (variable.IsParameter() || variable.IsConstant())
{
    computationNodePtr = builder.CreateLearnableParameter(variable.Name(), AsTensorShape(variable.Shape()));
    network->InitLearnableParameters(computationNodePtr, L"fixedValue", 0); // must call this to follow protocol; can overwrite later
    if (!variable.NeedsGradient())
        computationNodePtr->SetLearningRateMultiplier(0.0);

@@ -126,7 +127,13 @@ namespace CNTK
}
else if (variable.IsInput())
{
    // TODO: Specify dynamic axis
    // TODO: Support inputs with > 1 dynamic axes
    if (variable.DynamicAxes().size() != 1)
        LogicError("Currently only Input variables with one dynamic axis are supported");

    auto dynamicAxis = variable.DynamicAxes()[0];
    if (dynamicAxis != Axis::DefaultDynamicAxis())
        LogicError("Currently only Input variables with DefaultDynamicAxis are supported");
    if (IsSparseInput(variable))
        computationNodePtr = builder.CreateSparseInputNode(variable.Name(), AsTensorShape(variable.Shape()));
    else

@@ -164,6 +171,7 @@ namespace CNTK
if (dynamic_cast<PrimitiveFunction*>(function))
{
    PrimitiveFunction* primitiveFunction = dynamic_cast<PrimitiveFunction*>(function);
    auto functionConfig = primitiveFunction->FunctionConfig();

    // Create the nodes corresponding to the inputs
    auto functionInputs = primitiveFunction->Inputs();
@@ -180,12 +188,8 @@ namespace CNTK
PrimitiveOpType op = primitiveFunction->OpType();
switch (op)
{
case PrimitiveOpType::Plus:
    computationNodePtr = builder.Plus(input0Node, input1Node, function->Name());
    break;
case PrimitiveOpType::Times:
    // TODO: The output rank of the times operation is currently hardcoded to 1
    computationNodePtr = builder.Times(input0Node, input1Node, 1, function->Name());
case PrimitiveOpType::Negate:
    computationNodePtr = builder.Negate(input0Node, function->Name());
    break;
case PrimitiveOpType::Sigmoid:
    computationNodePtr = builder.Sigmoid(input0Node, function->Name());

@@ -193,15 +197,100 @@ namespace CNTK
case PrimitiveOpType::Tanh:
    computationNodePtr = builder.Tanh(input0Node, function->Name());
    break;
case PrimitiveOpType::ReLU:
    computationNodePtr = builder.RectifiedLinear(input0Node, function->Name());
    break;
case PrimitiveOpType::Exp:
    computationNodePtr = builder.Exp(input0Node, function->Name());
    break;
case PrimitiveOpType::Log:
    computationNodePtr = builder.Log(input0Node, function->Name());
    break;
case PrimitiveOpType::Sqrt:
    computationNodePtr = builder.Sqrt(input0Node, function->Name());
    break;
case PrimitiveOpType::Floor:
    computationNodePtr = builder.Floor(input0Node, function->Name());
    break;
case PrimitiveOpType::Abs:
    computationNodePtr = builder.Abs(input0Node, function->Name());
    break;
case PrimitiveOpType::Reciprocal:
    computationNodePtr = builder.Reciprocal(input0Node, function->Name());
    break;
case PrimitiveOpType::Softmax:
    if (functionInputs[0].Shape().NumAxes() > 1)
        InvalidArgument("Softmax operation can only be applied to a 1D input");

    computationNodePtr = builder.Softmax(input0Node, function->Name());
    break;
case PrimitiveOpType::Pooling:
{
    PoolingType poolingType = (PoolingType)(functionConfig[L"poolingType"].GetValue<size_t>());
    auto poolingWindowsShape = functionConfig[L"poolingWindowShape"].GetValue<NDShape>();
    auto strides = functionConfig[L"strides"].GetValue<NDShape>();
    auto lowerPad = functionConfig[L"lowerPad"].GetValue<NDShape>();
    auto upperPad = functionConfig[L"upperPad"].GetValue<NDShape>();
    auto autoPadding = AsBasicElementTypeVector<bool>(functionConfig[L"autoPadding"].GetValue<std::vector<DictionaryValue>>());
    computationNodePtr = builder.Pooling(input0Node, AsCNTKPoolKind(poolingType), AsTensorShape(poolingWindowsShape, true), AsTensorShape(strides, true), autoPadding, AsTensorShape(lowerPad, true), AsTensorShape(upperPad, true), ImageLayoutKind::CHW, function->Name());
    break;
}
case PrimitiveOpType::Plus:
    computationNodePtr = builder.Plus(input0Node, input1Node, function->Name());
    break;
case PrimitiveOpType::Minus:
    computationNodePtr = builder.Minus(input0Node, input1Node, function->Name());
    break;
case PrimitiveOpType::ElementTimes:
    computationNodePtr = builder.ElementTimes(input0Node, input1Node, function->Name());
    break;
case PrimitiveOpType::Equal:
    computationNodePtr = builder.Equal(input0Node, input1Node, function->Name());
    break;
case PrimitiveOpType::NotEqual:
    computationNodePtr = builder.NotEqual(input0Node, input1Node, function->Name());
    break;
case PrimitiveOpType::Less:
    computationNodePtr = builder.Less(input0Node, input1Node, function->Name());
    break;
case PrimitiveOpType::LessEqual:
    computationNodePtr = builder.LessEqual(input0Node, input1Node, function->Name());
    break;
case PrimitiveOpType::Greater:
    computationNodePtr = builder.Greater(input0Node, input1Node, function->Name());
    break;
case PrimitiveOpType::GreaterEqual:
    computationNodePtr = builder.GreaterEqual(input0Node, input1Node, function->Name());
    break;
case PrimitiveOpType::Times:
{
    size_t numOutputAxes = functionConfig[L"numOutputAxes"].GetValue<size_t>();
    computationNodePtr = builder.Times(input0Node, input1Node, numOutputAxes, function->Name());
    break;
}
case PrimitiveOpType::Convolution:
{
    NDShape outputMapCount, kernelShape;
    std::tie(outputMapCount, kernelShape) = GetConvolutionOutputMapCountAndKernelShape(functionInputs[0].Shape(), functionInputs[1].Shape());
    auto strides = functionConfig[L"strides"].GetValue<NDShape>();
    auto lowerPad = functionConfig[L"lowerPad"].GetValue<NDShape>();
    auto upperPad = functionConfig[L"upperPad"].GetValue<NDShape>();
    auto sharing = AsBasicElementTypeVector<bool>(functionConfig[L"sharing"].GetValue<std::vector<DictionaryValue>>());
    auto autoPadding = AsBasicElementTypeVector<bool>(functionConfig[L"autoPadding"].GetValue<std::vector<DictionaryValue>>());
    auto transpose = functionConfig[L"transpose"].GetValue<bool>();
    auto maxTempMemSizeInSamples = functionConfig[L"maxTempMemSizeInSamples"].GetValue<size_t>();
    computationNodePtr = builder.Convolution(input0Node, input1Node, AsTensorShape(kernelShape, true), AsTensorShape(outputMapCount, true), AsTensorShape(strides, true), sharing, autoPadding, AsTensorShape(lowerPad, true), AsTensorShape(upperPad, true), transpose, ImageLayoutKind::CHW, maxTempMemSizeInSamples, function->Name());
    break;
}
case PrimitiveOpType::SquaredError:
    computationNodePtr = builder.SquareError(input0Node, input1Node, function->Name());
    break;
case PrimitiveOpType::CrossEntropyWithSoftmax:
    computationNodePtr = builder.CrossEntropyWithSoftmax(input1Node, input0Node, function->Name());
    break;
case PrimitiveOpType::ClassificationError:
    computationNodePtr = builder.ErrorPrediction(input1Node, input0Node, function->Name());
    break;
case PrimitiveOpType::Exp:
    computationNodePtr = builder.Exp(input0Node, function->Name());
    break;
case PrimitiveOpType::PastValue:
case PrimitiveOpType::FutureValue:
{
@@ -231,9 +320,6 @@ namespace CNTK

    break;
}
case PrimitiveOpType::ElementTimes:
    computationNodePtr = builder.ElementTimes(input0Node, input1Node, function->Name());
    break;
case PrimitiveOpType::ReduceSum:
{
    // TODO: Use the new ReduceElements node instead of the legacy SumElements node for reduction. Currently ReduceElements has incorrect MBLayout inference.

@@ -241,6 +327,23 @@ namespace CNTK
    computationNodePtr = builder.Sum(input0Node, function->Name());
    break;
}
case PrimitiveOpType::BatchNormalization:
{
    auto spacial = functionConfig[L"spacial"].GetValue<bool>();
    auto normalizationTimeConstant = functionConfig[L"normalizationTimeConstant"].GetValue<double>();
    auto blendTimeConstant = functionConfig[L"blendTimeConstant"].GetValue<double>();
    auto epsilon = functionConfig[L"epsilon"].GetValue<double>();
    auto useCuDNNEngine = functionConfig[L"useCuDNNEngine"].GetValue<bool>();
    std::vector<std::shared_ptr<ComputationNode<ElementType>>> inputNodes;
    for (auto inputVar : functionInputs)
    {
        auto baseNodePtr = GetNode(inputVar, network, builder, variableToNodeMap, isVariableRootMap);
        inputNodes.push_back((baseNodePtr != nullptr) ? baseNodePtr->template As<ComputationNode<ElementType>>()->shared_from_this() : nullptr);
    }

    computationNodePtr = builder.BatchNormalization(inputNodes[0], inputNodes[1], inputNodes[2], inputNodes[3], inputNodes[4], spacial, normalizationTimeConstant, blendTimeConstant, epsilon, !useCuDNNEngine, ImageLayoutKind::CHW, function->Name());
    break;
}
case PrimitiveOpType::Combine:
    // This operation is just a no-op and is a means to combine multiple functions to create a single Function
    // whose outputs are a union of the outputs of the Functions being combined.
@@ -351,7 +454,7 @@ namespace CNTK
auto outputShape = outputVar.Shape();
auto computationNodeSampleLayout = computationNodePtr->GetSampleLayout();
if (((outputShape.NumAxes() == 0) && (computationNodeSampleLayout[0] != 1)) ||
    ((outputShape.NumAxes() != 0) && (computationNodeSampleLayout != AsTensorShape(outputShape))))
    ((outputShape.NumAxes() != 0) && (computationNodeSampleLayout != AsTensorShape(outputShape)) && (computationNodeSampleLayout != AsTensorShape(outputShape, true))))
{
    LogicError("The output Variable shape %s does not match the SampleLayout shape %s of the corresponding ComputationNode in the network", AsString(outputShape).c_str(), ((std::string)computationNodeSampleLayout).c_str());
}

@@ -486,18 +589,9 @@ namespace CNTK
}

template <typename ElementType>
/*static*/ ValuePtr CompositeFunction::GetValueObjectFromCNTKImplMatrixAndMBLayout(Variable var, const Matrix<ElementType>& matrix, const MBLayoutPtr& layout)
/*static*/ ValuePtr CompositeFunction::GetValueObjectFromCNTKImplMatrixAndMBLayout(const NDShape& sampleShape, const Matrix<ElementType>& matrix, const MBLayoutPtr& layout, bool readOnly /*= true*/)
{
    if (var.DynamicAxes().size() > 1)
        LogicError("More than one dynamic axis for a variable is currently unsupported");

    if (AsDataType<ElementType>() != var.GetDataType())
        LogicError("The specified ElementType %s does not match the DataType %s", typeid(ElementType).name(), DataTypeName(var.GetDataType()));

    if ((layout != nullptr) && (matrix.GetNumRows() != var.Shape().TotalSize()))
        LogicError("Unexpected matrix layout: The number of rows in the matrix does not match the sample size of the Variable");

    NDShape valueDataShape = var.Shape();
    NDShape valueDataShape = sampleShape;
    if (layout != nullptr)
        valueDataShape = valueDataShape.AppendShape({ layout->GetNumTimeSteps(), layout->GetNumSequences() });

@ -506,7 +600,7 @@ namespace CNTK
|
|||
{
|
||||
// Just create a view over the existing matrix itself
|
||||
auto tensorView = new TensorView<ElementType>(std::make_shared<Matrix<ElementType>>(matrix.AsReference()), AsTensorShape(valueDataShape));
|
||||
auto data = MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), AsDeviceDescriptor(matrix.GetDeviceId()), AsStorageFormat(matrix.GetFormat()), valueDataShape, true, tensorView);
|
||||
auto data = MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), AsDeviceDescriptor(matrix.GetDeviceId()), AsStorageFormat(matrix.GetFormat()), valueDataShape, readOnly, tensorView);
|
||||
return MakeSharedObject<Value>(data);
|
||||
}
|
||||
|
||||
|
@ -565,10 +659,25 @@ namespace CNTK
}

auto tensorView = new TensorView<ElementType>(shuffledMatrixData, AsTensorShape(valueDataShape));
auto data = MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), AsDeviceDescriptor(matrix.GetDeviceId()), StorageFormat::Dense, valueDataShape, true, tensorView);
auto data = MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), AsDeviceDescriptor(matrix.GetDeviceId()), StorageFormat::Dense, valueDataShape, readOnly, tensorView);
return MakeSharedObject<Value>(data, mask);
}

template <typename ElementType>
/*static*/ ValuePtr CompositeFunction::GetValueObjectFromCNTKImplMatrixAndMBLayout(Variable var, const Matrix<ElementType>& matrix, const MBLayoutPtr& layout, bool readOnly /*= true*/)
{
if (var.DynamicAxes().size() > 1)
LogicError("More than one dynamic axis for a variable is currently unsupported");

if (AsDataType<ElementType>() != var.GetDataType())
LogicError("The specified ElementType %s does not match the DataType %s", typeid(ElementType).name(), DataTypeName(var.GetDataType()));

if ((layout != nullptr) && (matrix.GetNumRows() != var.Shape().TotalSize()))
LogicError("Unexpected matrix layout: The number of rows in the matrix does not match the sample size of the Variable");

return GetValueObjectFromCNTKImplMatrixAndMBLayout(var.Shape(), matrix, layout, readOnly);
}

template <typename ElementType>
/*static*/ void CompositeFunction::PopulateComputationNodeValue(const std::pair<Variable, ValuePtr>& variableValue, ComputationNodeBasePtr& computationNode)
{
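// [Editor's note] Aside on the shape rule above: the Value shape is the sample shape with the
// dynamic axes appended, e.g. a sample shape [640] with a layout of 20 time steps and 32
// sequences yields a Value of shape [640 x 20 x 32]; with no layout the Value keeps just the
// sample shape. (Sizes made up for illustration.)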
@ -583,7 +692,7 @@ namespace CNTK
computationNode->GetMBLayout()->CopyFrom(layout);
}

void CompositeFunction::PopulateNetworkInputs(const std::unordered_map<Variable, const ValuePtr>& arguments)
void CompositeFunction::PopulateNetworkInputs(const std::unordered_map<Variable, ValuePtr>& arguments)
{
auto functionArguments = this->Arguments();
std::vector<ComputationNodeBasePtr> inputNodes;
@ -628,7 +737,7 @@ namespace CNTK
}

// Assign the supplied gradients corresponding to the root(s) of the network to be backpropagated through the graph
void CompositeFunction::PopulateNetworkGradients(const std::unordered_map<Variable, const ValuePtr>& gradients)
void CompositeFunction::PopulateNetworkGradients(const std::unordered_map<Variable, ValuePtr>& gradients)
{
auto functionOutputs = this->Outputs();
for (auto gradientVarValuePair : gradients)
@ -676,45 +785,48 @@ namespace CNTK
return NDShape(outputShapeDims);
}

/*static*/ void CompositeFunction::GetNodeOutputOrGradient(Variable var, ValuePtr& varValue, Microsoft::MSR::CNTK::ComputationNodeBasePtr& computationNode, bool getGradient)
{
auto valueShape = GetValueShape(var, computationNode);
if (varValue != nullptr)
{
// TODO: The shape of the specified output Value object must match the actual output shape
if (varValue->Data()->Shape() != valueShape)
InvalidArgument("The shape %s of the specified Value object for %s does not match the actual shape %s", AsString(varValue->Data()->Shape()).c_str(), getGradient ? "gradient" : "output", AsString(valueShape).c_str());
}

ValuePtr nodeValue;
switch (var.GetDataType())
{
case DataType::Float:
nodeValue = GetValueObjectFromCNTKImplMatrixAndMBLayout<float>(var,
getGradient ? computationNode->As<ComputationNode<float>>()->Gradient() : computationNode->As<ComputationNode<float>>()->Value(),
computationNode->GetMBLayout());
break;
case DataType::Double:
nodeValue = GetValueObjectFromCNTKImplMatrixAndMBLayout<double>(var,
getGradient ? computationNode->As<ComputationNode<double>>()->Gradient() : computationNode->As<ComputationNode<double>>()->Value(),
computationNode->GetMBLayout());
break;
default:
LogicError("Unsupported DataType %s", DataTypeName(var.GetDataType()));
break;
}

if (varValue == nullptr)
{
auto data = MakeSharedObject<NDArrayView>(var.GetDataType(), valueShape, AsDeviceDescriptor(computationNode->ValuePtr()->GetDeviceId()));
auto mask = (nodeValue->Mask() != nullptr) ? MakeSharedObject<NDMask>(nodeValue->Mask()->Shape(), nodeValue->Mask()->Device()) : nullptr;
varValue = MakeSharedObject<Value>(data, mask);
}
varValue->CopyFrom(*nodeValue);
}

void CompositeFunction::GetNetworkOutputs(std::unordered_map<Variable, ValuePtr>& outputs)
{
// Now copy the Forward values of output nodes from the network to outputs' Value objects
for (auto outputVarValuePair : outputs)
{
auto computationNodePtr = m_variableToNodeMap[outputVarValuePair.first];
auto outputValuePtr = outputVarValuePair.second;

auto outputShape = GetValueShape(outputVarValuePair.first, computationNodePtr);
if (outputValuePtr != nullptr)
{
// TODO: The shape of the specified output Value object must match the actual output shape
if (outputValuePtr->Data()->Shape() != outputShape)
InvalidArgument("The shape %s of the specified Value object for output does not match the actual output shape %s", AsString(outputValuePtr->Data()->Shape()).c_str(), AsString(outputShape).c_str());
}

ValuePtr nodeValue;
switch (outputVarValuePair.first.GetDataType())
{
case DataType::Float:
nodeValue = GetValueObjectFromCNTKImplMatrixAndMBLayout<float>(outputVarValuePair.first, computationNodePtr->As<ComputationNode<float>>()->Value(), computationNodePtr->GetMBLayout());
break;
case DataType::Double:
nodeValue = GetValueObjectFromCNTKImplMatrixAndMBLayout<double>(outputVarValuePair.first, computationNodePtr->As<ComputationNode<double>>()->Value(), computationNodePtr->GetMBLayout());
break;
default:
LogicError("Unsupported DataType %s", DataTypeName(outputVarValuePair.first.GetDataType()));
break;
}

if (outputValuePtr == nullptr)
{
auto data = MakeSharedObject<NDArrayView>(outputVarValuePair.first.GetDataType(), outputShape, AsDeviceDescriptor(computationNodePtr->ValuePtr()->GetDeviceId()));
auto mask = (nodeValue->Mask() != nullptr) ? MakeSharedObject<NDMask>(nodeValue->Mask()->Shape(), nodeValue->Mask()->Device()) : nullptr;
outputValuePtr = MakeSharedObject<Value>(data, mask);
}
outputValuePtr->CopyFrom(*nodeValue);
outputs[outputVarValuePair.first] = outputValuePtr;
}
GetNodeOutputOrGradient(outputVarValuePair.first, outputs[outputVarValuePair.first], m_variableToNodeMap[outputVarValuePair.first], false /*getGradient*/);
}

void CompositeFunction::GetNetworkGradients(std::unordered_map<Variable, ValuePtr>& gradients)
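// [Editor's note] Sketch of the new helper's contract as used above (names hypothetical):
// ValuePtr out; // left null so GetNodeOutputOrGradient allocates on the node's device
// GetNodeOutputOrGradient(outputVar, out, node, /*getGradient=*/false);
// Passing a preallocated Value instead requires its shape to match, per the check in the helper.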
@ -732,46 +844,15 @@ namespace CNTK
InvalidArgument("Gradient value incorrectly requested for an Output or Constant Variable, or an Input Variable with NeedsGradient setting of false");

auto computationNodePtr = m_variableToNodeMap[gradientVarValuePair.first];
auto gradientValuePtr = gradientVarValuePair.second;

auto gradientShape = GetValueShape(gradientVarValuePair.first, computationNodePtr);
if (gradientValuePtr != nullptr)
{
// TODO: The shape of the specified output Value object must match the actual output shape
if (gradientValuePtr->Data()->Shape() != gradientShape)
InvalidArgument("The shape %s of the specified Value object for gradient does not match the actual gradient shape %s", AsString(gradientValuePtr->Data()->Shape()).c_str(), AsString(gradientShape).c_str());
}

if (!computationNodePtr->NeedsGradient())
LogicError("Backpropagated gradient value cannot be read from a ComputationNode that has NeedsGradient set to false");

ValuePtr nodeValue;
switch (gradientVarValuePair.first.GetDataType())
{
case DataType::Float:
nodeValue = GetValueObjectFromCNTKImplMatrixAndMBLayout<float>(gradientVarValuePair.first, computationNodePtr->As<ComputationNode<float>>()->Gradient(), computationNodePtr->GetMBLayout());
break;
case DataType::Double:
nodeValue = GetValueObjectFromCNTKImplMatrixAndMBLayout<double>(gradientVarValuePair.first, computationNodePtr->As<ComputationNode<double>>()->Gradient(), computationNodePtr->GetMBLayout());
break;
default:
LogicError("Unsupported DataType %s", DataTypeName(gradientVarValuePair.first.GetDataType()));
break;
}

if (gradientValuePtr == nullptr)
{
auto data = MakeSharedObject<NDArrayView>(gradientVarValuePair.first.GetDataType(), gradientShape, AsDeviceDescriptor(computationNodePtr->ValuePtr()->GetDeviceId()));
auto mask = (nodeValue->Mask() != nullptr) ? MakeSharedObject<NDMask>(nodeValue->Mask()->Shape(), nodeValue->Mask()->Device()) : nullptr;
gradientValuePtr = MakeSharedObject<Value>(data, mask);
}

gradientValuePtr->CopyFrom(*nodeValue);
gradients[gradientVarValuePair.first] = gradientValuePtr;
GetNodeOutputOrGradient(gradientVarValuePair.first, gradients[gradientVarValuePair.first], computationNodePtr, true /*getGradient*/);
}
}

/*virtual*/ BackPropStatePtr CompositeFunction::Forward(const std::unordered_map<Variable, const ValuePtr>& arguments,
/*virtual*/ BackPropStatePtr CompositeFunction::Forward(const std::unordered_map<Variable, ValuePtr>& arguments,
std::unordered_map<Variable, ValuePtr>& outputs,
const DeviceDescriptor& computeDevice,
const std::unordered_set<Variable>& outputsToRetainBackwardStateFor)
@ -809,6 +890,8 @@ namespace CNTK
outputsToEvaluate.push_back(m_variableToNodeMap[rootVarForBackprop]);
}

ScopedNetworkOperationMode modeGuard(m_computationNetwork, outputsToRetainBackwardStateFor.empty() ? NetworkOperationMode::inferring : NetworkOperationMode::training);

m_computationNetwork->ForwardProp(outputsToEvaluate);

GetNetworkOutputs(outputs);
@ -819,7 +902,7 @@ namespace CNTK
}

/*virtual*/ void CompositeFunction::Backward(const BackPropStatePtr& state,
const std::unordered_map<Variable, const ValuePtr>& rootGradientValues,
const std::unordered_map<Variable, ValuePtr>& rootGradientValues,
std::unordered_map<Variable, ValuePtr>& backPropagatedGradientValuesForInputs)
{
auto backpropState = dynamic_cast<const CNTKBackPropState*>(state.get());
@ -844,6 +927,8 @@ namespace CNTK
PopulateNetworkGradients(rootGradientValues);

// Backpropagate through the network
ScopedNetworkOperationMode modeGuard(m_computationNetwork, NetworkOperationMode::training);

auto rootComputationNodePtr = m_variableToNodeMap[rootGradientValues.begin()->first];
m_computationNetwork->GetNestedNetwork(rootComputationNodePtr)->Backprop(FrameRange(nullptr), true, true);
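// [Editor's note] Illustrative end-to-end call sequence (not from the commit; all names
// hypothetical):
// std::unordered_map<Variable, ValuePtr> args = { { inputVar, inputValue } };
// std::unordered_map<Variable, ValuePtr> outs = { { lossVar, nullptr } };
// auto state = net->Forward(args, outs, DeviceDescriptor::CPUDevice(), { lossVar });
// std::unordered_map<Variable, ValuePtr> rootGrads = { { lossVar, onesValue } };
// std::unordered_map<Variable, ValuePtr> grads = { { weightParam, nullptr } };
// net->Backward(state, rootGrads, grads);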
@ -852,27 +937,261 @@ namespace CNTK
// TODO: How to deal with the specified 'computeDevice'
}

FunctionPtr Times(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name/* = L""*/)
FunctionPtr UnaryOp(PrimitiveOpType op, const Variable& operand, Dictionary&& opConfig, const std::wstring& name)
{
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::Times, std::vector<Variable>({ leftOperand, rightOperand }), Dictionary(), name), name);
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(op, std::vector<Variable>({ operand }), std::move(opConfig), name), name);
}

FunctionPtr Plus(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name/* = L""*/)
FunctionPtr Negate(const Variable& operand, const std::wstring& name/* = L""*/)
{
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::Plus, std::vector<Variable>({ leftOperand, rightOperand }), Dictionary(), name), name);
return UnaryOp(PrimitiveOpType::Negate, operand, Dictionary(), name);
}

FunctionPtr Sigmoid(const Variable& operand, const std::wstring& name/* = L""*/)
{
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::Sigmoid, std::vector<Variable>({ operand }), Dictionary(), name), name);
return UnaryOp(PrimitiveOpType::Sigmoid, operand, Dictionary(), name);
}

FunctionPtr Tanh(const Variable& operand, const std::wstring& name/* = L""*/)
{
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::Tanh, std::vector<Variable>({ operand }), Dictionary(), name), name);
return UnaryOp(PrimitiveOpType::Tanh, operand, Dictionary(), name);
}

FunctionPtr Combine(const std::initializer_list<FunctionPtr>& operands, const std::wstring& name/* = L""*/)
FunctionPtr ReLU(const Variable& operand, const std::wstring& name/* = L""*/)
{
return UnaryOp(PrimitiveOpType::ReLU, operand, Dictionary(), name);
}

FunctionPtr Exp(const Variable& operand, const std::wstring& name/* = L""*/)
{
return UnaryOp(PrimitiveOpType::Exp, operand, Dictionary(), name);
}

FunctionPtr Log(const Variable& operand, const std::wstring& name/* = L""*/)
{
return UnaryOp(PrimitiveOpType::Log, operand, Dictionary(), name);
}

FunctionPtr Square(const Variable& operand, const std::wstring& name/* = L""*/)
{
return ElementTimes(operand, operand, name);
}

FunctionPtr Sqrt(const Variable& operand, const std::wstring& name/* = L""*/)
{
return UnaryOp(PrimitiveOpType::Sqrt, operand, Dictionary(), name);
}

FunctionPtr Round(const Variable& operand, const std::wstring& name/* = L""*/)
{
return Floor(Plus(operand, Constant(NDShape({}), 0.5f)), name);
}

FunctionPtr Floor(const Variable& operand, const std::wstring& name/* = L""*/)
{
return UnaryOp(PrimitiveOpType::Floor, operand, Dictionary(), name);
}

FunctionPtr Ceil(const Variable& operand, const std::wstring& name/* = L""*/)
{
return Negate(Floor(Negate(operand)), name);
}

FunctionPtr Abs(const Variable& operand, const std::wstring& name/* = L""*/)
{
return UnaryOp(PrimitiveOpType::Abs, operand, Dictionary(), name);
}

FunctionPtr Reciprocal(const Variable& operand, const std::wstring& name/* = L""*/)
{
return UnaryOp(PrimitiveOpType::Reciprocal, operand, Dictionary(), name);
}

FunctionPtr Softmax(const Variable& operand, const std::wstring& name/* = L""*/)
{
return UnaryOp(PrimitiveOpType::Softmax, operand, Dictionary(), name);
}

FunctionPtr BinaryOp(PrimitiveOpType op, const Variable& leftOperand, const Variable& rightOperand, Dictionary&& opConfig, const std::wstring& name)
{
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(op, std::vector<Variable>({ leftOperand, rightOperand }), std::move(opConfig), name), name);
}
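// [Editor's note] With UnaryOp/BinaryOp above, each operator below reduces to a single line that
// names the PrimitiveOpType and forwards any attributes. A toy composite built from them
// (hypothetical Variables W, x, b):
// FunctionPtr z = Tanh(Plus(Times(W, x), b));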

FunctionPtr Plus(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name/* = L""*/)
{
return BinaryOp(PrimitiveOpType::Plus, leftOperand, rightOperand, Dictionary(), name);
}

FunctionPtr Minus(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name/* = L""*/)
{
return BinaryOp(PrimitiveOpType::Minus, leftOperand, rightOperand, Dictionary(), name);
}

FunctionPtr ElementTimes(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name/* = L""*/)
{
return BinaryOp(PrimitiveOpType::ElementTimes, leftOperand, rightOperand, Dictionary(), name);
}

FunctionPtr ElementDivide(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name/* = L""*/)
{
return ElementTimes(leftOperand, Reciprocal(rightOperand), name);
}

FunctionPtr Equal(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name/* = L""*/)
{
return BinaryOp(PrimitiveOpType::Equal, leftOperand, rightOperand, Dictionary(), name);
}

FunctionPtr NotEqual(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name/* = L""*/)
{
return BinaryOp(PrimitiveOpType::NotEqual, leftOperand, rightOperand, Dictionary(), name);
}

FunctionPtr Less(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name/* = L""*/)
{
return BinaryOp(PrimitiveOpType::Less, leftOperand, rightOperand, Dictionary(), name);
}

FunctionPtr LessEqual(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name/* = L""*/)
{
return BinaryOp(PrimitiveOpType::LessEqual, leftOperand, rightOperand, Dictionary(), name);
}

FunctionPtr Greater(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name/* = L""*/)
{
return BinaryOp(PrimitiveOpType::Greater, leftOperand, rightOperand, Dictionary(), name);
}

FunctionPtr GreaterEqual(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name/* = L""*/)
{
return BinaryOp(PrimitiveOpType::GreaterEqual, leftOperand, rightOperand, Dictionary(), name);
}

FunctionPtr Times(const Variable& leftOperand, const Variable& rightOperand, size_t numOutputAxes /*= 1*/, const std::wstring& name/* = L""*/)
{
auto additionalProperties = Dictionary();
additionalProperties[L"numOutputAxes"] = numOutputAxes;
return BinaryOp(PrimitiveOpType::Times, leftOperand, rightOperand, std::move(additionalProperties), name);
}
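// [Editor's note] Worked example for the new 'numOutputAxes' parameter (shape rule per
// TimesOpOutputShape later in this diff; sizes made up): with W of shape [10 x 784] and x of
// shape [784], Times(W, x) uses the default numOutputAxes = 1 and yields shape [10]; with
// numOutputAxes = 2, a left operand of shape [4 x 3 x 784] against the same x would yield
// shape [4 x 3].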

FunctionPtr SquaredError(const Variable& prediction, const Variable& targets, const std::wstring& name/* = L""*/)
{
return BinaryOp(PrimitiveOpType::SquaredError, prediction, targets, Dictionary(), name);
}

FunctionPtr CrossEntropyWithSoftmax(const Variable& prediction, const Variable& labels, const std::wstring& name/* = L""*/)
{
return BinaryOp(PrimitiveOpType::CrossEntropyWithSoftmax, prediction, labels, Dictionary(), name);
}

FunctionPtr ClassificationError(const Variable& prediction, const Variable& labels, const std::wstring& name/* = L""*/)
{
return BinaryOp(PrimitiveOpType::ClassificationError, prediction, labels, Dictionary(), name);
}

FunctionPtr PastValue(const Variable& initialState, const Variable& operand, size_t stepSize, const std::wstring& name/* = L""*/)
{
if (operand.DynamicAxes().size() != 1)
InvalidArgument("PastValue overload that does not explicitly specify a dynamic axis can only be used for operands with exactly one dynamic axis");

auto additionalProperties = Dictionary();
additionalProperties[L"stepSize"] = DictionaryValue(stepSize);
return BinaryOp(PrimitiveOpType::PastValue, initialState, operand, std::move(additionalProperties), name);
}

FunctionPtr FutureValue(const Variable& initialState, const Variable& operand, size_t stepSize, const std::wstring& name/* = L""*/)
{
if (operand.DynamicAxes().size() != 1)
InvalidArgument("FutureValue overload that does not explicitly specify a dynamic axis can only be used for operands with exactly one dynamic axis");

auto additionalProperties = Dictionary();
additionalProperties[L"stepSize"] = DictionaryValue(stepSize);
return BinaryOp(PrimitiveOpType::FutureValue, initialState, operand, std::move(additionalProperties), name);
}
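// [Editor's note] Usage sketch (names hypothetical): PastValue/FutureValue delay a sequence by
// 'stepSize' steps along its single dynamic axis, substituting the initial state at the
// sequence boundary:
// FunctionPtr prev = PastValue(initialState, seq, /*stepSize=*/1);
// FunctionPtr next = FutureValue(initialState, seq, /*stepSize=*/1);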

FunctionPtr ReduceSum(const Variable& operand, const std::wstring& name/* = L""*/)
{
return UnaryOp(PrimitiveOpType::ReduceSum, operand, Dictionary(), name);
}

FunctionPtr PerDimMeanVarianceNormalize(const Variable& operand, const NDArrayViewPtr& mean, const NDArrayViewPtr& invStdDev, const std::wstring& name /*= L""*/)
{
Constant meanVar(mean);
Constant invStdDevVar(invStdDev);

return ElementTimes(Minus(operand, meanVar), invStdDevVar);
}

FunctionPtr Convolution(const Variable& convolutionMap,
const Variable& operand,
const NDShape& strides,
const std::vector<bool>& sharing,
const std::vector<bool>& autoPadding,
const NDShape& lowerPad,
const NDShape& upperPad,
bool transpose,
size_t maxTempMemSizeInSamples,
const std::wstring& name)
{
auto additionalProperties = Dictionary();
additionalProperties[L"strides"] = strides;
additionalProperties[L"sharing"] = AsDictionaryValueVector(sharing);
additionalProperties[L"autoPadding"] = AsDictionaryValueVector(autoPadding);
additionalProperties[L"lowerPad"] = lowerPad;
additionalProperties[L"upperPad"] = upperPad;
additionalProperties[L"transpose"] = transpose;
additionalProperties[L"maxTempMemSizeInSamples"] = maxTempMemSizeInSamples;

return BinaryOp(PrimitiveOpType::Convolution, convolutionMap, operand, std::move(additionalProperties), name);
}

FunctionPtr Pooling(const Variable& operand,
PoolingType poolingType,
const NDShape& poolingWindowShape,
const NDShape& strides,
const std::vector<bool>& autoPadding,
const NDShape& lowerPad,
const NDShape& upperPad,
const std::wstring& name)
{
auto additionalProperties = Dictionary();
additionalProperties[L"poolingType"] = (size_t)poolingType;
additionalProperties[L"poolingWindowShape"] = poolingWindowShape;
additionalProperties[L"strides"] = strides;
additionalProperties[L"autoPadding"] = AsDictionaryValueVector(autoPadding);
additionalProperties[L"lowerPad"] = lowerPad;
additionalProperties[L"upperPad"] = upperPad;

return UnaryOp(PrimitiveOpType::Pooling, operand, std::move(additionalProperties), name);
}
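// [Editor's note] Illustrative call (hypothetical sizes; PoolingType::Max is assumed to be one
// of the enum's values, which the diff does not show): 2x2 max pooling with stride 2 and no
// explicit padding:
// FunctionPtr pooled = Pooling(conv, PoolingType::Max, NDShape({ 2, 2 }), NDShape({ 2, 2 }),
//     { true }, NDShape({ 0, 0 }), NDShape({ 0, 0 }), L"pool1");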

FunctionPtr BatchNormalization(const Variable& operand,
const Variable& scale,
const Variable& bias,
const Variable& runningMean,
const Variable& runningInvStd,
bool spacial,
double normalizationTimeConstant,
double blendTimeConstant,
double epsilon,
bool useCuDNNEngine,
const std::wstring& name)
{
auto additionalProperties = Dictionary();
additionalProperties[L"spacial"] = spacial;
additionalProperties[L"normalizationTimeConstant"] = normalizationTimeConstant;
additionalProperties[L"blendTimeConstant"] = blendTimeConstant;
additionalProperties[L"epsilon"] = epsilon;
additionalProperties[L"useCuDNNEngine"] = useCuDNNEngine;

return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::BatchNormalization,
std::vector<Variable>({ operand, scale, bias, runningMean, runningInvStd }),
std::move(additionalProperties),
name),
name);
}

FunctionPtr Combine(const std::vector<FunctionPtr>& operands, const std::wstring& name/* = L""*/)
{
std::unordered_set<FunctionPtr> uniqueOperands;
std::vector<Variable> inputs;
@ -888,49 +1207,4 @@ namespace CNTK

return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::Combine, inputs, Dictionary(), name), name);
}

FunctionPtr CrossEntropyWithSoftmax(const Variable& output, const Variable& labels, const std::wstring& name/* = L""*/)
{
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::CrossEntropyWithSoftmax, std::vector<Variable>({ output, labels }), Dictionary(), name), name);
}

FunctionPtr ClassificationError(const Variable& prediction, const Variable& labels, const std::wstring& name/* = L""*/)
{
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::ClassificationError, std::vector<Variable>({ prediction, labels }), Dictionary(), name), name);
}

FunctionPtr Exp(const Variable& operand, const std::wstring& name/* = L""*/)
{
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::Exp, std::vector<Variable>({ operand }), Dictionary(), name), name);
}

FunctionPtr PastValue(const Variable& initialState, const Variable& operand, size_t stepSize, const std::wstring& name/* = L""*/)
{
if (operand.DynamicAxes().size() != 1)
InvalidArgument("PastValue overload that does not explicitly specify a dynamic axis can only be used for operands with exactly one dynamic axis");

auto additionalProperties = Dictionary();
additionalProperties[L"stepSize"] = DictionaryValue(stepSize);
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::PastValue, std::vector<Variable>({ initialState, operand }), std::move(additionalProperties), name), name);
}

FunctionPtr FutureValue(const Variable& initialState, const Variable& operand, size_t stepSize, const std::wstring& name/* = L""*/)
{
if (operand.DynamicAxes().size() != 1)
InvalidArgument("FutureValue overload that does not explicitly specify a dynamic axis can only be used for operands with exactly one dynamic axis");

auto additionalProperties = Dictionary();
additionalProperties[L"stepSize"] = DictionaryValue(stepSize);
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::FutureValue, std::vector<Variable>({ initialState, operand }), std::move(additionalProperties), name), name);
}

FunctionPtr ElementTimes(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name/* = L""*/)
{
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::ElementTimes, std::vector<Variable>({ leftOperand, rightOperand }), Dictionary(), name), name);
}

FunctionPtr ReduceSum(const Variable& operand, const std::wstring& name/* = L""*/)
{
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::ReduceSum, std::vector<Variable>({ operand }), Dictionary(), name), name);
}
}
@ -10,65 +10,110 @@
#include <iterator>
#include "ComputationNetwork.h"
#include "Utils.h"
#include "ConvolveGeometry.h"

namespace CNTK
{
enum class PrimitiveOpType
enum class PrimitiveOpType : unsigned int
{
Plus,
Times,
Negate,
Sigmoid,
Tanh,
Combine,
ReLU,
Exp,
Log,
Sqrt,
Floor,
Abs,
Reciprocal,
Softmax,
Pooling,
Plus,
Minus,
ElementTimes,
Equal,
NotEqual,
Less,
LessEqual,
Greater,
GreaterEqual,
Times,
Convolution,
SquaredError,
CrossEntropyWithSoftmax,
ClassificationError,
Exp,
PastValue,
FutureValue,
ElementTimes,
ReduceSum
ReduceSum,
BatchNormalization,
Combine,
};
}

namespace std
{
template <> struct hash<CNTK::PrimitiveOpType>
{
size_t operator()(const CNTK::PrimitiveOpType& x) const
{
return std::hash<unsigned int>()((unsigned int)x);
}
};
}
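// [Editor's note] Background, not from the commit: this specialization is what allows
// PrimitiveOpType to key the std::unordered_map used by PrimitiveOpTypeName() below; standard
// libraries of this era did not provide a default hash for enum classes. Self-contained
// equivalent of the pattern (hypothetical names):
// #include <unordered_map>
// enum class Op : unsigned int { Plus, Times };
// namespace std { template <> struct hash<Op> {
//     size_t operator()(const Op& x) const { return hash<unsigned int>()((unsigned int)x); }
// }; }
// std::unordered_map<Op, const char*> names = { { Op::Plus, "Plus" } };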

namespace CNTK
{
inline const char* PrimitiveOpTypeName(PrimitiveOpType opType)
{
// TODO: Put these in table form
if (opType == PrimitiveOpType::Plus)
return "Plus";
else if (opType == PrimitiveOpType::Times)
return "Times";
else if (opType == PrimitiveOpType::Sigmoid)
return "Sigmoid";
else if (opType == PrimitiveOpType::Tanh)
return "Tanh";
else if (opType == PrimitiveOpType::Combine)
return "Combine";
else if (opType == PrimitiveOpType::CrossEntropyWithSoftmax)
return "CrossEntropyWithSoftmax";
else if (opType == PrimitiveOpType::ClassificationError)
return "ClassificationError";
else if (opType == PrimitiveOpType::Exp)
return "Exp";
else if (opType == PrimitiveOpType::PastValue)
return "PastValue";
else if (opType == PrimitiveOpType::FutureValue)
return "FutureValue";
else if (opType == PrimitiveOpType::ElementTimes)
return "ElementTimes";
else if (opType == PrimitiveOpType::ReduceSum)
return "ReduceSum";
else
static std::unordered_map<PrimitiveOpType, const char*> primitiveOpNames = {
{ PrimitiveOpType::Negate, "Negate" },
{ PrimitiveOpType::Sigmoid, "Sigmoid" },
{ PrimitiveOpType::Tanh, "Tanh" },
{ PrimitiveOpType::ReLU, "ReLU" },
{ PrimitiveOpType::Exp, "Exp" },
{ PrimitiveOpType::Log, "Log" },
{ PrimitiveOpType::Sqrt, "Sqrt" },
{ PrimitiveOpType::Floor, "Floor" },
{ PrimitiveOpType::Abs, "Abs" },
{ PrimitiveOpType::Reciprocal, "Reciprocal" },
{ PrimitiveOpType::Softmax, "Softmax" },
{ PrimitiveOpType::Pooling, "Pooling" },
{ PrimitiveOpType::Plus, "Plus" },
{ PrimitiveOpType::Minus, "Minus" },
{ PrimitiveOpType::ElementTimes, "ElementTimes" },
{ PrimitiveOpType::Equal, "Equal" },
{ PrimitiveOpType::NotEqual, "NotEqual" },
{ PrimitiveOpType::Less, "Less" },
{ PrimitiveOpType::LessEqual, "LessEqual" },
{ PrimitiveOpType::Greater, "Greater" },
{ PrimitiveOpType::GreaterEqual, "GreaterEqual" },
{ PrimitiveOpType::Times, "Times" },
{ PrimitiveOpType::Convolution, "Convolution" },
{ PrimitiveOpType::SquaredError, "SquaredError" },
{ PrimitiveOpType::CrossEntropyWithSoftmax, "CrossEntropyWithSoftmax" },
{ PrimitiveOpType::ClassificationError, "ClassificationError" },
{ PrimitiveOpType::PastValue, "PastValue" },
{ PrimitiveOpType::FutureValue, "FutureValue" },
{ PrimitiveOpType::ReduceSum, "ReduceSum" },
{ PrimitiveOpType::BatchNormalization, "BatchNormalization" },
{ PrimitiveOpType::Combine, "Combine" }
};

if (primitiveOpNames.find(opType) == primitiveOpNames.end())
LogicError("Unknown PrimitiveOpType");

return primitiveOpNames.find(opType)->second;
}
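// [Editor's note] Possible micro-cleanup, not in the commit: the map is probed twice above; a
// single find would do:
// auto it = primitiveOpNames.find(opType);
// if (it == primitiveOpNames.end())
//     LogicError("Unknown PrimitiveOpType");
// return it->second;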

class PrimitiveFunction final : public Function
{
public:
PrimitiveFunction(PrimitiveOpType op, const std::vector<Variable>& inputs, Dictionary&& functionConfig, const std::wstring& functionName = L"")
: Function(inputs, GetOutputVariables(op, inputs, this), nullptr, functionName), m_op(op), m_functionConfig(std::move(functionConfig))
: Function(inputs, GetOutputVariables(op, inputs, this, functionConfig), nullptr, functionName), m_op(op), m_functionConfig(std::move(functionConfig))
{
}

virtual BackPropStatePtr Forward(const std::unordered_map<Variable, const ValuePtr>& /*arguments*/,
virtual BackPropStatePtr Forward(const std::unordered_map<Variable, ValuePtr>& /*arguments*/,
std::unordered_map<Variable, ValuePtr>& /*outputs*/,
const DeviceDescriptor& /*computeDevice*/,
const std::unordered_set<Variable>& /*outputsToRetainBackwardStateFor*/) override
@ -77,7 +122,7 @@ namespace CNTK
}

virtual void Backward(const BackPropStatePtr& /*state*/,
const std::unordered_map<Variable, const ValuePtr>& /*rootGradientValues*/,
const std::unordered_map<Variable, ValuePtr>& /*rootGradientValues*/,
std::unordered_map<Variable, ValuePtr>& /*backPropagatedGradientValuesForInputs*/) override
{
NOT_IMPLEMENTED;
@ -131,25 +176,28 @@ namespace CNTK
return NDShape(std::move(outputDims));
}

static NDShape TimesOpOutputShape(const NDShape& leftOperandShape, const NDShape& rightOperandShape)
static NDShape TimesOpOutputShape(const NDShape& leftOperandShape, const NDShape& rightOperandShape, size_t numOutputAxes)
{
if (rightOperandShape.NumAxes() > 2)
RuntimeError("The right operand of a times operation can have at most 2 axes");
if (numOutputAxes == 0)
InvalidArgument("Output #axes of times operation should be at least one");

size_t numOutputAxes = rightOperandShape.NumAxes();
if (numOutputAxes > leftOperandShape.NumAxes())
InvalidArgument("Output #axes of times operation can at most be the #axes of the left operand");

if (leftOperandShape.NumAxes() != 2)
RuntimeError("The left operand of a times operation must have 2 axes");
size_t numReductionAxes = leftOperandShape.NumAxes() - numOutputAxes;

std::vector<size_t> outputDims(numOutputAxes);
outputDims[0] = leftOperandShape[0];
if (numOutputAxes > 1)
outputDims[1] = rightOperandShape[1];
// The 'numReductionAxes' trailing dimensions of the left operand's shape must match the corresponding leading
// dimensions of the right operand

if (leftOperandShape[1] != rightOperandShape[0])
RuntimeError("Left operand's shape %s is not compatible with right operand's shape %s for the times operation", AsString(leftOperandShape).c_str(), AsString(rightOperandShape).c_str());
if (rightOperandShape.NumAxes() != numReductionAxes)
RuntimeError("The right operand's #axes in a times operation should equal #axes being reduced over!");

return NDShape(std::move(outputDims));
if (leftOperandShape.SubShape(numOutputAxes) != rightOperandShape)
InvalidArgument("The trailing dimensions of the left operand (%s) do not match the right operand's dimensions (%s)",
AsString(leftOperandShape.SubShape(numOutputAxes)).c_str(),
AsString(rightOperandShape).c_str());

return leftOperandShape.SubShape(0, numOutputAxes);
}
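// [Editor's note] Worked example of the rule above (sizes made up): left = [32 x 5 x 20],
// right = [5 x 20], numOutputAxes = 1 => numReductionAxes = 2; the trailing SubShape [5 x 20]
// of the left operand matches the right operand, so the result is SubShape(0, 1) = [32]. A
// right operand of [5] or [20 x 5] would trip the checks above.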

static NDShape ReductionOpOutputShape(PrimitiveOpType op, const NDShape& operandShape, const std::vector<size_t>& reductionAxes)
@ -171,8 +219,22 @@ namespace CNTK
return NDShape(std::move(outputDims));
}

static NDShape ConvolutionOpOutputShape(const NDShape& operandShape, const NDShape& kernelShape, const NDShape& outputMapCount, const NDShape& strides,
const std::vector<bool>& sharing,
std::vector<bool>& autoPad, const NDShape& lowerPad, const NDShape& upperPad,
bool transpose)
{
decltype(&Microsoft::MSR::CNTK::ConvolveGeometry::ComputeOutputShape) computeOutputShapeFunc;
if (!transpose)
computeOutputShapeFunc = &Microsoft::MSR::CNTK::ConvolveGeometry::ComputeOutputShape;
else
computeOutputShapeFunc = &Microsoft::MSR::CNTK::ConvolveGeometry::ComputeInputShape;

return AsNDShape(computeOutputShapeFunc(AsTensorShape(operandShape, true), AsTensorShape(kernelShape, true), AsTensorShape(outputMapCount, true), AsTensorShape(strides, true), sharing, autoPad, AsTensorShape(lowerPad, true), AsTensorShape(upperPad, true)));
}
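// [Editor's note] Reading note: the function-pointer switch above lets transposed convolution
// ("deconvolution") reuse the same geometry code; with transpose=true the *input* shape of the
// corresponding forward convolution is computed, which is exactly the output shape of the
// transposed one.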

// TODO: Reconcile this with the ComputationNode::Validate functionality in core CNTK to avoid duplication of inference logic
static std::vector<Variable> GetOutputVariables(PrimitiveOpType op, const std::vector<Variable>& inputs, Function* owner)
static std::vector<Variable> GetOutputVariables(PrimitiveOpType op, const std::vector<Variable>& inputs, Function* owner, const Dictionary& functionConfig)
{
std::vector<Variable> outputs;

@ -195,32 +257,79 @@ namespace CNTK

switch (op)
{
case PrimitiveOpType::Negate:
case PrimitiveOpType::Sigmoid:
case PrimitiveOpType::Tanh:
case PrimitiveOpType::ReLU:
case PrimitiveOpType::Exp:
case PrimitiveOpType::Log:
case PrimitiveOpType::Sqrt:
case PrimitiveOpType::Floor:
case PrimitiveOpType::Abs:
case PrimitiveOpType::Reciprocal:
case PrimitiveOpType::Softmax:
assert(inputs.size() == 1);
outputs.push_back(Variable(UnaryElementwiseOpOutputShape(inputs[0].Shape()), outputDataType, owner, outputDynamicAxes));
break;
case PrimitiveOpType::PastValue:
case PrimitiveOpType::FutureValue:
assert(inputs.size() == 2);
outputs.push_back(Variable(UnaryElementwiseOpOutputShape(inputs[1].Shape()), outputDataType, owner, outputDynamicAxes));
case PrimitiveOpType::Pooling:
{
assert(inputs.size() == 1);
auto poolingWindowsShape = functionConfig[L"poolingWindowShape"].GetValue<NDShape>();
auto strides = functionConfig[L"strides"].GetValue<NDShape>();
auto lowerPad = functionConfig[L"lowerPad"].GetValue<NDShape>();
auto upperPad = functionConfig[L"upperPad"].GetValue<NDShape>();
auto autoPadding = AsBasicElementTypeVector<bool>(functionConfig[L"autoPadding"].GetValue<std::vector<DictionaryValue>>());
outputs.push_back(Variable(ConvolutionOpOutputShape(inputs[0].Shape(), poolingWindowsShape, { 1 }, strides, { true }, autoPadding, lowerPad, upperPad, false), outputDataType, owner, outputDynamicAxes));
break;
}
case PrimitiveOpType::Plus:
case PrimitiveOpType::Minus:
case PrimitiveOpType::ElementTimes:
case PrimitiveOpType::Equal:
case PrimitiveOpType::NotEqual:
case PrimitiveOpType::Less:
case PrimitiveOpType::LessEqual:
case PrimitiveOpType::Greater:
case PrimitiveOpType::GreaterEqual:
assert(inputs.size() == 2);
outputs.push_back(Variable(BinaryElementwiseOpOutputShape(op, inputs[0].Shape(), inputs[1].Shape()), outputDataType, owner, outputDynamicAxes));
break;
case PrimitiveOpType::Times:
{
assert(inputs.size() == 2);
outputs.push_back(Variable(TimesOpOutputShape(inputs[0].Shape(), inputs[1].Shape()), outputDataType, owner, outputDynamicAxes));

// TODO: Support dynamic axes on the left operand
if (!inputs[0].DynamicAxes().empty())
LogicError("Dynamic axes are currently unsupported for left operand of a Times operation");

size_t numOutputAxes = functionConfig[L"numOutputAxes"].GetValue<size_t>();
outputs.push_back(Variable(TimesOpOutputShape(inputs[0].Shape(), inputs[1].Shape(), numOutputAxes), outputDataType, owner, outputDynamicAxes));
break;
}
case PrimitiveOpType::Convolution:
{
assert(inputs.size() == 2);
auto strides = functionConfig[L"strides"].GetValue<NDShape>();
auto lowerPad = functionConfig[L"lowerPad"].GetValue<NDShape>();
auto upperPad = functionConfig[L"upperPad"].GetValue<NDShape>();
auto sharing = AsBasicElementTypeVector<bool>(functionConfig[L"sharing"].GetValue<std::vector<DictionaryValue>>());
auto autoPadding = AsBasicElementTypeVector<bool>(functionConfig[L"autoPadding"].GetValue<std::vector<DictionaryValue>>());
bool transpose = functionConfig[L"transpose"].GetValue<bool>();
if (inputs[0].Shape().NumAxes() < inputs[1].Shape().NumAxes())
InvalidArgument("The convolution map should have at least as many axes as the shape of the input it operates on!");

NDShape outputMapCount, kernelShape;
std::tie(outputMapCount, kernelShape) = GetConvolutionOutputMapCountAndKernelShape(inputs[0].Shape(), inputs[1].Shape());
outputs.push_back(Variable(ConvolutionOpOutputShape(inputs[1].Shape(), kernelShape, outputMapCount, strides, sharing, autoPadding, lowerPad, upperPad, transpose), outputDataType, owner, outputDynamicAxes));
break;
}
case PrimitiveOpType::SquaredError:
case PrimitiveOpType::CrossEntropyWithSoftmax:
case PrimitiveOpType::ClassificationError:
{
assert(inputs.size() == 2);

if (inputs[0].Shape().NumAxes() > 1)
if ((inputs[0].Shape().NumAxes() > 2) || ((inputs[0].Shape().NumAxes() > 1) && (inputs[0].Shape()[1] != 1)))
InvalidArgument("The shape of input operands for the %s operation should have at most one axis", PrimitiveOpTypeName(op));

auto predictionShape = inputs[0].Shape();
@ -235,6 +344,11 @@ namespace CNTK
outputs.push_back(Variable(ReductionOpOutputShape(op, predictionShape, reductionAxes), outputDataType, owner, {}));
break;
}
case PrimitiveOpType::PastValue:
case PrimitiveOpType::FutureValue:
assert(inputs.size() == 2);
outputs.push_back(Variable(UnaryElementwiseOpOutputShape(inputs[1].Shape()), outputDataType, owner, outputDynamicAxes));
break;
case PrimitiveOpType::ReduceSum:
{
assert(inputs.size() == 1);
@ -249,6 +363,9 @@ namespace CNTK
outputs.push_back(Variable(ReductionOpOutputShape(op, inputs[0].Shape(), reductionAxes), outputDataType, owner, reductionOutputDynamicAxes));
break;
}
case PrimitiveOpType::BatchNormalization:
outputs.push_back(Variable(UnaryElementwiseOpOutputShape(inputs[0].Shape()), outputDataType, owner, outputDynamicAxes));
break;
case PrimitiveOpType::Combine:
outputs = inputs;
break;
@ -288,10 +405,18 @@ namespace CNTK
class CompositeFunction final : public Function
{
friend class Function;
friend class CompositeMinibatchSource;

template <typename T, typename ...CtorArgTypes>
friend inline std::shared_ptr<T> MakeSharedObject(CtorArgTypes&& ...ctorArgs);

template <typename ElementType>
friend void SaveAsLegacyModel(const FunctionPtr& rootFunction, const std::wstring& modelFile);

friend void ComputeInputPerDimMeansAndInvStdDevs(const MinibatchSourcePtr& minibatchSource,
std::unordered_map<StreamInfo, std::pair<NDArrayViewPtr, NDArrayViewPtr>>& computedMeanAndInvStdDevs,
const DeviceDescriptor& device /*= DeviceDescriptor::CPUDevice()*/);

public:
static CompositeFunctionPtr Create(const FunctionPtr& rootFunction, const std::wstring& name = L"")
{
@ -303,13 +428,13 @@ namespace CNTK
return MakeSharedObject<CompositeFunction>(rootFunction, std::move(visitedFunctions), name);
}

virtual BackPropStatePtr Forward(const std::unordered_map<Variable, const ValuePtr>& arguments,
virtual BackPropStatePtr Forward(const std::unordered_map<Variable, ValuePtr>& arguments,
std::unordered_map<Variable, ValuePtr>& outputs,
const DeviceDescriptor& computeDevice,
const std::unordered_set<Variable>& outputsToRetainBackwardStateFor) override;

virtual void Backward(const BackPropStatePtr& state,
const std::unordered_map<Variable, const ValuePtr>& rootGradientValues,
const std::unordered_map<Variable, ValuePtr>& rootGradientValues,
std::unordered_map<Variable, ValuePtr>& backPropagatedGradientValuesForInputs) override;

private:
@ -361,12 +486,13 @@ namespace CNTK

template <typename ElementType>
static void PopulateComputationNodeValue(const std::pair<Variable, ValuePtr>& variableValue, Microsoft::MSR::CNTK::ComputationNodeBasePtr& computationNode);
void PopulateNetworkInputs(const std::unordered_map<Variable, const ValuePtr>& arguments);
void PopulateNetworkInputs(const std::unordered_map<Variable, ValuePtr>& arguments);

template <typename ElementType>
static void PopulateComputationNodeGradient(const std::pair<Variable, ValuePtr>& variableGradient, Microsoft::MSR::CNTK::ComputationNodeBasePtr& computationNode);
void PopulateNetworkGradients(const std::unordered_map<Variable, const ValuePtr>& gradients);
void PopulateNetworkGradients(const std::unordered_map<Variable, ValuePtr>& gradients);

static void GetNodeOutputOrGradient(Variable var, ValuePtr& varValue, Microsoft::MSR::CNTK::ComputationNodeBasePtr& computationNode, bool getGradient);
void GetNetworkOutputs(std::unordered_map<Variable, ValuePtr>& outputs);
void GetNetworkGradients(std::unordered_map<Variable, ValuePtr>& gradients);

@ -374,7 +500,9 @@ namespace CNTK
static std::pair<std::shared_ptr<const Microsoft::MSR::CNTK::Matrix<ElementType>>, Microsoft::MSR::CNTK::MBLayoutPtr> GetCNTKImplMatrixAndMBLayoutFromValueObject(Variable var, const ValuePtr& value);

template <typename ElementType>
static ValuePtr GetValueObjectFromCNTKImplMatrixAndMBLayout(Variable var, const Microsoft::MSR::CNTK::Matrix<ElementType>& matrix, const Microsoft::MSR::CNTK::MBLayoutPtr& layout);
static ValuePtr GetValueObjectFromCNTKImplMatrixAndMBLayout(const NDShape& sampleShape, const Microsoft::MSR::CNTK::Matrix<ElementType>& matrix, const Microsoft::MSR::CNTK::MBLayoutPtr& layout, bool readOnly = true);
template <typename ElementType>
static ValuePtr GetValueObjectFromCNTKImplMatrixAndMBLayout(Variable var, const Microsoft::MSR::CNTK::Matrix<ElementType>& matrix, const Microsoft::MSR::CNTK::MBLayoutPtr& layout, bool readOnly = true);

private:
@ -0,0 +1,451 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//

#include "Learner.h"
#include "TensorView.h"
#include "Utils.h"

#define UPDATE_FUNCTION \
switch (smoothedGradientValue->GetDataType()) \
{ \
case DataType::Float: \
Update<float>(parameter, gradientValue, smoothedGradientValue, trainingSampleCount); \
break; \
case DataType::Double: \
Update<double>(parameter, gradientValue, smoothedGradientValue, trainingSampleCount); \
break; \
default: \
NOT_IMPLEMENTED; \
}
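// [Editor's note] The macro is a runtime dispatch from DataType to the templated
// Update<ElementType>. A self-contained, std-only illustration of the same pattern
// (hypothetical names):
// enum class DType { Float, Double };
// template <typename T> void UpdateImpl() { /* element-type-specific work */ }
// void Dispatch(DType t)
// {
//     switch (t)
//     {
//     case DType::Float:  UpdateImpl<float>();  break;
//     case DType::Double: UpdateImpl<double>(); break;
//     }
// }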


using namespace Microsoft::MSR::CNTK;
using namespace std;

namespace CNTK
{
template <typename ElementType>
/*static*/ shared_ptr<const Matrix<ElementType>> LearnerBase::GetMatrix(const NDArrayViewPtr& arrayView)
{
return arrayView->GetMatrix<ElementType>();
}

template <typename ElementType>
/*static*/ shared_ptr<Matrix<ElementType>> LearnerBase::GetWritableMatrix(const NDArrayViewPtr& arrayView)
{
return arrayView->GetWritableMatrix<ElementType>();
}

template <typename ElementType>
/*static*/ const TensorView<ElementType>* LearnerBase::GetTensorView(const NDArrayViewPtr& arrayView)
{
return arrayView->GetTensorView<ElementType>();
}

/*static*/ bool LearnerBase::HasNan(const NDArrayViewPtr& value, const char* name)
{
switch (value->GetDataType())
{
case DataType::Float:
return value->GetMatrix<float>()->HasNan(name);
case DataType::Double:
return value->GetMatrix<double>()->HasNan(name);
default:
LogicError("Unsupported DataType %s", DataTypeName(value->GetDataType()));
}
}

/*static*/ void LearnerBase::Print(const NDArrayViewPtr& value, const char* msg)
{
switch (value->GetDataType())
{
case DataType::Float:
value->GetMatrix<float>()->Print(msg);
break;
case DataType::Double:
value->GetMatrix<double>()->Print(msg);
break;
default:
LogicError("Unsupported DataType %s", DataTypeName(value->GetDataType()));
}
}

// Clipping gradients to prevent outliers,
template <typename ElementType>
void LearnerBase::ClipGradient(Matrix<ElementType>& gradient, size_t actualMBSize) const
{
if (m_additionalOptions.gradientClippingThresholdPerSample != numeric_limits<double>::infinity())
{
double maxGradientPerMB = m_additionalOptions.gradientClippingThresholdPerSample * actualMBSize;
if (m_additionalOptions.gradientClippingWithTruncation)
gradient.InplaceTruncate(ElementType(maxGradientPerMB));
else
{
// norm2 normalized
double gradientNorm = gradient.FrobeniusNorm();
if (gradientNorm > maxGradientPerMB)
{
double normFactor = maxGradientPerMB / gradientNorm;
gradient *= ElementType(normFactor);
}
}
}
}
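// [Editor's note] Worked example (numbers made up): with gradientClippingThresholdPerSample =
// 0.1 and actualMBSize = 64, maxGradientPerMB = 6.4. In truncation mode each element is clamped
// to [-6.4, 6.4]; otherwise a gradient with Frobenius norm 16.0 is scaled by 6.4 / 16.0 = 0.4
// so its norm lands exactly on the cap.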

// Performs additional preprocessing before calling the update method
// (gradient clipping and L2 regularization depending on the additional learning parameters).
template <typename ElementType>
void LearnerBase::PreProcess(const NDArrayViewPtr& parameterValue, const NDArrayViewPtr& gradientValue, size_t actualMBSize) const
{
const auto& gradientMatrix = gradientValue->GetWritableMatrix<ElementType>();

// clipping gradients to prevent outliers
ClipGradient<ElementType>(*gradientMatrix, actualMBSize);

// L2 regularizer
if (m_additionalOptions.l2RegularizationWeight > 0)
{
// multiply by actualMBSize so that it's invariant to minibatch size since learning rate is per sample
auto weight = ElementType(m_additionalOptions.l2RegularizationWeight * actualMBSize);
const auto& parameterMatrix = parameterValue->GetWritableMatrix<ElementType>();
Matrix<ElementType>::ScaleAndAdd(weight, *parameterMatrix, *gradientMatrix);
}
}
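// [Editor's note] The ScaleAndAdd above computes g <- g + (l2RegularizationWeight *
// actualMBSize) * w, i.e. the standard L2 / weight-decay gradient term; the minibatch scaling
// keeps the per-sample decay constant because the learning rate is per sample.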

// Performs additional postprocessing after the update method has been executed
// (noise injection and L1 regularization specified by the additional learning parameters).
template <typename ElementType>
void LearnerBase::PostProcess(const Parameter& parameter, const NDArrayViewPtr& gradientValue, size_t actualMBSize) const
{
const auto& parameterValue = parameter.Value();
const auto& parameterMatrix = parameterValue->GetWritableMatrix<ElementType>();
if (m_additionalOptions.gaussianNoiseInjectionStdDev > 0)
{
const auto& gradientMatrix = gradientValue->GetWritableMatrix<ElementType>();

Matrix<ElementType> sgdUpdateNoise((DEVICEID_TYPE)parameterMatrix->GetDeviceId());

// get the gradient structure since gradient is sparse
sgdUpdateNoise.SetValue(*gradientMatrix);

auto noiseStdDev = ElementType(m_additionalOptions.gaussianNoiseInjectionStdDev);

// reset its value to random
sgdUpdateNoise.SetGaussianRandomValue(ElementType(0.0), noiseStdDev);

Matrix<ElementType>::ScaleAndAdd(ElementType(1.0), sgdUpdateNoise, *parameterMatrix);
}

// L1 regularizer with proximal gradient descent method
if (m_additionalOptions.l1RegularizationWeight > 0)
{
auto learningRate = ElementType(ParameterDependentLearningRate(parameter));
// multiply by actualMBSize so that it's invariant to minibatch size since learning rate is per sample
auto weight = ElementType(learningRate * m_additionalOptions.l1RegularizationWeight * actualMBSize);
parameterValue->GetWritableMatrix<ElementType>()->InplaceSoftThreshold(weight);
}
}
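// [Editor's note] InplaceSoftThreshold(t) applies the proximal operator of t * |w|,
// w <- sign(w) * max(|w| - t, 0), here with t = learningRate * l1RegularizationWeight *
// actualMBSize. This is why L1 is handled after the update step (proximal gradient) rather
// than folded into the gradient like the L2 term above.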

template <typename ElementType>
/*static*/ TensorView<ElementType>* LearnerBase::GetWritableTensorView(const NDArrayViewPtr& arrayView)
{
return arrayView->GetWritableTensorView<ElementType>();
}

LearnerBase::LearnerBase(const unordered_set<Parameter>& parameters)
: Learner(parameters),
m_learningRatePerSample(0.0),
m_sampleCount(0)
{
const unordered_set<Parameter>& parameterSet = parameters;
for (const auto& parameter : parameterSet)
{
// TODO: using the same device to allocate data for all smoothed gradients. Is this correct?
// Should the device be specified on the per-parameter basis?
NDArrayViewPtr view;
if (parameter.GetDataType() == DataType::Float)
{
view = MakeSharedObject<NDArrayView>(0.0f, parameter.Shape(), parameter.Value()->Device());
}
else
{
view = MakeSharedObject<NDArrayView>(0.0, parameter.Shape(), parameter.Value()->Device());
}

m_smoothedGradientValues.insert(make_pair(parameter, view));
m_additionalOptions.learningRateMultipliers.insert(make_pair(parameter, 1.0));
}
}

void LearnerBase::ResetSmoothedGradients()
{
for (const auto& parameter : Parameters())
{
const auto& smoothedGradientValue = m_smoothedGradientValues.at(parameter);
const auto& data = smoothedGradientValue;
switch (data->GetDataType())
{
case DataType::Float:
data->SetValue(0.0f);
break;
case DataType::Double:
data->SetValue(0.0);
break;
default:
LogicError("Unsupported DataType %s", ::CNTK::DataTypeName(data->GetDataType()));
}
}
}

/*virtual*/ bool LearnerBase::Update(const unordered_map<Parameter, NDArrayViewPtr>& gradientValues, size_t trainingSampleCount) /*override*/
{
// make sure trainingSampleCount is a valid value
assert(trainingSampleCount > 0);

for (const auto& parameter : Parameters())
{
const auto& smoothedGradientValue = m_smoothedGradientValues.at(parameter);
const auto& gradientValue = gradientValues.at(parameter);
// TODO: make this a runtime parameter.
#if DUMPOUTPUT
LOGPRINTF(stderr, "Update_%ls\n", parameter.Name().c_str());
#endif

#ifdef _DEBUG
if (HasNan(smoothedGradientValue, "TrainOneEpoch/UpdateWeights/Learner::Update(): "))
LogicError("%ls has NaNs in smoothedGradient.", parameter.Name().c_str());
#endif

#if DUMPOUTPUT
LOGPRINTF(stderr, "learnRatePerSample=%0.8f, momentum=%0.8f, actualMBSize=%ld\n",
m_learningRatePerSample, m_momentumPerSample, trainingSampleCount);
LOGPRINTF(stderr, "GradUpdateType()=%s, GradientUpdateNoiseStd()=%0.8f\n",
LearnerType().c_str(), m_GaussianNoiseInjectStd);
Print(gradientValue, "Gradient Update");
Print(smoothedGradientValue, "Smoothed Gradient Input");
#endif
UPDATE_FUNCTION;

#if DUMPOUTPUT
Print(parameterValue, "Parameter Update");
#endif

#ifdef _DEBUG
const auto& parameterValue = parameter.Value();
if (HasNan(parameterValue, "TrainOneEpoch/UpdateWeights/Learner::Update(): "))
LogicError("%ls has NaNs in parameter values after parameter update.", parameter.Name().c_str());
#endif
}
m_sampleCount += trainingSampleCount;
return false;
}

template <typename ElementType>
void LearnerBase::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const
{
const auto& parameterValue = parameter.Value();
PreProcess<ElementType>(parameterValue, gradientValue, trainingSampleCount);
Update(parameter, gradientValue, smoothedGradientValue, trainingSampleCount);
PostProcess<ElementType>(parameter, gradientValue, trainingSampleCount);
}
|
||||
string LearnerBase::LearnerType() const
|
||||
{
|
||||
auto name = typeid(*this).name();
|
||||
if (strncmp(name, "class ", 6) == 0)
|
||||
{
|
||||
// On Windows, the type name contains "class" prefix.
|
||||
// Return the actual name, omitting the prefix.
|
||||
return &name[6];
|
||||
}
|
||||
return name;
|
||||
}
|
||||
|
||||
/*virtual*/ Dictionary LearnerBase::GetCheckpointState() const /*override*/
|
||||
{
|
||||
NOT_IMPLEMENTED; // Until the new checkpointing is fully fleshed out, nobody should be calling this.
|
||||
Dictionary checkpoint;
|
||||
|
||||
for (const auto& parameter : Parameters())
|
||||
{
|
||||
// TODO: parameter name is not guaranteed to be unique. Instead, all serializable objects
|
||||
// need to expose "UId" property -- a persistent unique internal name.
|
||||
// Switch to UId as soon as it's available.
|
||||
if (checkpoint.Contains(parameter.Name()))
|
||||
{
|
||||
LogicError("Parameter names must be unique");
|
||||
}
|
||||
const auto& smoothedGradientValue = m_smoothedGradientValues.at(parameter);
|
||||
|
||||
// Potentially, could store things like dimensions, element size, format, etc., but
|
||||
// that seems to be redundant, since all of that is passed in the constructor.
|
||||
checkpoint[parameter.Name()] = SerializeToVector(smoothedGradientValue);
|
||||
}
|
||||
return checkpoint;
|
||||
}
|
||||
|
||||
/*virtual*/ void LearnerBase::RestoreFromCheckpoint(const Dictionary& checkpoint) /*override*/
|
||||
{
|
||||
NOT_IMPLEMENTED; // Until the new checkpointing is fully fleshed out, nobody should be calling this.
|
||||
for (const auto& parameter : Parameters())
|
||||
{
|
||||
if (!checkpoint.Contains(parameter.Name()))
|
||||
{
|
||||
LogicError("Checkpoint does not contain state for parameter %ls", parameter.Name().c_str());
|
||||
}
|
||||
const auto& smoothedGradientValue = m_smoothedGradientValues.at(parameter);
|
||||
|
||||
const DictionaryValue& state = checkpoint[parameter.Name()];
|
||||
|
||||
const auto& data = smoothedGradientValue;
|
||||
|
||||
DeserializeFromVector(data, state.GetValue<vector<DictionaryValue>>());
|
||||
}
|
||||
}
|
||||
|
||||
/*virtual*/ void LearnerSGD::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const /*override*/
|
||||
{
|
||||
UPDATE_FUNCTION;
|
||||
}
|
||||
|
||||
template <typename ElementType>
|
||||
void LearnerSGD::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const
|
||||
{
|
||||
UNUSED(trainingSampleCount);
|
||||
|
||||
const auto& parameterValue = parameter.Value();
|
||||
const auto& smoothedGradientMatrix = GetWritableMatrix<ElementType>(smoothedGradientValue);
|
||||
const auto& gradientMatrix = GetWritableMatrix<ElementType>(gradientValue);
|
||||
const auto& parameterMatrix = GetWritableMatrix<ElementType>(parameterValue);
|
||||
|
||||
const auto& learningRate = ElementType(ParameterDependentLearningRate(parameter));
|
||||
|
||||
// TODO: break up the NormalGrad into 3 different functions, each with its own set of parameters
|
||||
// (one for vanilla SGD, the other for momentum SGD, and the third one for NAG).
|
||||
smoothedGradientMatrix->NormalGrad(*gradientMatrix, *parameterMatrix,
|
||||
learningRate, ElementType(m_momentumPerSample), m_useNesterovAcceleration);
|
||||
}
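
    // Sketch of the math (the authoritative semantics live in Matrix::NormalGrad): with
    // per-sample learning rate eta and momentum mu, the conventional momentum form is
    //     v <- mu * v + g;  w <- w - eta * v
    // and, when m_useNesterovAcceleration is set, the Nesterov-accelerated variant
    //     v <- mu * v + g;  w <- w - eta * (mu * v + g)
    // where v is the smoothed gradient, g the minibatch gradient, and w the parameter.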

    LearnerAdaGrad::LearnerAdaGrad(const unordered_set<Parameter>& parameters, bool needAveMultiplier)
        : LearnerBase(parameters), m_needAveMultiplier(needAveMultiplier)
    {
    }

    /*virtual*/ void LearnerAdaGrad::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const /*override*/
    {
        UPDATE_FUNCTION;
    }

    template <typename ElementType>
    void LearnerAdaGrad::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const
    {
        UNUSED(trainingSampleCount);

        const auto& parameterValue = parameter.Value();
        const auto& smoothedGradientMatrix = GetWritableMatrix<ElementType>(smoothedGradientValue);
        const auto& gradientMatrix = GetWritableMatrix<ElementType>(gradientValue);
        const auto& parameterMatrix = GetWritableMatrix<ElementType>(parameterValue);

        auto learningRate = ElementType(ParameterDependentLearningRate(parameter));

        auto aveMultiplier = smoothedGradientMatrix->Adagrad(*gradientMatrix, m_needAveMultiplier);
        Matrix<ElementType>::ScaleAndAdd(ElementType(-learningRate / aveMultiplier), *gradientMatrix, *parameterMatrix);
    }
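
    // Sketch of the math (Matrix::Adagrad owns the exact semantics): AdaGrad accumulates
    // squared gradients, h <- h + g .* g, and scales each gradient element by 1/sqrt(h),
    // giving an effective step of w <- w - eta * g ./ sqrt(h). When needAveMultiplier is
    // set, Adagrad() also returns an average of the per-element multipliers, which the
    // ScaleAndAdd call above divides out of the learning rate.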

    LearnerFSAdaGrad::LearnerFSAdaGrad(const unordered_set<Parameter>& parameters)
        : LearnerMomentumSGD(parameters)
    {
    }

    /*virtual*/ void LearnerFSAdaGrad::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const /*override*/
    {
        UPDATE_FUNCTION;
    }

    template <typename ElementType>
    void LearnerFSAdaGrad::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const
    {
        const auto& parameterValue = parameter.Value();
        const auto& smoothedGradientMatrix = GetWritableMatrix<ElementType>(smoothedGradientValue);
        const auto& gradientMatrix = GetWritableMatrix<ElementType>(gradientValue);
        const auto& parameterMatrix = GetWritableMatrix<ElementType>(parameterValue);

        //const double momentum = MomentumPerMB(m_momentumPerSample, trainingSampleCount);

        auto learningRate = ElementType(ParameterDependentLearningRate(parameter));

        smoothedGradientMatrix->FSAdagrad(trainingSampleCount, *gradientMatrix, *parameterMatrix,
                                          learningRate, ElementType(m_momentumPerSample));
    }
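
    // Note: FSAdaGrad (as implemented by Matrix::FSAdagrad) combines the momentum handling
    // inherited from LearnerMomentumSGD with an AdaGrad-style, per-element adaptive scaling
    // of the gradient. Unlike the learners above it also consumes the sample count, which
    // the underlying implementation uses for its internal scheduling; the exact update rule
    // is defined in the Matrix library.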

    LearnerRMSProp::LearnerRMSProp(const unordered_set<Parameter>& parameters,
                                   double gamma, double inc, double dec, double max, double min, bool needAveMultiplier)
        : LearnerBase(parameters),
        m_gamma(gamma), m_inc(inc), m_dec(dec), m_max(max), m_min(min),
        m_needAveMultiplier(needAveMultiplier)
    {
    }

    /*virtual*/ void LearnerRMSProp::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const /*override*/
    {
        UPDATE_FUNCTION;
    }

    template <typename ElementType>
    void LearnerRMSProp::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const
    {
        UNUSED(trainingSampleCount);

        const auto& parameterValue = parameter.Value();
        const auto& smoothedGradientMatrix = GetWritableMatrix<ElementType>(smoothedGradientValue);
        const auto& gradientMatrix = GetWritableMatrix<ElementType>(gradientValue);
        const auto& parameterMatrix = GetWritableMatrix<ElementType>(parameterValue);

        auto learningRate = ElementType(ParameterDependentLearningRate(parameter));

        auto aveMultiplier = smoothedGradientMatrix->RmsProp(*gradientMatrix,
                                                             ElementType(m_gamma), ElementType(m_inc),
                                                             ElementType(m_max), ElementType(m_dec),
                                                             ElementType(m_min), m_needAveMultiplier);
        Matrix<ElementType>::ScaleAndAdd(ElementType(-learningRate / aveMultiplier), *gradientMatrix, *parameterMatrix);
    }
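
    // Parameter sketch (the exact semantics live in Matrix::RmsProp): m_gamma is the decay
    // rate of the running average of squared gradients; m_inc and m_dec multiplicatively
    // grow or shrink a per-element step-size multiplier depending on the sign agreement of
    // successive gradients, with m_max and m_min clamping that multiplier. As with AdaGrad
    // above, the optional aveMultiplier is divided out of the learning rate in ScaleAndAdd.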

    // Explicit template instantiations
    template shared_ptr<Matrix<float>> LearnerBase::GetWritableMatrix<float>(const NDArrayViewPtr& arrayView);
    template shared_ptr<Matrix<double>> LearnerBase::GetWritableMatrix<double>(const NDArrayViewPtr& arrayView);

    LearnerPtr SGDLearner(const unordered_set<Parameter>& parameters, double learningRatePerSample)
    {
        return MakeSharedObject<LearnerSGD>(parameters, learningRatePerSample);
    }

    LearnerPtr MomentumSGDLearner(const unordered_set<Parameter>& parameters)
    {
        return MakeSharedObject<LearnerMomentumSGD>(parameters);
    }

    LearnerPtr NesterovLearner(const unordered_set<Parameter>& parameters)
    {
        return MakeSharedObject<LearnerNesterov>(parameters);
    }

    LearnerPtr AdaGradLearner(const unordered_set<Parameter>& parameters, bool needAveMultiplier)
    {
        return MakeSharedObject<LearnerAdaGrad>(parameters, needAveMultiplier);
    }

    LearnerPtr FSAdaGradLearner(const unordered_set<Parameter>& parameters)
    {
        return MakeSharedObject<LearnerFSAdaGrad>(parameters);
    }

    LearnerPtr RMSPropLearner(const unordered_set<Parameter>& parameters,
                              double gamma, double inc, double dec, double max, double min, bool needAveMultiplier)
    {
        return MakeSharedObject<LearnerRMSProp>(parameters, gamma, inc, dec, max, min, needAveMultiplier);
    }
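
    // Illustrative sketch (not part of the original change) of how the factory functions
    // above are meant to be driven; `parameters` and `gradients` are placeholders that
    // would normally come from a model and from backpropagation respectively:
    //
    //     std::unordered_set<Parameter> parameters = ...;               // model parameters
    //     LearnerPtr learner = SGDLearner(parameters, 0.005 /*learningRatePerSample*/);
    //     std::unordered_map<Parameter, NDArrayViewPtr> gradients = ...; // from backprop
    //     learner->Update(gradients, minibatchSampleCount);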
}
@ -0,0 +1,201 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//

#include "stdafx.h"
#include "CNTKLibrary.h"
#include <numeric>
#include <limits> // for std::numeric_limits, used below

namespace CNTK
{
    // A collection of additional options that are applicable to all standard learners
    // (once set, these options retain their values for the entire lifespan of a learner).
    struct AdditionalLearningOptions
    {
        double l1RegularizationWeight = 0.0;
        double l2RegularizationWeight = 0.0;
        double gaussianNoiseInjectionStdDev = 0.0;
        bool gradientClippingWithTruncation = true;
        double gradientClippingThresholdPerSample = std::numeric_limits<double>::infinity();
        std::unordered_map<Parameter, double> learningRateMultipliers;
    };
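
    // Illustrative configuration sketch (not part of the original change); `learner` is a
    // placeholder for any LearnerBase-derived instance:
    //
    //     AdditionalLearningOptions options;
    //     options.l2RegularizationWeight = 0.0001;
    //     options.gradientClippingThresholdPerSample = 5.0;
    //     learner->SetAdditionalOptions(options); // declared on LearnerBase below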

    // An abstract base class at the root of the standard learners hierarchy.
    // It implements most of the learner functionality, except for the actual update function,
    // and adds a few pre-/postprocessing methods (which are invoked before and after the update).
    class LearnerBase : public Learner
    {
    public:
        virtual bool Update(const std::unordered_map<Parameter, NDArrayViewPtr>& gradientValues, size_t trainingSampleCount) override final;

        virtual Dictionary GetCheckpointState() const override final;

        virtual void RestoreFromCheckpoint(const Dictionary& checkpoint) override final;

        void SetAdditionalOptions(const AdditionalLearningOptions& additionalOptions)
        {
            m_additionalOptions = additionalOptions;
        }

        // TODO: should this be called ResetMomentum?
        // Needed for BlockMomentumSGD to reset the SGD momentum after aggregation.
        void ResetSmoothedGradients();

        // TODO: move learning rate and momentum scheduling and adjustment functionality
        // inside the learner and drop these setters.
        void SetLearningRate(double value) { m_learningRatePerSample = value; }

    protected:
        LearnerBase(const std::unordered_set<Parameter>& parameters);

        virtual void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const = 0;

        double ParameterDependentLearningRate(const Parameter& parameter) const
        {
            return m_learningRatePerSample * m_additionalOptions.learningRateMultipliers.at(parameter);
        }

        std::string LearnerType() const;

        double m_learningRatePerSample;

        AdditionalLearningOptions m_additionalOptions;

        std::unordered_map<Parameter, NDArrayViewPtr> m_smoothedGradientValues;

        // The following four static protected methods expose private methods of the NDArrayView class
        // (which declares LearnerBase as a friend class), so that they are available to subclasses.
        template <typename ElementType>
        static std::shared_ptr<const Microsoft::MSR::CNTK::Matrix<ElementType>> GetMatrix(const NDArrayViewPtr& arrayView);

        template <typename ElementType>
        static std::shared_ptr<Microsoft::MSR::CNTK::Matrix<ElementType>> GetWritableMatrix(const NDArrayViewPtr& arrayView);

        template <typename ElementType>
        static const Microsoft::MSR::CNTK::TensorView<ElementType>* GetTensorView(const NDArrayViewPtr& arrayView);

        template <typename ElementType>
        static Microsoft::MSR::CNTK::TensorView<ElementType>* GetWritableTensorView(const NDArrayViewPtr& arrayView);

        template <typename ElementType>
        void ClipGradient(Microsoft::MSR::CNTK::Matrix<ElementType>& gradient, size_t actualMBSize) const;

        // Performs additional preprocessing before calling the update method
        // (gradient clipping and L2 regularization, depending on the additional learning parameters).
        template <typename ElementType>
        void PreProcess(const NDArrayViewPtr& parameterValue, const NDArrayViewPtr& gradientValue, size_t actualMBSize) const;

        // Performs additional postprocessing after the update method has been executed
        // (noise injection and L1 regularization, as specified by the additional learning parameters).
        template <typename ElementType>
        void PostProcess(const Parameter& parameter, const NDArrayViewPtr& gradientValue, size_t actualMBSize) const;

    private:
        // Templatized update function; it invokes pre- and postprocessing using the provided
        // template parameter and also invokes the virtual Update method implemented in one of the subclasses.
        template <typename ElementType>
        void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const;

        // TODO: make these functions friends of NDArrayView and move them to Utils?
        static bool HasNan(const NDArrayViewPtr& value, const char* name);
        static void Print(const NDArrayViewPtr& value, const char* msg);

        size_t m_sampleCount;
    };

    // Vanilla gradient descent optimization algorithm.
    class LearnerSGD : public LearnerBase
    {
    public:
        LearnerSGD(const std::unordered_set<Parameter>& parameters, double learningRatePerSample = 0)
            : LearnerBase(parameters), m_momentumPerSample(0.0), m_useNesterovAcceleration(false)
        {
            SetLearningRate(learningRatePerSample);
        }

    protected:
        virtual void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const override;

        template <typename ElementType>
        void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const;

        double m_momentumPerSample;
        bool m_useNesterovAcceleration;
    };

    // SGD optimization with momentum.
    class LearnerMomentumSGD : public LearnerSGD
    {
    public:
        LearnerMomentumSGD(const std::unordered_set<Parameter>& parameters)
            : LearnerSGD(parameters)
        {}

        void SetMomentum(double value) { m_momentumPerSample = value; }
    };

    // Nesterov's accelerated gradient descent.
    class LearnerNesterov : public LearnerSGD
    {
    public:
        LearnerNesterov(const std::unordered_set<Parameter>& parameters)
            : LearnerSGD(parameters)
        {
            m_useNesterovAcceleration = true;
        }
    };

    class LearnerAdaGrad : public LearnerBase
    {
    public:
        LearnerAdaGrad(const std::unordered_set<Parameter>& parameters, bool needAveMultiplier);

    protected:
        bool m_needAveMultiplier;

        virtual void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const override;

        template <typename ElementType>
        void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const;
    };

    class LearnerFSAdaGrad : public LearnerMomentumSGD
    {
    public:
        LearnerFSAdaGrad(const std::unordered_set<Parameter>& parameters);

    protected:
        virtual void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const override;

        template <typename ElementType>
        void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const;
    };

    class LearnerRMSProp : public LearnerBase
    {
    public:
        LearnerRMSProp(const std::unordered_set<Parameter>& parameters,
                       double gamma, double inc, double dec, double max, double min, bool needAveMultiplier);

    protected:
        double m_gamma;
        double m_inc;
        double m_dec;
        double m_max;
        double m_min;
        bool m_needAveMultiplier;

        virtual void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const override;

        template <typename ElementType>
        void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const;
    };
}
@ -0,0 +1,246 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//

#include "stdafx.h"
#include "CNTKLibrary.h"
#include "Utils.h"
#include "Config.h"
#include "MinibatchSource.h"
#include "HeapMemoryProvider.h"
#include "ReaderShim.h"
#include "Function.h"
#include <tuple>
#include "ComputationNetworkBuilder.h"

using namespace Microsoft::MSR::CNTK;

namespace CNTK
{
    MinibatchSourcePtr CreateCompositeMinibatchSource(const Dictionary& configuration)
    {
        return MinibatchSourcePtr(new CompositeMinibatchSource(configuration));
    }

    CompositeMinibatchSource::CompositeMinibatchSource(const Dictionary& configuration)
        : m_epochEndReached(false), m_prevMinibatchSize(0), m_epochSize(SIZE_MAX)
    {
        ConfigParameters config;
        std::wstringstream s;
        for (const auto& keyValuePair : *(configuration.m_dictionaryData))
            AddConfigString(s, keyValuePair.first, keyValuePair.second, 0);

        config.Parse(msra::strfun::utf8(s.str()));

        const wchar_t* epochSizeConfigurationKey = L"epochSize";
        if (configuration.Contains(epochSizeConfigurationKey))
            m_epochSize = configuration[epochSizeConfigurationKey].GetValue<size_t>();

        if (m_epochSize == 0)
            m_epochSize = Microsoft::MSR::CNTK::requestDataSize;

        typedef Reader*(*CreateCompositeDataReaderProc)(const ConfigParameters* parameters);
        CreateCompositeDataReaderProc createReaderProc = (CreateCompositeDataReaderProc)Plugin().Load(L"CompositeDataReader", "CreateCompositeDataReader");
        m_compositeDataReader.reset(createReaderProc(&config));

        auto compositeDataReaderStreamDescs = m_compositeDataReader->GetStreamDescriptions();
        for (auto streamDesc : compositeDataReaderStreamDescs)
            m_streamInfos.insert({ streamDesc->m_name, streamDesc->m_id, AsStorageFormat(streamDesc->m_storageType), AsDataType(streamDesc->m_elementType), AsNDShape(*(streamDesc->m_sampleLayout)) });
    }
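
    // Configuration sketch (illustrative, not part of the original change): the Dictionary is
    // translated verbatim into the legacy ConfigParameters format above, so any key understood
    // by CompositeDataReader may be supplied; "epochSize" is the only key interpreted here.
    //
    //     Dictionary configuration;
    //     configuration[L"epochSize"] = (size_t)50000; // 0 selects requestDataSize
    //     auto minibatchSource = CreateCompositeMinibatchSource(configuration);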

    /*virtual*/ std::unordered_map<StreamInfo, MinibatchData> CompositeMinibatchSource::GetNextMinibatch(const std::unordered_map<StreamInfo, std::pair<size_t, size_t>>& perStreamMBSizeLimits,
                                                                                                         const DeviceDescriptor& device /*= DeviceDescriptor::DefaultDevice()*/) /*override*/
    {
        std::unordered_map<StreamInfo, MinibatchData> minibatchData;
        if (!m_epochEndReached)
        {
            // TODO: Support different minibatch sizes for different streams
            size_t requestedMinibatchSizeInSamples = 0;
            for (const auto& val : perStreamMBSizeLimits)
            {
                size_t maxNumSequencesRequested = val.second.first;
                size_t maxNumSamplesRequested = val.second.second;

                // TODO: Specifying the minibatch size in #sequences is currently unsupported
                if (maxNumSequencesRequested != 0)
                    LogicError("Specifying minibatch size in #sequences is currently unsupported");

                if (requestedMinibatchSizeInSamples == 0)
                    requestedMinibatchSizeInSamples = maxNumSamplesRequested;
                else
                {
                    if (requestedMinibatchSizeInSamples != maxNumSamplesRequested)
                        LogicError("Different minibatch sizes across different input streams is currently unsupported!");
                }
            }

            if (requestedMinibatchSizeInSamples == 0)
                InvalidArgument("GetNextMinibatch: Requested minibatch sizes must be > 0");

            if (m_prevMinibatchSize == 0)
            {
                // TODO: Add support for distributed reading
                EpochConfiguration epochConfig = { 1, 0, requestedMinibatchSizeInSamples, m_epochSize, 0, 0 };
                m_compositeDataReader->StartEpoch(epochConfig);
                m_prevMinibatchSize = requestedMinibatchSizeInSamples;
            }

            if (requestedMinibatchSizeInSamples != m_prevMinibatchSize)
                LogicError("GetNextMinibatch: Changing minibatch sizes across calls is currently unsupported");

            auto compositeReaderMinibatchData = m_compositeDataReader->ReadMinibatch();
            m_epochEndReached = compositeReaderMinibatchData.m_endOfEpoch;

            auto compositeDataReaderStreamDescs = m_compositeDataReader->GetStreamDescriptions();
            size_t numStreams = compositeDataReaderStreamDescs.size();
            for (size_t i = 0; i < numStreams; ++i)
            {
                auto currentStreamDesc = compositeDataReaderStreamDescs[i];
                auto iter = std::find_if(perStreamMBSizeLimits.begin(), perStreamMBSizeLimits.end(), [currentStreamDesc](const std::pair<StreamInfo, std::pair<size_t, size_t>>& entry) {
                    return entry.first.m_id == currentStreamDesc->m_id;
                });

                if (iter == perStreamMBSizeLimits.end())
                    continue;

                auto& currentStreamInfo = iter->first;
                auto sampleShape = AsNDShape(*(currentStreamDesc->m_sampleLayout));

                ValuePtr minibatchValuePtr;
                if (compositeReaderMinibatchData.m_data.empty())
                {
                    minibatchValuePtr = MakeSharedObject<Value>(MakeSharedObject<NDArrayView>(currentStreamInfo.m_elementType, sampleShape.AppendShape({ 0, 0 }), DeviceDescriptor::CPUDevice()));
                    continue;
                }

                auto currentStreamMinibatchData = compositeReaderMinibatchData.m_data[i];
                if (currentStreamDesc->m_elementType == ElementType::tfloat)
                {
                    auto dataMatrix = std::make_shared<Matrix<float>>(CPUDEVICE);
                    size_t sampleSize = currentStreamDesc->m_sampleLayout->GetNumElements();

                    // TODO: Eliminate the unnecessary CPU to CPU copy
                    ReaderShim<float>::FillMatrixFromStream(currentStreamDesc->m_storageType, dataMatrix.get(), sampleSize, currentStreamMinibatchData);
                    minibatchValuePtr = CompositeFunction::GetValueObjectFromCNTKImplMatrixAndMBLayout<float>(sampleShape, *dataMatrix, currentStreamMinibatchData->m_layout, false);

                    size_t numSamples = currentStreamMinibatchData->m_layout->GetActualNumSamples();
                    size_t numSequences = currentStreamMinibatchData->m_layout->GetNumSequences();

                    minibatchData[currentStreamInfo] = { numSequences, numSamples, minibatchValuePtr };
                }
                else
                    LogicError("Input data of type other than DataType::Float is currently unsupported by the CNTK built-in composite MinibatchSource!");
            }
        }

        return minibatchData;
    }
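
    // Reading-loop sketch (illustrative): each per-stream limit pair is
    // (maxNumSequences, maxNumSamples); only the sample count is honored today, and it must
    // be identical for every requested stream (see the checks above).
    //
    //     std::unordered_map<StreamInfo, std::pair<size_t, size_t>> limits;
    //     for (const auto& s : minibatchSource->StreamInfos())
    //         limits[s] = { 0, 1024 }; // 0 sequences (unsupported), 1024 samples
    //     auto mb = minibatchSource->GetNextMinibatch(limits);
    //     // an empty result indicates that the end of the epoch has been reached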

    void ComputeInputPerDimMeansAndInvStdDevs(const MinibatchSourcePtr& minibatchSource,
                                              std::unordered_map<StreamInfo, std::pair<NDArrayViewPtr, NDArrayViewPtr>>& computedMeanAndInvStdDevs,
                                              const DeviceDescriptor& device /*= DeviceDescriptor::CPUDevice()*/)
    {
        typedef std::shared_ptr<ComputationNode<float>> ComputationNodePtr;
        const auto& minibatchSourceStreams = minibatchSource->StreamInfos();

        auto computationNetwork = std::make_shared<ComputationNetwork>(AsCNTKImplDeviceId(device));
        ComputationNetworkBuilder<float> builder(*computationNetwork);

        std::vector<ComputationNodeBasePtr> allInputNodes;
        std::unordered_map<StreamInfo, ComputationNodeBasePtr> streamToInputNodeMap;
        std::unordered_map<StreamInfo, Variable> streamToDummyInputVariableMap;
        std::unordered_map<StreamInfo, ComputationNodeBasePtr> streamToMeanNodeMap;
        std::unordered_map<StreamInfo, ComputationNodeBasePtr> streamToInvStdDevNodeMap;

        size_t totalSizePerSample = 0;
        for (auto& currentStreamKV : computedMeanAndInvStdDevs)
        {
            auto currentStreamInfo = currentStreamKV.first;
            if (minibatchSourceStreams.find(currentStreamInfo) == minibatchSourceStreams.end())
                InvalidArgument("ComputeInputPerDimMeansAndInvStdDevs: The stream for which the mean and variance are to be computed is not supported by the specified minibatchSource");

            if (currentStreamInfo.m_elementType != DataType::Float)
                LogicError("Input data of type other than DataType::Float is currently unsupported by the CNTK built-in composite MinibatchSource!");

            auto inputVariableShape = currentStreamInfo.m_sampleLayout;
            auto inputTensorShape = AsTensorShape(inputVariableShape);
            totalSizePerSample += (inputVariableShape.TotalSize() * sizeof(float));

            ComputationNodePtr inputNode;
            Variable inputVariable;
            if (currentStreamInfo.m_storageFormat != StorageFormat::Dense)
            {
                inputNode = builder.CreateSparseInputNode(currentStreamInfo.m_name, inputTensorShape);
                inputVariable = Variable(inputVariableShape, true, DataType::Float, currentStreamInfo.m_name);
            }
            else
            {
                inputNode = builder.CreateInputNode(currentStreamInfo.m_name, inputTensorShape);
                inputVariable = Variable(inputVariableShape, DataType::Float, currentStreamInfo.m_name);
            }

            allInputNodes.push_back(inputNode);
            streamToInputNodeMap[currentStreamInfo] = inputNode;
            streamToDummyInputVariableMap[currentStreamInfo] = inputVariable;
            streamToMeanNodeMap[currentStreamInfo] = builder.Mean(inputNode);
            streamToInvStdDevNodeMap[currentStreamInfo] = builder.InvStdDev(inputNode);
        }

        computationNetwork->CompileNetwork();
        computationNetwork->AllocateAllMatrices(computationNetwork->RootNodes(), {}, nullptr);

        ScopedNetworkOperationMode modeGuard(computationNetwork, NetworkOperationMode::preComputing);

        // initialize
        auto preComputeNodes = computationNetwork->GetNodesRequiringPreComputation();
        for (auto& preComputeNode : preComputeNodes)
            dynamic_pointer_cast<IPreComputeNode>(preComputeNode)->MarkComputed(false /*begin accumulating*/);

        const size_t maxMinibatchDataSize = (1 << 27); // 128 MB
        const size_t minibatchSize = maxMinibatchDataSize / totalSizePerSample;
        std::unordered_map<StreamInfo, std::pair<size_t, size_t>> minibatchSizeLimits;
        for (auto& currentStreamKV : computedMeanAndInvStdDevs)
            minibatchSizeLimits.insert(std::make_pair(currentStreamKV.first, std::make_pair((size_t)0, minibatchSize)));

        for (;;)
        {
            auto minibatchData = minibatchSource->GetNextMinibatch(minibatchSizeLimits, device);
            if (minibatchData.empty())
                break;

            for (auto& currentStreamKV : computedMeanAndInvStdDevs)
                CompositeFunction::PopulateComputationNodeValue<float>({ streamToDummyInputVariableMap[currentStreamKV.first], minibatchData[currentStreamKV.first].m_data }, streamToInputNodeMap[currentStreamKV.first]);

            ComputationNetwork::BumpEvalTimeStamp(allInputNodes);

            computationNetwork->ForwardProp(preComputeNodes);
        }

        // finalize
        for (auto& preComputeNode : preComputeNodes)
            dynamic_pointer_cast<IPreComputeNode>(preComputeNode)->MarkComputed(true /*done accumulating*/);

        // Copy out the results
        for (auto& currentStreamKV : computedMeanAndInvStdDevs)
        {
            ValuePtr mean, invStdDev;
            if (computedMeanAndInvStdDevs[currentStreamKV.first].first != nullptr)
                mean = MakeSharedObject<Value>(computedMeanAndInvStdDevs[currentStreamKV.first].first);

            if (computedMeanAndInvStdDevs[currentStreamKV.first].second != nullptr)
                invStdDev = MakeSharedObject<Value>(computedMeanAndInvStdDevs[currentStreamKV.first].second);

            CompositeFunction::GetNodeOutputOrGradient(streamToDummyInputVariableMap[currentStreamKV.first], mean, streamToMeanNodeMap[currentStreamKV.first], false /*getGradient*/);
            CompositeFunction::GetNodeOutputOrGradient(streamToDummyInputVariableMap[currentStreamKV.first], invStdDev, streamToInvStdDevNodeMap[currentStreamKV.first], false /*getGradient*/);

            if (computedMeanAndInvStdDevs[currentStreamKV.first].first == nullptr)
                computedMeanAndInvStdDevs[currentStreamKV.first].first = mean->Data();

            if (computedMeanAndInvStdDevs[currentStreamKV.first].second == nullptr)
                computedMeanAndInvStdDevs[currentStreamKV.first].second = invStdDev->Data();
        }
    }
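
    // Usage sketch (illustrative; `featuresStreamInfo` is a placeholder): callers pre-populate
    // the map with the streams of interest, and entries left as nullptr are filled in with
    // freshly computed values.
    //
    //     std::unordered_map<StreamInfo, std::pair<NDArrayViewPtr, NDArrayViewPtr>> meansAndInvStdDevs;
    //     meansAndInvStdDevs[featuresStreamInfo] = { nullptr, nullptr }; // request both statistics
    //     ComputeInputPerDimMeansAndInvStdDevs(minibatchSource, meansAndInvStdDevs);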
}
@ -0,0 +1,32 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//

#pragma once

#include "stdafx.h"
#include "CNTKLibrary.h"
#include "Utils.h"
#include "Reader.h"

namespace CNTK
{
    class CompositeMinibatchSource final : public MinibatchSource
    {
    public:
        CompositeMinibatchSource(const Dictionary& configuration);

        virtual const std::unordered_set<StreamInfo>& StreamInfos() override { return m_streamInfos; }

        virtual std::unordered_map<StreamInfo, MinibatchData> GetNextMinibatch(const std::unordered_map<StreamInfo, std::pair<size_t, size_t>>& perStreamMBSizeLimits,
                                                                               const DeviceDescriptor& device = DeviceDescriptor::DefaultDevice()) override;

    private:
        std::unordered_set<StreamInfo> m_streamInfos;
        std::shared_ptr<Microsoft::MSR::CNTK::Reader> m_compositeDataReader;
        bool m_epochEndReached;
        size_t m_prevMinibatchSize;
        size_t m_epochSize;
    };
}
@ -316,7 +316,17 @@ namespace CNTK
    }

    template <typename ElementType>
    NDArrayViewPtr NDArrayView::RandomUniform(const NDShape& shape, double rangeBegin, double rangeEnd, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::DefaultDevice()*/)
    /*static*/ NDArrayViewPtr NDArrayView::RandomNormal(const NDShape& shape, double mean, double stdDev, unsigned long seed, const DeviceDescriptor& device /*= DeviceDescriptor::DefaultDevice()*/)
    {
        auto matrixDims = GetMatrixDimensions(shape);
        auto randomNormalMatrix = std::make_shared<Matrix<ElementType>>(Matrix<ElementType>::RandomGaussian(matrixDims.first, matrixDims.second, AsCNTKImplDeviceId(device), (ElementType)mean, (ElementType)stdDev, seed));
        auto tensorView = new TensorView<ElementType>(randomNormalMatrix, AsTensorShape(shape));

        return MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), device, StorageFormat::Dense, shape, false, tensorView);
    }
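
    // Illustrative use of the two random initializers defined here (seed and ranges are
    // arbitrary placeholder values):
    //
    //     auto weights = NDArrayView::RandomUniform<float>({ 128, 64 }, -0.05, 0.05, 1 /*seed*/);
    //     auto biases  = NDArrayView::RandomNormal<float>({ 128 }, 0.0, 0.01, 1 /*seed*/);
    //
    // Both return a dense view allocated on the requested device (DefaultDevice() when omitted).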

    template <typename ElementType>
    /*static*/ NDArrayViewPtr NDArrayView::RandomUniform(const NDShape& shape, double rangeBegin, double rangeEnd, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::DefaultDevice()*/)
    {
        auto matrixDims = GetMatrixDimensions(shape);
        auto randomUniformMatrix = std::make_shared<Matrix<ElementType>>(Matrix<ElementType>::RandomUniform(matrixDims.first, matrixDims.second, AsCNTKImplDeviceId(device), (ElementType)rangeBegin, (ElementType)rangeEnd, seed));
@ -329,6 +339,9 @@ namespace CNTK
    template CNTK_API NDArrayViewPtr NDArrayView::RandomUniform<float>(const NDShape& shape, double rangeBegin, double rangeEnd, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::DefaultDevice()*/);
    template CNTK_API NDArrayViewPtr NDArrayView::RandomUniform<double>(const NDShape& shape, double rangeBegin, double rangeEnd, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::DefaultDevice()*/);

    template CNTK_API NDArrayViewPtr NDArrayView::RandomNormal<float>(const NDShape& shape, double mean, double stdDev, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::DefaultDevice()*/);
    template CNTK_API NDArrayViewPtr NDArrayView::RandomNormal<double>(const NDShape& shape, double mean, double stdDev, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::DefaultDevice()*/);

    template CNTK_API const float* NDArrayView::DataBuffer<float>() const;
    template CNTK_API const double* NDArrayView::DataBuffer<double>() const;
@ -338,8 +351,10 @@ namespace CNTK
    template std::shared_ptr<const Matrix<float>> NDArrayView::GetMatrix(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/) const;
    template std::shared_ptr<const Matrix<double>> NDArrayView::GetMatrix(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/) const;

    template std::shared_ptr<Matrix<float>> NDArrayView::GetWritableMatrix(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/);
    template std::shared_ptr<Matrix<double>> NDArrayView::GetWritableMatrix(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/);
    template std::shared_ptr<Matrix<float>> NDArrayView::GetWritableMatrix<float>(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/);
    template std::shared_ptr<Matrix<double>> NDArrayView::GetWritableMatrix<double>(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/);
    template TensorView<float>* NDArrayView::GetWritableTensorView<float>();
    template TensorView<double>* NDArrayView::GetWritableTensorView<double>();

    template CNTK_API NDArrayView::NDArrayView(const NDShape& viewShape, const SparseIndexType* colStarts, const SparseIndexType* rowIndices, const float* nonZeroValues, size_t numNonZeroValues, const DeviceDescriptor& device, bool readOnly/* = false*/);
    template CNTK_API NDArrayView::NDArrayView(const NDShape& viewShape, const SparseIndexType* colStarts, const SparseIndexType* rowIndices, const double* nonZeroValues, size_t numNonZeroValues, const DeviceDescriptor& device, bool readOnly/* = false*/);
@ -81,6 +81,24 @@ namespace CNTK
        GetMatrix()->SetValue(1);
    }

    size_t NDMask::MaskedCount() const
    {
        auto maskMatrix = GetMatrix();
        std::unique_ptr<char[]> maskData(maskMatrix->CopyToArray());
        return std::count_if(maskData.get(), maskData.get() + maskMatrix->GetNumElements(), [](const char& val) {
            return val == 0;
        });
    }

    // TODO: This could actually be strided?
    const char* NDMask::DataBuffer() const
    {
        // First make sure that the underlying matrix is on the right device
        auto matrix = GetMatrix();
        matrix->TransferToDeviceIfNotThere(AsCNTKImplDeviceId(m_device), true);
        return matrix->Data();
    }

    Matrix<char>* NDMask::GetMatrix() const
    {
        return m_matrixView.get();
@ -0,0 +1,78 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//

#include "stdafx.h"
#include "CNTKLibrary.h"
#include "Utils.h"

namespace CNTK
{
    Trainer::Trainer(const FunctionPtr& model, const Variable& trainingLoss, const std::unordered_set<LearnerPtr>& parameterLearners)
        : m_model(model), m_trainingLossVar(trainingLoss), m_parameterLearners(parameterLearners)
    {
        auto modelParameters = model->Parameters();
        std::unordered_set<Parameter> learnerParameters;
        for (const auto& learner : parameterLearners)
        {
            const auto& currentLearnerParameters = learner->Parameters();
            for (const auto& parameter : currentLearnerParameters)
            {
                auto insertRetVal = learnerParameters.insert(parameter);
                if (!insertRetVal.second)
                    InvalidArgument("Trainer::Trainer: Parameter named %S is covered by 2 different learners", parameter.Name().c_str());
            }
        }

        if (modelParameters != learnerParameters)
            InvalidArgument("Trainer::Trainer: The union of the parameters covered by the specified parameterLearners should match the specified model's parameters");
    }

    bool Trainer::TrainMinibatch(const std::unordered_map<Variable, ValuePtr>& arguments, const DeviceDescriptor& computeDevice /*= DeviceDescriptor::DefaultDevice()*/)
    {
        std::unordered_map<Variable, ValuePtr> outputs = { { m_trainingLossVar, nullptr } };
        auto backPropState = m_model->Forward(arguments, outputs, computeDevice, { m_trainingLossVar });
        m_prevMinibatchTrainingLossValue = outputs.begin()->second;

        ValuePtr rootGradientValue = MakeSharedObject<Value>(MakeSharedObject<NDArrayView>(m_trainingLossVar.GetDataType(), outputs.at(m_trainingLossVar)->Data()->Shape(), computeDevice), outputs.at(m_trainingLossVar)->Mask());
        if (m_trainingLossVar.GetDataType() == DataType::Float)
            rootGradientValue->Data()->SetValue(1.0f);
        else
            rootGradientValue->Data()->SetValue(1.0);

        auto modelParameters = m_model->Parameters();
        std::unordered_map<Variable, ValuePtr> parameterGradients;
        for (const auto& parameter : modelParameters)
            parameterGradients[parameter] = nullptr;

        m_model->Backward(backPropState, { { m_trainingLossVar, rootGradientValue } }, parameterGradients);

        bool anyUpdatesPerformed = false;
        for (auto learner : m_parameterLearners)
        {
            std::unordered_map<Parameter, NDArrayViewPtr> learnerParameterGradients;
            const auto& learnerParameters = learner->Parameters();
            for (const auto& parameter : learnerParameters)
            {
                learnerParameterGradients[parameter] = parameterGradients[parameter]->Data();

                if (parameterGradients[parameter]->Mask())
                    LogicError("The gradient value for a Parameter cannot have an associated mask!");
            }

            auto trainingLossArguments = m_trainingLossVar.Owner()->Arguments();
            auto labelsVar = *(std::find_if(trainingLossArguments.begin(), trainingLossArguments.end(), [](const Variable& var) {
                return var.IsInput();
            }));
            auto argumentValue = arguments.at(labelsVar);
            auto argumentData = argumentValue->Data();
            auto argumentDataShape = argumentData->Shape();
            auto mask = argumentValue->Mask();
            size_t numSamples = argumentDataShape[argumentDataShape.NumAxes() - 1] - ((mask != nullptr) ? mask->MaskedCount() : 0);
            anyUpdatesPerformed |= learner->Update(learnerParameterGradients, numSamples);
        }

        return anyUpdatesPerformed;
    }
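
    // Training-loop sketch (illustrative, not part of the original change); `model`,
    // `lossVariable`, and `minibatches` are placeholders created elsewhere:
    //
    //     auto learner = SGDLearner(model->Parameters(), 0.005 /*learningRatePerSample*/);
    //     Trainer trainer(model, lossVariable, { learner });
    //     for (const auto& minibatchArguments : minibatches)
    //         trainer.TrainMinibatch(minibatchArguments);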
}
@ -6,31 +6,162 @@
#include "stdafx.h"
#include "CNTKLibrary.h"
#include "Utils.h"
#include "File.h"

using namespace std;

namespace CNTK
{
    template <typename T>
    void DictionaryValue::AllocateDataPtr(const T& value)
    {
        static_assert(is_same<T, NDShape>::value ||
                      is_same<T, wstring>::value ||
                      is_same<T, vector<DictionaryValue>>::value ||
                      is_same<T, Dictionary>::value, "AllocateDataPtr called with invalid type");
        m_data.m_ptr = new T(value);
    }

    template <typename T>
    void DictionaryValue::FreePtrAsType()
    {
        T* typedPtr = reinterpret_cast<T*>(m_data.m_ptr);
        delete typedPtr;

        m_data.m_ptr = nullptr;
    }

    Microsoft::MSR::CNTK::File& operator>>(Microsoft::MSR::CNTK::File& stream, DictionaryValue& us)
    {
        size_t version;
        stream >> version;

        stream >> us.m_valueType;

        switch (us.ValueType())
        {
        case DictionaryValue::Type::Bool:
            stream >> us.m_data.m_boolean;
            break;
        case DictionaryValue::Type::SizeT:
            stream >> us.m_data.m_sizeT;
            break;
        case DictionaryValue::Type::Float:
            stream >> us.m_data.m_float;
            break;
        case DictionaryValue::Type::Double:
            stream >> us.m_data.m_double;
            break;
        case DictionaryValue::Type::NDShape:
        {
            size_t size;
            stream >> size;
            vector<size_t> dims(size);
            for (size_t i = 0; i < size; i++)
            {
                stream >> dims[i];
            }
            us.AllocateDataPtr(NDShape(dims));
            break;
        }
        case DictionaryValue::Type::Vector:
        {
            size_t size;
            stream >> size;
            vector<DictionaryValue> values(size);
            for (size_t i = 0; i < size; i++)
            {
                stream >> values[i];
            }
            us.AllocateDataPtr(values);
            break;
        }
        default:
            NOT_IMPLEMENTED;
        }
        return stream;
    }

    Microsoft::MSR::CNTK::File& operator<<(Microsoft::MSR::CNTK::File& stream, const DictionaryValue& us)
    {
        stream << us.version;

        stream << us.ValueType();

        switch (us.ValueType())
        {
        case DictionaryValue::Type::Bool:
            stream << us.m_data.m_boolean;
            break;
        case DictionaryValue::Type::SizeT:
            stream << us.m_data.m_sizeT;
            break;
        case DictionaryValue::Type::Float:
            stream << us.m_data.m_float;
            break;
        case DictionaryValue::Type::Double:
            stream << us.m_data.m_double;
            break;
        case DictionaryValue::Type::NDShape:
        {
            NDShape* shapePtr = reinterpret_cast<NDShape*>(us.m_data.m_ptr);
            auto size = shapePtr->NumAxes();
            stream << size;
            for (size_t i = 0; i < size; i++)
            {
                stream << shapePtr->operator[](i);
            }
            break;
        }
        case DictionaryValue::Type::Vector:
        {
            vector<DictionaryValue>* vectorPtr =
                reinterpret_cast<vector<DictionaryValue>*>(us.m_data.m_ptr);
            auto size = vectorPtr->size();
            stream << size;
            for (size_t i = 0; i < size; i++)
            {
                stream << vectorPtr->operator[](i);
            }
            break;
        }
        default:
            NOT_IMPLEMENTED;
        }
        return stream;
    }

    Dictionary::Dictionary()
        : m_dictionaryData(new std::unordered_map < std::wstring, DictionaryValue>)
        : m_dictionaryData(new unordered_map<wstring, DictionaryValue>)
    {
    }

    Dictionary::~Dictionary()
    {
        delete m_dictionaryData;
    }

    Dictionary::Dictionary(const Dictionary& other)
    {
        *this = other;
    }

    Dictionary& Dictionary::operator=(const Dictionary& other)
    {
        assert(this != &other);
        m_dictionaryData.reset(new std::unordered_map<std::wstring, DictionaryValue>(*(other.m_dictionaryData)));
        return *this;
    }

    Dictionary::Dictionary(Dictionary&& other)
        : m_dictionaryData(nullptr)
    {
        *this = std::move(other);
        *this = move(other);
    }

    Dictionary& Dictionary::operator=(Dictionary&& other)
    {
        assert(this != &other);

        delete m_dictionaryData;

        m_dictionaryData = other.m_dictionaryData;
        other.m_dictionaryData = nullptr;
@ -51,4 +182,137 @@ namespace CNTK
    {
        return (m_dictionaryData->find(key) != m_dictionaryData->end());
    }

    Microsoft::MSR::CNTK::File& operator<<(Microsoft::MSR::CNTK::File& stream, const Dictionary& us)
    {
        stream << us.version;
        stream << us.m_dictionaryData->size();
        for (auto it = us.m_dictionaryData->begin(); it != us.m_dictionaryData->end(); ++it)
        {
            stream << it->first;
            stream << it->second;
        }
        return stream;
    }

    Microsoft::MSR::CNTK::File& operator>>(Microsoft::MSR::CNTK::File& stream, Dictionary& us)
    {
        size_t version;
        stream >> version;
        size_t size;
        stream >> size;
        us.m_dictionaryData->reserve(size);
        for (size_t i = 0; i < size; i++)
        {
            wstring key;
            stream >> key;
            DictionaryValue value;
            stream >> value;
            us.m_dictionaryData->insert(make_pair(key, value));
        }
        return stream;
    }

    template <typename T>
    vector<DictionaryValue> SerializeToVector(const NDArrayViewPtr& viewPtr)
    {
        if (viewPtr->IsSparse())
        {
            LogicError("Sparse NDArrayView cannot be serialized into a vector.");
        }

        auto numElements = viewPtr->Shape().TotalSize();

        vector<DictionaryValue> values(numElements);

        NDArrayViewPtr cpuDataViewPtr = viewPtr;
        if ((viewPtr->Device().Type() != DeviceKind::CPU))
        {
            cpuDataViewPtr = MakeSharedObject<NDArrayView>(viewPtr->GetDataType(), viewPtr->Shape(), DeviceDescriptor::CPUDevice());
            cpuDataViewPtr->CopyFrom(*viewPtr);
        }

        const T* buffer = cpuDataViewPtr->DataBuffer<T>();
        for (size_t i = 0; i < numElements; ++i)
        {
            T v = buffer[i];
            values[i] = DictionaryValue(v);
        }

        return values;
    }

    template <typename T>
    void DeserializeFromVector(const NDArrayViewPtr& viewPtr, const vector<DictionaryValue>& values)
    {
        if (viewPtr->IsSparse())
        {
            LogicError("Sparse NDArrayView cannot be deserialized from a vector.");
        }

        auto numElements = viewPtr->Shape().TotalSize();

        if (values.size() != numElements)
        {
            LogicError("Number of elements (%lu) in the deserialized representation does not match the expected value (%lu)",
                       values.size(), numElements);
        }

        NDArrayViewPtr cpuDataViewPtr = viewPtr;
        if ((viewPtr->Device().Type() != DeviceKind::CPU))
        {
            cpuDataViewPtr = MakeSharedObject<NDArrayView>(viewPtr->GetDataType(), viewPtr->Shape(), DeviceDescriptor::CPUDevice());
        }

        T* buffer = cpuDataViewPtr->WritableDataBuffer<T>();
        for (size_t i = 0; i < numElements; ++i)
        {
            buffer[i] = values[i].GetValue<T>();
        }

        if ((viewPtr->Device().Type() != DeviceKind::CPU))
        {
            viewPtr->CopyFrom(*cpuDataViewPtr);
        }
    }

    // TODO: we store the type info for every element in the vector, which is extremely redundant.
    // Instead, it'd be nice to introduce some sort of DictionaryValueVector.
    vector<DictionaryValue> SerializeToVector(const NDArrayViewPtr& viewPtr)
    {
        switch (viewPtr->GetDataType())
        {
        case DataType::Float:
            return SerializeToVector<float>(viewPtr);
        case DataType::Double:
            return SerializeToVector<double>(viewPtr);
        default:
            LogicError("Unsupported DataType %s", DataTypeName(viewPtr->GetDataType()));
        }
    }

    void DeserializeFromVector(const NDArrayViewPtr& viewPtr, const vector<DictionaryValue>& values)
    {
        switch (viewPtr->GetDataType())
        {
        case DataType::Float:
            DeserializeFromVector<float>(viewPtr, values);
            break;
        case DataType::Double:
            DeserializeFromVector<double>(viewPtr, values);
            break;
        default:
            LogicError("Unsupported DataType %s", DataTypeName(viewPtr->GetDataType()));
        }
    }
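
    // Round-trip sketch (illustrative): the two dispatchers above are intended to satisfy
    //
    //     auto values = SerializeToVector(view);  // dense NDArrayView -> vector<DictionaryValue>
    //     DeserializeFromVector(view, values);    // writes the same elements back
    //
    // for any dense float/double view; sparse views are rejected by both.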

    template void DictionaryValue::AllocateDataPtr<NDShape>(const NDShape& value);
    template void DictionaryValue::AllocateDataPtr<vector<DictionaryValue>>(const vector<DictionaryValue>& value);
    template void DictionaryValue::AllocateDataPtr<wstring>(const wstring& value);
    template void DictionaryValue::AllocateDataPtr<Dictionary>(const Dictionary& value);

    template void DictionaryValue::FreePtrAsType<NDShape>();
    template void DictionaryValue::FreePtrAsType<vector<DictionaryValue>>();
    template void DictionaryValue::FreePtrAsType<wstring>();
    template void DictionaryValue::FreePtrAsType<Dictionary>();
}
@ -9,251 +9,15 @@
|
|||
#include "CommonMatrix.h"
|
||||
#include "TensorShape.h"
|
||||
#include <string>
|
||||
#include "Config.h"
|
||||
#include "Reader.h"
|
||||
#include "ConvolutionEngine.h"
|
||||
|
||||
namespace CNTK
|
||||
{
|
||||
// Forward declarations
|
||||
class Dictionary;
|
||||
|
||||
class DictionaryValue
|
||||
{
|
||||
public:
|
||||
enum class Type : unsigned int
|
||||
{
|
||||
None,
|
||||
Bool,
|
||||
SizeT,
|
||||
Double,
|
||||
NDShape,
|
||||
Vector
|
||||
};
|
||||
|
||||
static const char* TypeName(Type type)
|
||||
{
|
||||
if (type == Type::None)
|
||||
return "None";
|
||||
else if (type == Type::Bool)
|
||||
return "Bool";
|
||||
else if (type == Type::SizeT)
|
||||
return "SizeT";
|
||||
else if (type == Type::Double)
|
||||
return "Double";
|
||||
else if (type == Type::NDShape)
|
||||
return "NDShape";
|
||||
else if (type == Type::Vector)
|
||||
return "Vector";
|
||||
else
|
||||
LogicError("Unknown DictionaryValue::Type");
|
||||
}
|
||||
|
||||
public:
|
||||
DictionaryValue()
|
||||
: m_valueType(Type::None)
|
||||
{
|
||||
}
|
||||
|
||||
DictionaryValue(bool value)
|
||||
: m_valueType(GetValueType<bool>())
|
||||
{
|
||||
m_data.m_boolean = value;
|
||||
}
|
||||
|
||||
DictionaryValue(size_t value)
|
||||
: m_valueType(GetValueType<size_t>())
|
||||
{
|
||||
m_data.m_sizeT = value;
|
||||
}
|
||||
|
||||
DictionaryValue(double value)
|
||||
: m_valueType(GetValueType<double>())
|
||||
{
|
||||
m_data.m_double = value;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
DictionaryValue(const T& value)
|
||||
: m_valueType(GetValueType<T>())
|
||||
{
|
||||
static_assert(std::is_same<T, NDShape>::value ||
|
||||
std::is_same<T, std::vector<DictionaryValue>>::value,
|
||||
"Unsupported ValueType");
|
||||
|
||||
AllocateDataPtr(value);
|
||||
}
|
||||
|
||||
DictionaryValue(const DictionaryValue& other)
|
||||
: m_valueType(Type::Bool)
|
||||
{
|
||||
// The m_valueType must hvae been set to a non-ptr type to prevent an attempt to interpret
|
||||
// the underlying underlying uninitialized value as a ptr and free it.
|
||||
*this = other;
|
||||
}
|
||||
|
||||
DictionaryValue& operator=(const DictionaryValue& other)
|
||||
{
|
||||
if (this != &other)
|
||||
{
|
||||
FreeDataPtr();
|
||||
|
||||
m_valueType = other.m_valueType;
|
||||
m_data = other.m_data;
|
||||
|
||||
if (other.m_valueType == Type::NDShape)
|
||||
AllocateDataPtr(other.GetValue<NDShape>());
|
            else if (other.m_valueType == Type::Vector)
                AllocateDataPtr(other.GetValue<std::vector<DictionaryValue>>());
        }

        return *this;
    }

    ~DictionaryValue()
    {
        FreeDataPtr();
    }

    template <typename T, typename std::enable_if<std::is_same<T, bool>::value>::type* = nullptr>
    const T& GetValue() const
    {
        VerifyType<T>();
        return m_data.m_boolean;
    }

    template <typename T, typename std::enable_if<std::is_same<T, size_t>::value>::type* = nullptr>
    const T& GetValue() const
    {
        VerifyType<T>();
        return m_data.m_sizeT;
    }

    template <typename T, typename std::enable_if<std::is_same<T, double>::value>::type* = nullptr>
    const T& GetValue() const
    {
        VerifyType<T>();
        return m_data.m_double;
    }

    template <typename T, typename std::enable_if<std::is_same<T, NDShape>::value || std::is_same<T, std::vector<DictionaryValue>>::value>::type* = nullptr>
    const T& GetValue() const
    {
        VerifyType<T>();
        return *(reinterpret_cast<T*>(m_data.m_ptr));
    }

    bool HasValue() const
    {
        return m_valueType != Type::None;
    }

    Type ValueType() const
    {
        return m_valueType;
    }

private:
    template <typename T>
    static Type GetValueType()
    {
        static_assert(std::is_same<T, bool>::value ||
                      std::is_same<T, size_t>::value ||
                      std::is_same<T, double>::value ||
                      std::is_same<T, NDShape>::value ||
                      std::is_same<T, std::vector<DictionaryValue>>::value ||
                      std::is_same<T, CNTK::Dictionary>::value,
                      "Unsupported ValueType");

        if (std::is_same<T, bool>::value)
            return Type::Bool;
        else if (std::is_same<T, size_t>::value)
            return Type::SizeT;
        else if (std::is_same<T, double>::value)
            return Type::Double;
        else if (std::is_same<T, NDShape>::value)
            return Type::NDShape;
        else if (std::is_same<T, std::vector<DictionaryValue>>::value)
            return Type::Vector;
    }

    template <typename T>
    void VerifyType() const
    {
        if (GetValueType<T>() != m_valueType)
            RuntimeError("Reading a DictionaryValue as the wrong type; Reading as type %s when actual type is %s", typeid(T).name(), DictionaryValue::TypeName(m_valueType));
    }

    template <typename T>
    void AllocateDataPtr(const T& value)
    {
        static_assert(std::is_same<T, NDShape>::value || std::is_same<T, std::vector<DictionaryValue>>::value, "AllocateDataPtr called with invalid type");
        m_data.m_ptr = new T(value);
    }

    template <typename T>
    void FreePtrAsType()
    {
        T* typedPtr = reinterpret_cast<T*>(m_data.m_ptr);
        delete typedPtr;

        m_data.m_ptr = nullptr;
    }

    void FreeDataPtr()
    {
        if (m_valueType == Type::NDShape)
            FreePtrAsType<NDShape>();
        else if (m_valueType == Type::Vector)
            FreePtrAsType<std::vector<DictionaryValue>>();
    }

private:
    Type m_valueType;

    union ValueData
    {
        bool m_boolean;
        size_t m_sizeT;
        double m_double;
        void* m_ptr;
    } m_data;
};

class Dictionary
{
public:
    Dictionary();
    ~Dictionary();

    // Disallow copy construction and assignment
    Dictionary(const Dictionary&) = delete; Dictionary& operator=(const Dictionary&) = delete;

    Dictionary(Dictionary&& other);
    Dictionary& operator=(Dictionary&& other);

    DictionaryValue& operator[](const std::wstring& key)
    {
        return operator[](key.c_str());
    }

    DictionaryValue& operator[](const wchar_t* key);

    DictionaryValue operator[](const std::wstring& key) const
    {
        return operator[](key.c_str());
    }

    DictionaryValue operator[](const wchar_t* key) const;

    bool Contains(const std::wstring& key) const
    {
        return Contains(key.c_str());
    }

    bool Contains(const wchar_t* key) const;

private:
    std::unordered_map<std::wstring, DictionaryValue>* m_dictionaryData;
};
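
// Illustrative usage sketch (not part of this commit): exercising the Dictionary
// and the tagged-union DictionaryValue declared above. DictionaryUsageSketch is a
// hypothetical example function, not a library addition; it assumes the implicit
// value constructors of DictionaryValue declared earlier in this header.
inline void DictionaryUsageSketch()
{
    Dictionary dict;
    dict[L"epochSize"] = DictionaryValue((size_t)4096); // tagged as Type::SizeT in the union
    dict[L"learningRate"] = DictionaryValue(0.05);      // tagged as Type::Double
    if (dict.Contains(L"epochSize"))
    {
        size_t epochSize = dict[L"epochSize"].GetValue<size_t>(); // VerifyType<size_t>() passes
        (void)epochSize;
        // dict[L"epochSize"].GetValue<double>() would hit RuntimeError: wrong tag.
    }
}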

// Helper to get the size of an element of the specified DataType
inline size_t ElementSize(DataType dataType)
{

@ -317,14 +81,53 @@ namespace CNTK
    LogicError("Unknown DataType");
}

inline Microsoft::MSR::CNTK::TensorShape AsTensorShape(const NDShape& viewShape)
inline NDShape AsNDShape(const Microsoft::MSR::CNTK::TensorShape& tensorShape)
{
    // The TensorShape should be flattenable to 1D
    for (size_t i = 1; i < tensorShape.GetRank(); ++i)
    {
        if (!tensorShape.CanFlatten(i))
            InvalidArgument("AsNDShape() can only be called for TensorShapes that can be flattened to 1D");
    }

    return std::vector<size_t>(tensorShape.GetDims().begin(), tensorShape.GetDims().end());
}

inline DataType AsDataType(Microsoft::MSR::CNTK::ElementType readerDataType)
{
    switch (readerDataType)
    {
    case Microsoft::MSR::CNTK::ElementType::tfloat:
        return DataType::Float;
    case Microsoft::MSR::CNTK::ElementType::tdouble:
        return DataType::Double;
    default:
        LogicError("Unsupported ElementType from CNTK Reader");
    }
}

inline StorageFormat AsStorageFormat(Microsoft::MSR::CNTK::StorageType readerStorageType)
{
    switch (readerStorageType)
    {
    case Microsoft::MSR::CNTK::StorageType::dense:
        return StorageFormat::Dense;
    case Microsoft::MSR::CNTK::StorageType::sparse_csc:
        return StorageFormat::SparseCSC;
    default:
        LogicError("Unsupported StorageType from CNTK Reader");
    }
}

inline Microsoft::MSR::CNTK::TensorShape AsTensorShape(const NDShape& viewShape, bool preserveRank = false)
{
    const size_t maxNumAxesSupportedByTensorView = 12;
    if (viewShape.NumAxes() > maxNumAxesSupportedByTensorView)
        LogicError("The number of requested axes exceeds the currently supported limit");

    // TensorShape is required to be at least 2D
    Microsoft::MSR::CNTK::SmallVector<size_t> tensorViewShape(std::max<size_t>(2, viewShape.NumAxes()));
    size_t minRankSize = preserveRank ? viewShape.NumAxes() : 2;
    Microsoft::MSR::CNTK::SmallVector<size_t> tensorViewShape(std::max<size_t>(minRankSize, viewShape.NumAxes()));
    for (size_t i = 0; i < tensorViewShape.size(); ++i)
        tensorViewShape[i] = (i < viewShape.NumAxes()) ? viewShape[i] : 1;
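
    // Worked example (illustration only): with viewShape = [5] and preserveRank = false,
    // minRankSize = 2, so tensorViewShape gets two axes and the loop pads with 1s,
    // yielding [5 x 1]; with preserveRank = true the same input keeps rank 1, i.e. [5].
    // A rank-3 viewShape such as [2 x 3 x 4] is copied through unchanged in either mode.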

@ -363,4 +166,151 @@ namespace CNTK
{
    return var.IsInput() && var.IsSparse();
}

std::vector<DictionaryValue> SerializeToVector(const NDArrayViewPtr& viewPtr);

void DeserializeFromVector(const NDArrayViewPtr& viewPtr, const std::vector<DictionaryValue>& values);

inline void AddIndentation(std::wstringstream& s, size_t numIndentationSpaces)
{
    for (size_t i = 0; i < numIndentationSpaces; ++i)
        s << L" ";
}

static const size_t perLevelIndentSize = 4;
inline void AddConfigString(std::wstringstream& s, const std::wstring& key, const DictionaryValue& value, size_t numIndentationSpaces);
inline void AddConfigString(std::wstringstream& s, const DictionaryValue& value, size_t numIndentationSpaces)
{
    switch (value.ValueType())
    {
    case DictionaryValue::Type::Bool:
        s << value.GetValue<bool>();
        break;
    case DictionaryValue::Type::Float:
        s << value.GetValue<float>();
        break;
    case DictionaryValue::Type::Double:
        s << value.GetValue<double>();
        break;
    case DictionaryValue::Type::String:
        s << value.GetValue<std::wstring>();
        break;
    case DictionaryValue::Type::SizeT:
        s << value.GetValue<size_t>();
        break;
    case DictionaryValue::Type::Vector:
    {
        const auto& valueVector = value.GetValue<std::vector<DictionaryValue>>();
        s << L"(" << std::endl;
        AddIndentation(s, numIndentationSpaces + perLevelIndentSize);
        bool isFirst = true;
        for (const auto& val : valueVector)
        {
            if (!isFirst)
                s << L":";
            else
                isFirst = false;

            AddConfigString(s, val, numIndentationSpaces + perLevelIndentSize);
        }
        AddIndentation(s, numIndentationSpaces);
        s << L")";
        break;
    }
    case DictionaryValue::Type::Dictionary:
    {
        const auto& valueDictionary = value.GetValue<Dictionary>();
        s << L"[" << std::endl;
        for (const auto& keyValuePair : *(valueDictionary.m_dictionaryData))
        {
            AddConfigString(s, keyValuePair.first, keyValuePair.second, numIndentationSpaces + perLevelIndentSize);
        }
        AddIndentation(s, numIndentationSpaces);
        s << L"]";
        break;
    }
    default:
        LogicError("Unsupported DictionaryValue type");
    }
}

inline void AddConfigString(std::wstringstream& s, const std::wstring& key, const DictionaryValue& value, size_t numIndentationSpaces)
{
    static const size_t perLevelIndentSize = 4;

    AddIndentation(s, numIndentationSpaces);
    s << key << L" = ";
    AddConfigString(s, value, numIndentationSpaces);
    s << std::endl;
}
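
// Illustrative sketch (not part of this commit): what the two AddConfigString()
// overloads above emit. Hypothetical usage, assuming the declarations in this header:
//
//     std::wstringstream s;
//     AddConfigString(s, L"minibatchSize", DictionaryValue((size_t)256), 0);
//     // s now holds the line:  minibatchSize = 256
//     // A Vector value renders as "(" elem ":" elem ... ")" and a Dictionary as
//     // "[" key = value ... "]", each nesting level indented by perLevelIndentSize.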

template <typename T>
inline std::vector<DictionaryValue> AsDictionaryValueVector(const std::vector<T>& basicElementTypeVector)
{
    static_assert(std::is_same<T, bool>::value ||
                  std::is_same<T, size_t>::value ||
                  std::is_same<T, float>::value ||
                  std::is_same<T, double>::value, "Unsupported ValueType");

    std::vector<DictionaryValue> dictionaryValueVector;
    for (auto value : basicElementTypeVector)
        dictionaryValueVector.push_back(value);

    return dictionaryValueVector;
}

template <typename T>
inline std::vector<T> AsBasicElementTypeVector(const std::vector<DictionaryValue>& dictionaryValueVector)
{
    static_assert(std::is_same<T, bool>::value ||
                  std::is_same<T, size_t>::value ||
                  std::is_same<T, float>::value ||
                  std::is_same<T, double>::value, "Unsupported ValueType");

    std::vector<T> basicElementTypeVector;
    for (auto value : dictionaryValueVector)
        basicElementTypeVector.push_back(value.GetValue<T>());

    return basicElementTypeVector;
}
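
// Illustrative round-trip sketch (not part of this commit), assuming the two
// helpers above:
//
//     std::vector<double> v = { 0.5, 1.5, 2.5 };
//     auto dv   = AsDictionaryValueVector(v);           // vector<DictionaryValue>
//     auto back = AsBasicElementTypeVector<double>(dv); // recovers { 0.5, 1.5, 2.5 }
//
// Reading back with a mismatched type, e.g. AsBasicElementTypeVector<size_t>(dv),
// would trip VerifyType() inside GetValue<T>() at runtime.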

inline PoolingType AsPoolingType(Microsoft::MSR::CNTK::PoolKind cntkPoolingKind)
{
    switch (cntkPoolingKind)
    {
    case Microsoft::MSR::CNTK::PoolKind::Average:
        return PoolingType::Average;
    case Microsoft::MSR::CNTK::PoolKind::Max:
        return PoolingType::Max;
    default:
        LogicError("Unknown pooling type");
    }
}

inline Microsoft::MSR::CNTK::PoolKind AsCNTKPoolKind(PoolingType poolingType)
{
    switch (poolingType)
    {
    case PoolingType::Average:
        return Microsoft::MSR::CNTK::PoolKind::Average;
    case PoolingType::Max:
        return Microsoft::MSR::CNTK::PoolKind::Max;
    default:
        LogicError("Unknown pooling type");
    }
}

inline std::pair<NDShape, NDShape> GetConvolutionOutputMapCountAndKernelShape(const NDShape& convolutionMapShape, const NDShape& operandShape)
{
    auto outputMapCount = convolutionMapShape.SubShape(0, convolutionMapShape.NumAxes() - operandShape.NumAxes());
    NDShape paddedOutputMapCount(operandShape.NumAxes(), 1);
    for (size_t i = 0; i < outputMapCount.NumAxes(); ++i)
        paddedOutputMapCount[paddedOutputMapCount.NumAxes() - 1 - i] = outputMapCount[outputMapCount.NumAxes() - 1 - i];
    //for (size_t i = 0; i < outputMapCount.NumAxes(); ++i)
    //    paddedOutputMapCount[i] = outputMapCount[i];

    NDShape kernelShape = convolutionMapShape.SubShape(outputMapCount.NumAxes());

    return{ paddedOutputMapCount, kernelShape };
}
}
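
// Worked example (illustration only): for convolutionMapShape = [64 x 3 x 5 x 5]
// and a rank-3 operandShape [W x H x 3], outputMapCount is the leading [64],
// paddedOutputMapCount right-aligns it into the operand's rank as [1 x 1 x 64],
// and kernelShape is the remaining [3 x 5 x 5].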

@ -84,9 +84,15 @@ __declspec_noreturn static inline void ThrowFormatted(const char* format, ...)

// RuntimeError - throw a std::runtime_error with a formatted error string
#ifndef _MSC_VER // gcc __attribute__((format(printf())) does not percolate through variadic templates; so must go the macro route
#ifndef RuntimeError
#define RuntimeError ThrowFormatted<std::runtime_error>
#endif
#ifndef LogicError
#define LogicError ThrowFormatted<std::logic_error>
#endif
#ifndef InvalidArgument
#define InvalidArgument ThrowFormatted<std::invalid_argument>
#endif
#else
template <class... _Types>
__declspec_noreturn static inline void RuntimeError(const char* format, _Types&&... _Args)

@ -127,13 +133,11 @@ static inline void Warning(const string& message)
    \
    {                                                                                                         \
        fprintf(stderr, "Inside File: %s Line: %d Function: %s -> Feature Not Implemented.\n", __FILE__, __LINE__, __FUNCTION__); \
        LogicError("Inside File: %s Line: %d Function: %s -> Feature Not Implemented.\n", __FILE__, __LINE__, __FUNCTION__); \
        LogicError("Inside File: %s Line: %d Function: %s -> Feature Not Implemented.", __FILE__, __LINE__, __FUNCTION__); \
    \
    }
#endif
}
}
}
}}}

#ifndef _MSC_VER
using Microsoft::MSR::CNTK::ThrowFormatted;

@ -579,6 +583,60 @@ struct nocase_compare
// random collection of stuff we needed at some place
// ----------------------------------------------------------------------------

// Array class
template <class T>
class ArrayRef
{
    T* elements; // Array of type T
    size_t count;

public:

    ArrayRef(T* elementsIn, size_t sizeIn)
    {
        elements = elementsIn;
        count = sizeIn;
    }

    // TODO: Copy Constructor
    ArrayRef(const ArrayRef& other) = delete;

    // TODO: Move Constructor
    ArrayRef(ArrayRef&& other) = delete;

    // TODO: Assignment operator
    ArrayRef& operator=(const ArrayRef& rhs) = delete;

    // TODO: Move assignment operator
    ArrayRef& operator=(ArrayRef&& rhs) = delete;

    size_t size() const { return count; }
    T* data() const { return elements; }

    T operator[](size_t i) const
    {
        if (i >= size())
            LogicError("ArrayRef: index overflow");
        return elements[i];
    }

    T& operator[](size_t i)
    {
        if (i >= count)
            LogicError("ArrayRef: index overflow");
        return elements[i];
    }

    const T* begin() const
    {
        return data();
    }
    const T* end() const
    {
        return data() + size();
    }
};
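
// Illustrative usage sketch (not part of this commit): viewing existing storage
// through the ArrayRef above. Hypothetical example function; assumes <vector> is
// available through this header's existing includes.
template <class T>
static size_t ArrayRefSumSketch(std::vector<T>& storage)
{
    ArrayRef<T> view(storage.data(), storage.size()); // non-owning view over the vector's buffer
    size_t sum = 0;
    for (const T* p = view.begin(); p != view.end(); ++p) // begin()/end() allow pointer iteration
        sum += (size_t)*p;
    if (view.size() > 0)
        view[0] = view[0]; // the non-const operator[] writes through to 'storage'
    return sum;
}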

// TODO: maybe change to type id of an actual thing we pass in
// TODO: is this header appropriate?
template <class C>

@ -988,11 +988,10 @@ public:
        return defaultValue;
    }

    ConfigValue Find(const std::string& name,
                     const char* defaultvalue = NULL) const
    // Look up a variable through the nested hierarchy. If not found, return false, and 'result' is untouched.
    bool TryFind(const std::string& name, ConfigValue& result, const char* defaultvalue = NULL) const
    {
        auto iter = find(name);
        ConfigValue result;

        // if we aren't found, or they want the default value
        // TODO: What the hell is this?

@ -1002,13 +1001,15 @@ public:
            if (iter == end() && m_parent != NULL)
            {
                result = m_parent->Find(name, defaultvalue);
                return true;
            }
            else if (defaultvalue != NULL)
            {
                // no parent, so use default value
                std::string fullName = m_configName + ":" + name;
                result = ConfigValue(defaultvalue, fullName, this);
            }
            return true;
        }
        }
        else
        {

@ -1016,10 +1017,19 @@ public:
            rhs = this->ResolveVariables(rhs);
            std::string fullName = m_configName + ":" + name;
            result = ConfigValue(rhs, fullName, this);
        }
        return result;
        return true;
        }
        return false; // not found
    }

    // Look up a variable using TryFind() above. If not found, return empty string.
    ConfigValue Find(const std::string& name, const char* defaultvalue = NULL) const
    {
        ConfigValue result;
        TryFind(name, result, defaultvalue); // (if returns false, we return an empty ConfigValue)
        return result;
    }
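
    // Illustrative sketch (not part of this commit): the difference between the
    // two lookups above, where 'config' is a hypothetical instance of this class:
    //
    //     ConfigValue v;
    //     if (config.TryFind("minibatchSize", v)) // found: 'v' is set, returns true
    //         /* use v */;
    //     else                                    // miss: returns false, 'v' untouched
    //         /* fall back */;
    //
    //     ConfigValue w = config.Find("minibatchSize"); // same lookup; a miss just
    //                                                   // yields an empty ConfigValue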

    // ResolveVariablesInSingleLine - In this method we replace all substrings of 'configLine' of the form "$varName$"
    // (where varName is a variable name), with the value of the "varName" variable in config.
    // We search up the config tree for the value, and we throw an error if we don't find it.

@ -1037,10 +1047,7 @@ public:
    {
        // ensure that this method was called on a single line (eg, no newline characters exist in 'configLine').
        if (configLine.find_first_of("\n") != std::string::npos)
        {
            LogicError(
                "\"ResolveVariablesInSingleLine\" shouldn't be called with a string containing a newline character");
        }
            LogicError("ResolveVariablesInSingleLine() should not be called with a string containing a newline character");

        std::string newConfigLine = StripComments(configLine);
        std::size_t start = newConfigLine.find_first_of(openBraceVar);

@ -1073,27 +1080,25 @@ public:
            // in nested dictionaries, this is not working.
            if (varName.empty())
            {
                RuntimeError("$$ is not allowed. Parsing of string failed: %s:%s",
                RuntimeError("$$ is not allowed. Parsing of string failed: %s:%s",
                             m_configName.c_str(),
                             newConfigLine.c_str());
            }

            // Note that this call to "Find" can trigger further substitutions of the form $varName2$ -> varValue2,
            // thus making this search process recursive.
            std::string varValue = this->Find(varName);

            if (varValue.empty())
            ConfigValue varConfigValue;
            const bool foundValue = this->TryFind(varName, varConfigValue);
            if (!foundValue)
            {
                RuntimeError("No variable found with the name %s. Parsing of string failed: %s:%s",
                RuntimeError("No variable found with the name %s. Parsing of string failed: %s:%s",
                             varName.c_str(), m_configName.c_str(),
                             newConfigLine.c_str());
            }

            if (varValue.find_first_of("\n") != std::string::npos)
            {
                LogicError(
                    "Newline character cannot be contained in the value of a variable which is resolved using $varName$ feature");
            }
            std::string varValue = varConfigValue;
            if (varValue.find_first_of("\n") != std::string::npos)
                LogicError("Newline characters are not allowed in the value of a variable which is resolved using $varName$ feature");

            // Replace $varName$ with 'varValue'. Then continue the search for
            // other variables in 'newConfigLine' string, starting at the point

@ -282,7 +282,7 @@ class VariableSchema : public std::vector<VariableLayout>
    Values<ElemType> CreateBuffers(const std::vector<size_t>& maxLengths)
    {
        if (maxLengths.size() != size())
            throw std::exception("Expected max lengths for all variables.");
            throw std::runtime_error("Expected max lengths for all variables.");

        Values<ElemType> buffers(size());
        for (size_t i = 0; i < size(); ++i)

@ -134,4 +134,5 @@ public:
        return randomizationrange == randomizeDisable;
    }
};
} } }

}}}

@ -29,7 +29,8 @@ public:
        runtime_error(msg)
    {
    }
    virtual void PrintError(const std::wstring& linePrefix) const = 0;
    virtual std::wstring GetError(const std::wstring& /*linePrefix*/) const = 0;
    virtual void PrintError(const std::wstring& /*linePrefix*/) const = 0;
};

// -----------------------------------------------------------------------

@ -619,9 +620,9 @@ public:
    {
    }
    // ConfigArray(ConfigValuePtr && val) : firstIndex(0), values(std::vector<ConfigValuePtr>{ move(val) }) { }
    pair<int, int> GetIndexRange() const
    pair<int, int> GetIndexBeginEnd() const
    {
        return make_pair(firstIndex, firstIndex + (int) values.size() - 1);
        return make_pair(firstIndex, firstIndex + (int)values.size());
    }
    // for use as a plain array: get size and verify that index range starts with 0
    template <typename FAILFN>
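
    // Worked example (illustration only): for a 3-element array with firstIndex = 0,
    // the old GetIndexRange() returned the inclusive pair (0, 2) and callers iterated
    // with i <= range.second; GetIndexBeginEnd() returns the half-open pair (0, 3)
    // and callers iterate with i < range.second, matching C++ begin/end convention.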

@ -411,7 +411,7 @@ static inline void byteswap(V &v) throw()

// execute a block with retry
// Block must be restartable.
// Use this when writing small files to those unreliable Windows servers.
// Use this when writing/reading small files to those unreliable Windows servers.
// TODO: This will fail to compile under VS 2008--we need an #ifdef around this
template <typename FUNCTION>
static void attempt(int retries, const FUNCTION &body)

@ -592,7 +592,8 @@ void fgetfile(const std::wstring& pathname, std::vector<char>& buffer);
void fgetfile(FILE* f, std::vector<char>& buffer);
namespace msra { namespace files {

void fgetfilelines(const std::wstring& pathname, std::vector<char>& readbuffer, std::vector<std::string>& lines);
void fgetfilelines(const std::wstring& pathname, std::vector<char>& readbuffer, std::vector<std::string>& lines, int numberOfTries = 1);

static inline std::vector<std::string> fgetfilelines(const std::wstring& pathname)
{
    std::vector<char> buffer;

@ -600,7 +601,7 @@ static inline std::vector<std::string> fgetfilelines(const std::wstring& pathnam
    fgetfilelines(pathname, buffer, lines);
    return lines;
}
std::vector<char*> fgetfilelines(const std::wstring& pathname, std::vector<char>& readbuffer);
std::vector<char*> fgetfilelines(const std::wstring& pathname, std::vector<char>& readbuffer, int numberOfTries = 1);

}}

@ -1251,7 +1251,7 @@ public:
    // BUGBUG: we only really support one archive file at this point
    // read the TOC in one swoop
    std::vector<char> textbuffer;
    auto toclines = msra::files::fgetfilelines(tocpath, textbuffer);
    auto toclines = msra::files::fgetfilelines(tocpath, textbuffer, 3);

    // parse it one by one
    size_t archiveindex = SIZE_MAX; // its index

@ -16,6 +16,7 @@
#endif
#define _CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES 1
#include "Basics.h"
#include "basetypes.h" // for attempt()
#include "fileutil.h"
#include "ProgressTracing.h"

@ -1632,6 +1633,11 @@ static size_t fgetfilechars(const std::wstring& path, vector<char>& buffer)
    return len;
}

static void fgetfilechars(const std::wstring& path, vector<char>& buffer, size_t& len)
{
    len = fgetfilechars(path, buffer);
}

template <class LINES>
static void strtoklines(char* s, LINES& lines)
{

@ -1639,10 +1645,14 @@ static void strtoklines(char* s, LINES& lines)
    lines.push_back(p);
}

void msra::files::fgetfilelines(const std::wstring& path, vector<char>& buffer, std::vector<std::string>& lines)
void msra::files::fgetfilelines(const std::wstring& path, vector<char>& buffer, std::vector<std::string>& lines, int numberOfTries)
{
    // load it into RAM in one huge chunk
    const size_t len = fgetfilechars(path, buffer);
    size_t len = 0;
    msra::util::attempt(numberOfTries, [&]() // (can be reading from network)
    {
        // load it into RAM in one huge chunk
        fgetfilechars(path, buffer, len);
    });

    // parse into lines
    lines.resize(0);

@ -1651,11 +1661,15 @@ void msra::files::fgetfilelines(const std::wstring& path, vector<char>& buffer,
}

// same as above but returning const char* (avoiding the memory allocation)
vector<char*> msra::files::fgetfilelines(const wstring& path, vector<char>& buffer)
vector<char*> msra::files::fgetfilelines(const wstring& path, vector<char>& buffer, int numberOfTries)
{
    // load it into RAM in one huge chunk
    const size_t len = fgetfilechars(path, buffer);

    size_t len = 0;
    msra::util::attempt(numberOfTries, [&]() // (can be reading from network)
    {
        // load it into RAM in one huge chunk
        fgetfilechars(path, buffer, len);
    });

    // parse into lines
    vector<char*> lines;
    lines.reserve(len / 20);
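
// Illustrative sketch (not part of this commit): the retry pattern used above.
// msra::util::attempt() re-runs the lambda up to the given number of tries, so
// any restartable read can be wrapped the same way (ReadWholeFile is a
// hypothetical stand-in, not an existing helper):
//
//     msra::util::attempt(3, [&]() // retry transient network-share failures
//     {
//         ReadWholeFile(path, buffer); // must be safe to re-run from scratch
//     });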

@ -18,6 +18,7 @@
#include "PreComputeNodes.h"
#include "EvaluationNodes.h"
#include "SpecialPurposeNodes.h"
#include "DeprecatedNodes.h" // (for SaveToDbnFile(), which is also deprecated)
#include "MPIWrapper.h" // TODO: does not belong here
#include <string>
#include <vector>

@ -391,13 +392,38 @@ void ComputationNetwork::Read(const wstring& fileName)
// node construction
// -----------------------------------------------------------------------

// non-static version needed because it accesses m_randomSeedOffset
// Excessively used by SimpleNetworkBuilder, but always after CreateLearnableParameter(), so we should really absorb it there
template <class ElemType>
void ComputationNetwork::InitLearnableParameters(const ComputationNodeBasePtr& node, const bool uniformInit, const unsigned long randomSeed, const ElemType initValueScale, bool initOnCPUOnly)
// helper of InitLearnableParameters()
// Note: This should really be done through an interface without <ElemType> that LearnableParameter would derive from.
// However, this is only for NDL (which is deprecated), so I'd rather not pollute the code with more interfaces just for a deprecated cause.
template<class ElemType>
static bool TryPostInitParameters(const ComputationNodeBasePtr& node, const wchar_t* initString, double initValue, unsigned long randomSeed, bool initOnCPUOnly)
{
    auto learnableParameterNode = dynamic_pointer_cast<LearnableParameter<ElemType>>(node);
    learnableParameterNode->InitRandom(uniformInit, randomSeed + GetRandomSeedOffset(), initValueScale, initOnCPUOnly);
    if (!learnableParameterNode)
        return false;
    learnableParameterNode->PostInitParameters(initString, (ElemType) initValue, randomSeed, initOnCPUOnly);
    return true;
}

// non-static version needed because it accesses m_randomSeedOffset
void ComputationNetwork::InitLearnableParameters(const ComputationNodeBasePtr& node,
                                                 const wchar_t* initString, // "uniform"|"gaussian"|"fixedValue"
                                                 double initValue,          //  scale   |  scale   |  value
                                                 unsigned long randomSeed /*= 0*/,
                                                 bool initOnCPUOnly /*= false*/) const
{
    randomSeed += GetRandomSeedOffset();
    if (TryPostInitParameters<float> (node, initString, initValue, randomSeed, initOnCPUOnly) ||
        TryPostInitParameters<double>(node, initString, initValue, randomSeed, initOnCPUOnly))
        return;
    LogicError("InitLearnableParameters: Input node is not a LearnableParameter<float or double>");
}

// non-static version needed because it accesses m_randomSeedOffset
// Legacy version that is for random only.
void ComputationNetwork::RandomInitLearnableParameters(const ComputationNodeBasePtr& node, const bool uniformInit, const unsigned long randomSeed, const double initValueScale, bool initOnCPUOnly) const
{
    InitLearnableParameters(node, uniformInit ? L"uniform" : L"gaussian", initValueScale, randomSeed, initOnCPUOnly);
}
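
// Illustrative usage sketch (not part of this commit): hypothetical call sites
// for the string-based initialization entry point defined above.
//
//     net->InitLearnableParameters(node, L"uniform",    0.1, /*randomSeed=*/1); // uniform,  scale 0.1
//     net->InitLearnableParameters(node, L"gaussian",   0.2, /*randomSeed=*/1); // gaussian, scale 0.2
//     net->InitLearnableParameters(node, L"fixedValue", 0.0);                   // constant value 0
//
// RandomInitLearnableParameters() above merely maps the legacy bool flag onto
// the "uniform"/"gaussian" strings.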

bool ComputationNetwork::IsTypicalCriterionNode(ComputationNodeBasePtr nodePtr)

@ -714,35 +740,22 @@ void ComputationNetwork::DescribeNetworkUsingDot(list<ComputationArc>& arcs,

    File fstream(outFile, FileOptions::fileOptionsText | FileOptions::fileOptionsWrite);

    // get precompute node
    vector<ComputationNodeBasePtr> PreComputedNodes;
    vector<ComputationNodeBasePtr> preComputedNodes;
    vector<ComputationNodeBasePtr> pastValueNodes;
    vector<ComputationNodeBasePtr> futureValueNodes;
    vector<ComputationNodeBasePtr> learnableParameters;
    vector<ComputationNodeBasePtr> allnodes = GetAllNodes();
    for (const auto& n : allnodes)
    {
        if (n->RequiresPreCompute())
            PreComputedNodes.push_back(n);
    }
            preComputedNodes.push_back(n);

    // get PastValue node
    vector<ComputationNodeBasePtr> pastValueNodes;
    for (const auto& n : allnodes)
    {
        if (n->OperationName() == OperationNameOf(PastValueNode) || n->OperationName() == L"Delay")
        const auto operationName = n->OperationName();
        if (operationName == OperationNameOf(PastValueNode) || operationName == L"Delay"/*legacy*/)
            pastValueNodes.push_back(n);
    }

    // get FutureValue node
    vector<ComputationNodeBasePtr> futureValueNodes;
    for (const auto& n : allnodes)
    {
        if (n->OperationName() == OperationNameOf(FutureValueNode))
        else if (operationName == OperationNameOf(FutureValueNode))
            futureValueNodes.push_back(n);
    }
    // get learnableParameters
    vector<ComputationNodeBasePtr> learnableParameters;
    for (const auto& n : allnodes)
    {
        if (n->OperationName() == OperationNameOf(LearnableParameter))
        else if (operationName == OperationNameOf(LearnableParameter))
            learnableParameters.push_back(n);
    }

@ -763,7 +776,7 @@ void ComputationNetwork::DescribeNetworkUsingDot(list<ComputationArc>& arcs,
    // criteria
    fstream << FormSpecialNodes(dotcfg.m_CriteriaStyle, m_criterionNodes);
    // pre-compute nodes
    fstream << FormSpecialNodes(dotcfg.m_PrecomputingNodeStyle, PreComputedNodes);
    fstream << FormSpecialNodes(dotcfg.m_PrecomputingNodeStyle, preComputedNodes);
    // PastValue nodes
    fstream << FormSpecialNodes(dotcfg.m_pastValueNodeStyle, pastValueNodes);
    // FutureValue nodes

@ -1062,10 +1075,12 @@ void ComputationNetwork::PerformSVDecomposition(const map<wstring, float>& SVDCo
    wstring rightChildName = name + L"_V";
    shared_ptr<ComputationNode<ElemType>> pLeft = AddNodeToNetWithElemType(New<LearnableParameter<ElemType>>(m_deviceId, leftChildName, m, r));
    shared_ptr<ComputationNode<ElemType>> pRight = AddNodeToNetWithElemType(New<LearnableParameter<ElemType>>(m_deviceId, rightChildName, r, n));
    InitLearnableParameters(pLeft, L"fixedValue", 0); // follow the protocol; otherwise deferred initialization will overwrite the SVD values in validation
    InitLearnableParameters(pRight, L"fixedValue", 0);

    // TODO: We should be able to move instead of copy but it currently isn't straightforward
    // due to redU and redVT being slices
    pLeft->ValueAsMatrix() = redU.DeepClone();
    pLeft->ValueAsMatrix()  = redU.DeepClone();
    pRight->ValueAsMatrix() = redVT.DeepClone();

    // Step 3. Change the network hierarchy to include the SVD nodes

@ -1111,7 +1126,7 @@ public:
    ~DbnLayer() {};
};

// Save network in the format of the Microsoft-internal legacy "DBN.exe" tool (this function is not useful outside of Microsoft)
// Save network in the format of the Microsoft-internal legacy "DBN.exe" tool (this function is not useful outside of Microsoft).
template <class ElemType>
void ComputationNetwork::SaveToDbnFile(ComputationNetworkPtr net, const std::wstring& fileName) const
{

@ -1463,7 +1478,6 @@ void ComputationNetwork::SaveToDbnFile(ComputationNetworkPtr net, const std::wst
    PutTag("EDBN");
}

template void ComputationNetwork::InitLearnableParameters<float>(const ComputationNodeBasePtr& node, const bool uniformInit, const unsigned long randomSeed, const float initValueScale, bool initOnCPUOnly);
template void ComputationNetwork::Read<float>(const wstring& fileName);
template void ComputationNetwork::ReadPersistableParameters<float>(File& fstream, bool create);
template void ComputationNetwork::PerformSVDecomposition<float>(const map<wstring, float>& SVDConfig, size_t alignedsize);

@ -1473,7 +1487,6 @@ template void ComputationNetwork::SetSeqParam<float>(ComputationNetworkPtr net,
                                                     const double& amf, const double& lmf, const double& wp, const double& bMMIfactor, const bool& sMBR);
template void ComputationNetwork::SaveToDbnFile<float>(ComputationNetworkPtr net, const std::wstring& fileName) const;

template void ComputationNetwork::InitLearnableParameters<double>(const ComputationNodeBasePtr& node, const bool uniformInit, const unsigned long randomSeed, const double initValueScale, bool initOnCPUOnly);
template void ComputationNetwork::Read<double>(const wstring& fileName);
template void ComputationNetwork::ReadPersistableParameters<double>(File& fstream, bool create);
template void ComputationNetwork::PerformSVDecomposition<double>(const map<wstring, float>& SVDConfig, size_t alignedsize);

@ -332,14 +332,15 @@ public:
    // node construction
    // -----------------------------------------------------------------------

    // non-static version needed because it accesses m_randomSeedOffset
    // Excessively used by SimpleNetworkBuilder, but always after CreateLearnableParameter(), so we should really absorb it there
    template <class ElemType>
    // this function is only for use by NDL (deprecated)
    void InitLearnableParameters(const ComputationNodeBasePtr& node,
                                 const bool uniformInit,
                                 const unsigned long randomSeed,
                                 const ElemType initValueScale,
                                 bool initOnCPUOnly = false);
                                 const wchar_t* initString, // "uniform"|"gaussian"|"fixedValue"
                                 double initValue,          //  scale   |  scale   |  value
                                 unsigned long randomSeed = 0,
                                 bool initOnCPUOnly = false) const;
    // non-static version needed because it accesses m_randomSeedOffset
    // Legacy version that is for random only.
    void RandomInitLearnableParameters(const ComputationNodeBasePtr& node, const bool uniformInit, const unsigned long randomSeed, const double initValueScale, bool initOnCPUOnly = false) const;

    template <typename N>
    static shared_ptr<N> AsNodePtr(const ComputationNodeBasePtr& inode)

@ -522,6 +523,8 @@ public:
    }


    const std::vector<ComputationNodeBasePtr>& RootNodes() const { return m_allRoots; }

    // these are specified as such by the user
    const std::vector<ComputationNodeBasePtr>& FeatureNodes() const { return m_featureNodes; }
    const std::vector<ComputationNodeBasePtr>& LabelNodes() const { return m_labelNodes; }

@ -751,7 +754,7 @@ public:
    while (!result.second/*if already there*/ && result.first->second != node)
    {
        if (!makeUniqueName || node->NodeName().find_first_of(L".[]") == wstring::npos)
            RuntimeError("AddNodeToNetIfNotYet: Duplicated name for %ls %ls operation.", node->NodeName().c_str(), node->OperationName().c_str());
            RuntimeError("AddNodeToNetIfNotYet: Duplicated name for %ls %ls operation (%d vs. %d).", node->NodeName().c_str(), node->OperationName().c_str(), (int)node->m_uniqueNumericId, (int)result.first->second->m_uniqueNumericId);
        node->SetName(L"_" + node->NodeName());
        result = m_nameToNodeMap.insert(make_pair(node->NodeName(), node));
    }

@ -1034,7 +1037,7 @@ public:
    // data members
    // -----------------------------------------------------------------------

    unsigned long GetRandomSeedOffset()
    unsigned long GetRandomSeedOffset() const
    {
        return m_randomSeedOffset;
    }

@ -106,13 +106,13 @@ void ComputationNetwork::FormRecurrentLoops(const ComputationNodeBasePtr& rootNo
        assert(node->m_numNonDelayedParentsInLoop == 0); // (in PurgeStateForFormingRecurrentLoops())
    }
    for (let& node : nestedNodes)
    {
    {
        for (auto& input : node->GetInputs())
        {
        {
            if (input->m_loopId == node->m_loopId && GetRecurrenceSteppingDirection(node) == 0/*not a Delay node*/)
                input->m_numNonDelayedParentsInLoop++; // count #parents of 'input' that are not delay nodes
        }
    }
    }

    // re-traverse the graph for all nestedNodes, starting with the first
    // Then update m_nestedNodes with the re-traversed order.

@ -76,7 +76,7 @@ void ComputationNetwork::CopySubTree(const ComputationNetwork& fromNet,

    ComputationNodeBasePtr fromRoot = fromNet.GetNodeFromName(fromName);

    for (const auto& fromNode : GetEvalOrder(fromRoot)) // BUGBUG: This probably will fail because the precomputed eval orders are invalid at this point.
    for (const auto& fromNode : fromNet.GetEvalOrder(fromRoot)) // BUGBUG: This probably will fail because the precomputed eval orders are invalid at this point.
    {
        wstring fromNodeName = fromNode->NodeName();
        wstring toNodeName = toNamePrefix + fromNodeName;

@ -885,9 +885,9 @@ void ComputationNetwork::AllocateAllMatrices(const std::vector<ComputationNodeBa
    if (performingBackPropagation)
    {
        if (outputValueNeededDuringBackProp.find(input) == outputValueNeededDuringBackProp.end())
            outputValueNeededDuringBackProp[input] = input->OutputUsedInComputingInputNodesGradients();
            outputValueNeededDuringBackProp[input] = input->NeedsGradient() && input->OutputUsedInComputingInputNodesGradients();

        outputValueNeededDuringBackProp[input] |= node->InputUsedInComputingInputNodesGradients(i);
        outputValueNeededDuringBackProp[input] |= (node->NeedsGradient() && node->InputUsedInComputingInputNodesGradients(i));
    }
    else
    {

@ -1,21 +1,12 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup>
    <ClCompile Include="..\Common\File.cpp">
      <Filter>Common</Filter>
    </ClCompile>
    <ClCompile Include="..\Common\fileutil.cpp">
      <Filter>Common</Filter>
    </ClCompile>
    <ClCompile Include="ComputationNode.cpp">
      <Filter>Nodes</Filter>
    </ClCompile>
    <ClCompile Include="stdafx.cpp">
      <Filter>Misc</Filter>
    </ClCompile>
    <ClCompile Include="..\Common\TimerUtility.cpp">
      <Filter>Common</Filter>
    </ClCompile>
    <ClCompile Include="..\Common\BestGpu.cpp">
      <Filter>GPU Interfacing</Filter>
    </ClCompile>

@ -380,4 +380,295 @@ public:

ScriptableObjects::ConfigurableRuntimeTypeRegister::Add<ComputationNetworkWithEdits> registerComputationNetworkWithEdits(L"ComputationNetworkWithEdits");

// ===================================================================
// CloneFunctionConfigLambda -- lambda to produce a clone of a network
//  - creates a BrainScript function that carbon-copies a subsection of an existing network
//  - the copy can be shallow or deep, where a deep copy gets its own copy of LearnableParameters
//     - a shallow copy (parameters="shared") is a copy of all nodes that depend on the specified input(s),
//       while all other nodes are shared from the original network section
//     - a deep copy (parameters="learnable" or "constant") also copies all reachable LearnableParameters and their dependents
//     - Input() nodes not listed as `inputNodes` are always shared
//  - the source network may be a different network, e.g. loaded with BS.Network.Load()
//  - a deep copy can be read-only (parameters="constant")
//     - Note: multiple uses of the lambda will not share read-only parameters. This is trickier to implement than one might expect.
//  - example use cases:
//     - adaptation (KL): a frozen read-only copy of the starting model is used as a KL-regularizer
//     - adaptation (DLR): an injected input transform is trained while the network is fixed
//     - image: lower layers of ImageNet networks serve as immutable feature extractors for another image task
//     - DSSM: applying the same network subsection to two inputs
// Usage:
//     f = CloneFunction (inputNodes, outputNodes, parameters="learnable" /*|"constant"|"shared"*/)
// Parameters:
//  - inputNodes: single node or array of nodes that will become parameters of the function.
//    Commonly, this list will include all Input()s that the outputNode(s) depend on.
//  - outputNodes: single node or dictionary of nodes that the function will emit
// Example:
//  # create a BS function by copying a piece of network
//  net = CloneFunction (network.features, network.logP)
//  # apply the copy to a new input
//  out = net (myFeatures)
//  # This will create a copy of the subsection from network.features to network.logP
//  # where all links to network.features get replaced by links to myFeatures.
// Example with multiple input and output nodes:
//  # create a BS function by copying a piece of network
//  # This specific example converts a network back into a BrainScript function.
//  # It passes two input nodes --> the BS function will have 2 inputs;
//  # and it passes a record of output nodes --> the BS function will return a record with the same member names
//  network = BS.Network.Load ("some.dnn")
//  net = CloneFunction ((network.features:network.labels), [ ce = network.ce ; errs = network.errs ])
//  # create a network from the BS function
//  features = Input (13)
//  labels = Input (42)
//  out = net (features, labels)
//  criterionNodes = (out.ce)
//  evaluationNodes = (out.errs)
// A specific example: Adapting a network, while using the original network as a regularizer (KLD)
//  # load network
//  network = BS.Network.Load ("some.dnn")
//  # create a trainable clone and a read-only reference clone
//  adaptNet = CloneFunction (network.features, [ z = network.z ], readOnly=false)
//  # create a read-only clone
//  refNet = CloneFunction (network.features, [ z = network.z ], readOnly=true)
//  # create the main network
//  features = Input (42)
//  labels = Input (9000)
//  z = adaptNet (features).z
//  zRef = refNet (features).z
//  # training criterion
//  refWeight = 0.9
//  kldLabels = labels * (1-refWeight) + Softmax (zRef) * refWeight # interpolate with ref output
//  ce = CrossEntropyWithSoftmax (z, kldLabels)
//  errs = ErrorPrediction (z, labels)
//  criterionNodes = (ce)
//  evaluationNodes = (errs)
// ===================================================================

class CloneFunctionConfigLambda : public ConfigLambda
{
    // how we treat the parameters in the clone
    enum class ParameterTreatment
    {
        learnable, // parameters are copied and kept trainable
        constant,  // parameters are copied and made immutable (e.g. for use of this as a fixed feature extractor)
        shared     // parameters are shared with where they came from (e.g. for parallel identical paths through a network)
    };
public:
    // -----------------------------------------------------------------------
    // construction
    // -----------------------------------------------------------------------

    // Executing this function from BrainScript merely sets up a lambda, but does not actually create any clone.
    // This is so that the function can be called multiple times in order to create multiple clones.
    CloneFunctionConfigLambda(const IConfigRecordPtr configp) :
        ConfigLambda(CreateParamNames(*configp), NamedParams(), [this](vector<ConfigValuePtr> &&args, NamedParams &&namedArgs, const std::wstring &exprName){ return this->DoClone(args, exprName); })
    {
        let& config = *configp;
        // input nodes
        inputNodes = GetInputNodes(config);
        // output nodes
        let outputNodesParam = config[L"outputNodes"]; // can be a node or a record
        if (outputNodesParam.Is<ComputationNodeBase>()) // scalar case: result is a single node
            outputNodes[L""] = outputNodesParam.AsPtr<ComputationNodeBase>(); // indicated by a "" node name in outputNodes[]
        else // multi-valued case: result is a record of nodes
        {
            let& outputNodesRecord = outputNodesParam.AsRef<IConfigRecord>();
            for (let& nodeName : outputNodesRecord.GetMemberIds())
                outputNodes[nodeName] = outputNodesRecord[nodeName].AsPtr<ComputationNodeBase>();
            if (outputNodes.empty())
                InvalidArgument("CloneFunction: At least one output node must be specified.");
        }
        // treatment of parameters
        wstring parametersOption = config[L"parameters"];
        if (parametersOption == L"learnable") parameterTreatment = ParameterTreatment::learnable;
        else if (parametersOption == L"constant") parameterTreatment = ParameterTreatment::constant;
        else if (parametersOption == L"shared") parameterTreatment = ParameterTreatment::shared;
        else InvalidArgument("CloneFunction: 'parameters' option must be 'learnable', 'constant', or 'shared'.");

        // determine which nodes must be cloned
        //  - intersection of:
        //     - all indirect inputs of the specified outputs
        //     - all dependents of leaves
        //  - where leaves are:
        //     - specified inputs
        //     - unless parameters="shared": all parameters the specified outputs depend on

        // determine all indirect inputs of the specified outputs
        vector<ComputationNodeBasePtr> roots;
        for (let& outputNodeKV : outputNodes)
            roots.push_back(outputNodeKV.second);
        let allInputs = ComputationNodeBase::EnumerateNodes(roots);

        // take the chance to validate inputNodes
        let allInputsSet = set<ComputationNodeBasePtr>(allInputs.begin(), allInputs.end());
        for (let& input : inputNodes)
            if (allInputsSet.find(input) == allInputsSet.end())
                InvalidArgument("CloneFunction: No specified output depends on the specified input %ls.", input->NodeDescription().c_str());
        // TODO: Is this really always an error? Are there valid cases where one would over-specify possible input nodes, even if they are not used/needed?

        // determine all leaves and their dependents
        dependentSet = set<ComputationNodeBasePtr>(inputNodes.begin(), inputNodes.end()); // start with the specified inputs
        for (let& node : allInputs)
        {
            // add parameters that are to be cloned to dependent set
            if (parameterTreatment != ParameterTreatment::shared && node->Is<IFreezable>())
                dependentSet.insert(node);
            // if at least one input is in the dependent set then this node is, too
            else
                for (let& input : node->GetInputs())
                    if (dependentSet.find(input) != dependentSet.end())
                        dependentSet.insert(node);
        }

#if 0
        for (let& node : dependentSet)
            fprintf(stderr, "CloneFunction: cloning %ls\n", node->NodeDescription().c_str());
#endif

        // ensure none of the specified inputs reference back into the cloned set
        // The function we extract must be separable.
        for (let& input : inputNodes)
            for (let& node : ComputationNodeBase::EnumerateNodes(vector<ComputationNodeBasePtr>{input})) // check all indirect inputs of each specified input
            {
                let iter = dependentSet.find(input);
                if (iter != dependentSet.end() && *iter != input)
                    InvalidArgument("CloneFunction: specified function input %ls recursively depends on %ls inside the function.", input->NodeDescription().c_str(), node->NodeDescription().c_str());
            }
    }

private:
    // get the input nodes from the config
    static vector<ComputationNodeBasePtr> GetInputNodes(const IConfigRecord& config)
    {
        return ScriptableObjects::ConfigArray::FlattenedVectorFrom<ComputationNodeBasePtr>(config[L"inputNodes"]);
    }
    // create an array of parameter names for all inputs
    // These names are never actually used, but required by the ConfigLambda constructor, and maybe useful for debugging.
    static vector<wstring> CreateParamNames(const IConfigRecord& config)
    {
        let inputNodes = GetInputNodes(config);
        vector<wstring> paramNames(inputNodes.size());
        for (size_t i = 0; i < paramNames.size(); i++)
            paramNames[i] = msra::strfun::wstrprintf(L"input_%d", (int)i);
        return paramNames;
    }

private:
    // -----------------------------------------------------------------------
    // the cloning operation itself
    // -----------------------------------------------------------------------

    // execute the lambda
    // This will clone all nodes that the outputNodes depend on, and rewire inputs matching inputNodes to inputArgs.
    ConfigValuePtr DoClone(const vector<ConfigValuePtr>& inputValues, const std::wstring& exprName)
    {
        // resolve the input arguments
        vector<ComputationNodeBasePtr> inputs;
        for (let& inputValue : inputValues)
            inputs.push_back(inputValue.ResolveValue());
        assert(inputValues.size() == inputNodes.size()); // (this should have been checked by BrainScript)

        // do some logging
        fprintf(stderr, "CloneFunction: ");
        for (size_t i = 0; i < inputs.size(); i++)
            fprintf(stderr, "%s%ls : %ls", i == 0 ? "(" : ", ", inputs[i]->NodeName().c_str(), inputs[i]->OperationName().c_str());
        fprintf(stderr, ") -> ");
        let singleOutput = outputNodes.size() == 1 && outputNodes.begin()->first.empty();
        if (singleOutput)
            fprintf(stderr, "%ls\n", outputNodes.begin()->second->NodeDescription().c_str());
        else
        {
            fprintf(stderr, "[\n");
            for (let& outputNodesKV : outputNodes)
                fprintf(stderr, "    %ls = %ls : %ls\n", outputNodesKV.first.c_str(), outputNodesKV.second->NodeName().c_str(), outputNodesKV.second->OperationName().c_str());
            fprintf(stderr, "]\n");
        }

        // clone everything in the dependent set
        //  - specified inputs get mapped to actual parameters
        //  - all others get duplicated
        // Note that at this point, the "shared" option has already been considered,
        // and is reflected in whether parameters are included or not in 'dependentSet'.
        map<ComputationNodeBasePtr, ComputationNodeBasePtr> clonedNodes;
        size_t numCloned = 0;
        for (size_t i = 0; i < inputNodes.size(); i++)
            clonedNodes[inputNodes[i]] = inputs[i];
        for (let& node : dependentSet)
        {
            // if already there then it's an input that we just mapped above
            if (clonedNodes.find(node) != clonedNodes.end())
                continue;
            // clone
            ComputationNodeBasePtr newNode;
            let newName = exprName + L"." + node->GetName();
            newNode = node->Duplicate(newName, CopyNodeFlags::copyNodeAll);
            // make it read-only if desired
            if (parameterTreatment == ParameterTreatment::constant && newNode->Is<IFreezable>())
                newNode->As<IFreezable>()->FreezeParameters();
            // and that's our cloned node
            clonedNodes[node] = newNode;
            numCloned++;
        }
#if 0
        for (let& nodeKV : clonedNodes)
            fprintf(stderr, "CloneFunction: cloning %ls -> %ls (%d -> %d)\n", nodeKV.first->NodeDescription().c_str(), nodeKV.second->NodeDescription().c_str(), (int)nodeKV.first->m_uniqueNumericId, (int)nodeKV.second->m_uniqueNumericId);
#endif

        // all cloned nodes' inputs must be redirected if they reference a node that has been cloned as well
        size_t numRelinks = 0; // (statistics: how many inputs have we relinked?)
        for (let& clonedNodesKV : clonedNodes)
        {
            let& node = clonedNodesKV.second;
            let& inputs = node->GetInputs();
            for (size_t i = 0; i < inputs.size(); i++)
            {
                fprintf(stderr, "%ls.inputs[%d] = %ls (%d)", node->NodeName().c_str(), (int)i, inputs[i]->NodeName().c_str(), (int)inputs[i]->m_uniqueNumericId);
                let iter = clonedNodes.find(inputs[i]);
                if (iter == clonedNodes.end())
                    continue;
                // input is also a cloned node: relink
                node->SetInput(i, iter->second);
                fprintf(stderr, " ==> %ls (%d)\n", inputs[i]->NodeName().c_str(), (int)inputs[i]->m_uniqueNumericId);
                numRelinks++;
            }
        }

        fprintf(stderr, "CloneFunction: Cloned %d nodes and relinked %d inputs.\n", (int)numCloned, (int)numRelinks);

        // return the result
        //  - if outputNodes was specified as a single node, return a single node
        //  - if specified as a record, then return a record with the specified names

        if (singleOutput)
        {
            return NodeToConfigValuePtr(clonedNodes.find(outputNodes.begin()->second)->second);
        }
        else
        {
            auto record = make_shared<ConfigRecord>(nullptr, [](const std::wstring & msg){ RuntimeError("CloneFunction: %ls", msg.c_str()); });
            for (let& outputNodesKV : outputNodes)
                record->Add(outputNodesKV.first, [](const wstring&){}, move(NodeToConfigValuePtr(clonedNodes.find(outputNodesKV.second)->second)));
            auto valuep = ConfigValuePtr(record, [](const std::wstring &) { LogicError("CloneFunction: Unexpected failure."); }, exprName);
            return valuep;
        }
    }

    ConfigValuePtr NodeToConfigValuePtr(ComputationNodeBasePtr node)
    {
        assert(node);
        auto valuep = ConfigValuePtr(node, [](const std::wstring &) { LogicError("CloneFunction: Unexpected failure."); }, node->NodeName());
        return valuep;
    }

private:
    // parameters
    vector<ComputationNodeBasePtr> inputNodes;
    map<wstring, ComputationNodeBasePtr> outputNodes;
    ParameterTreatment parameterTreatment;
    // other
    set<ComputationNodeBasePtr> dependentSet; // set of nodes that outputNodes depend on
};

ScriptableObjects::ConfigurableRuntimeTypeRegister::Add<CloneFunctionConfigLambda> registerCloneFunctionConfigLambda(L"CloneFunctionConfigLambda");

}}}
@ -38,7 +38,8 @@
#define CNTK_MODEL_VERSION_7 7 // ElemType tag in model file
#define CNTK_MODEL_VERSION_8 8 // DynamicAxis for inputs
#define CNTK_MODEL_VERSION_9 9 // Transpose flag in ConvolutionNode to support deconvolution.
#define CURRENT_CNTK_MODEL_VERSION CNTK_MODEL_VERSION_9
#define CNTK_MODEL_VERSION_10 10 // Learning rate multiplier for input nodes.
#define CURRENT_CNTK_MODEL_VERSION CNTK_MODEL_VERSION_10
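
// Illustrative sketch (not part of this commit): the usual way versioned fields
// are guarded when loading a model (hypothetical reader code; the actual Load()
// logic lives elsewhere):
//
//     if (modelVersion >= CNTK_MODEL_VERSION_10)
//         fstream >> m_learningRateMultiplier; // field only present in v10+ files
//
// Writers always emit CURRENT_CNTK_MODEL_VERSION; readers stay backward
// compatible by skipping fields that postdate the file's version.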

extern bool g_shareNodeValueMatrices;

@ -184,7 +185,7 @@ protected: // TODO: should be fully encapsulated here
    bool m_needsGradient; // true if this node or any children need a gradient to be computed (for own consumption or propagation to somewhere in the child tree)

    bool m_valueSharable; // a flag is needed for memory share.
                          // If it is false (e.g., learnableParameters/InputValue and those nodes are solely induced by learnableParameters),
                          // If it is false (e.g., LearnableParameters/InputValue and those nodes are solely induced by LearnableParameters),
                          // it will never be released to memory pool
private:
    bool m_isPartOfLoop; // true if this loop is part of a recurrent loop

@ -289,6 +290,9 @@ public:
    m_gradientInitialized(false), m_nodeName(name == L"" ? CreateUniqNodeName() : name)
    {
        // TODO: should m_learningRateMultiplier be set to 0? Or should every node have a way to add its own say on the learning rate for all its inputs?
        // we store a unique numeric number for every node that is constructed, as a debugging aid
        static size_t uniqueNumericId = 0;
        m_uniqueNumericId = uniqueNumericId++;
    }
    virtual ~ComputationNodeBase()
    {

@ -429,7 +433,18 @@ private:
    {
        if (HasMBLayout())
            LogicError("%ls: Minibatch data cannot be interpreted as a single 2D tensor.", NodeDescription().c_str());
        else if (m_sampleLayout.GetRank() < 1 || m_sampleLayout.GetRank() > 2) // note: scalars are not stored as tensors of rank 0, but rather as 1-dim vectors. TODO: clean this up some day

        bool notFlattenableTo2D = false;
        for (size_t i = 2; i < m_sampleLayout.GetRank(); ++i)
        {
            if (!m_sampleLayout.CanFlatten(i))
            {
                notFlattenableTo2D = true;
                break;
            }
        }

        if (m_sampleLayout.GetRank() < 1 || ((m_sampleLayout.GetRank() > 2) && notFlattenableTo2D)) // note: scalars are not stored as tensors of rank 0, but rather as 1-dim vectors. TODO: clean this up some day
            LogicError("%ls: Sample [%s] is not a column vector or matrix (1D or 2D tensor).", NodeDescription().c_str(), string(m_sampleLayout).c_str());
    }
public:

@ -441,7 +456,11 @@ public:
    size_t GetAsMatrixNumCols() const
    {
        CheckTensorIsMatrix();
        return m_sampleLayout.GetRank() > 1 ? m_sampleLayout[1] : 1; // a column vector is also a Matrix
        auto flattenedLayout = m_sampleLayout;
        if (flattenedLayout.GetRank() > 2)
            flattenedLayout.FlattenTo2DInPlace(1, "GetAsMatrixNumCols()");

        return flattenedLayout.GetRank() > 1 ? flattenedLayout[1] : 1; // a column vector is also a Matrix
    }
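
    // Worked example (illustration only): for a flattenable sample layout [3 x 4 x 5],
    // FlattenTo2DInPlace(1, ...) folds axes 1 and up into one dimension, giving
    // [3 x 20], so GetAsMatrixNumCols() returns 20. A plain vector layout [7] still
    // returns 1: a column vector is treated as a one-column matrix.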
|
||||
|
||||
// setting/updating the dimensions of the node
|
||||
|
@ -574,8 +593,8 @@ public:
|
|||
else // a whole vector
|
||||
{
|
||||
ScriptableObjects::ConfigArrayPtr inputsArray = *inputsArg;
|
||||
const auto range = inputsArray->GetIndexRange();
|
||||
for (int i = range.first; i <= range.second; i++) // pull them. This will resolve all of them.
|
||||
const auto range = inputsArray->GetIndexBeginEnd();
|
||||
for (int i = range.first; i < range.second; i++) // pull them. This will resolve all of them.
|
||||
inputs.push_back(inputsArray->At(i, [](const wstring&) { LogicError("GetInputs: out of bounds index while iterating??"); }));
|
||||
}
|
||||
}
|
||||
|
@ -833,6 +852,8 @@ public:
|
|||
// Helper that returns [a x b x c], including dynamic axes.
|
||||
const std::string ShapeDescription() const;
|
||||
|
||||
// debugging helper
|
||||
size_t m_uniqueNumericId; // (a unique handle for debugging)
|
||||
protected:
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
|
@ -1891,6 +1912,13 @@ public:
|
|||
|
||||
struct IRecurrentNode { virtual int GetRecurrenceSteppingDirection() const = 0; };

// =======================================================================
// IFreezable -- nodes that have parameters that can be frozen
// e.g. if a trained model is to be used as a fixed feature extractor for another task
// =======================================================================

struct IFreezable { virtual void FreezeParameters() { } };

// =======================================================================
// PreComputedNodeBase -- interface implemented by ComputationNodes that precompute
// TODO: We can use this interface in more places.

@@ -139,6 +139,16 @@ public:
fstream << "PoolKind: " << (int)m_poolKind << "\n";
|
||||
}
|
||||
|
||||
TensorShape KernelShape() const { return m_kernelShape; }
|
||||
TensorShape Strides() const { return m_stride; }
|
||||
std::vector<bool> Sharing() const { return m_sharing; }
|
||||
std::vector<bool> AutoPad() const { return m_autoPad; }
|
||||
TensorShape LowerPad() const { return m_lowerPad; }
|
||||
TensorShape UpperPad() const { return m_upperPad; }
|
||||
bool Transpose() const { return m_transpose; }
|
||||
size_t MaxTempMemSizeInSamples() const { return m_maxTempMemSizeInSamples; }
|
||||
PoolKind PoolingKind() const { return m_poolKind; }
|
||||
|
||||
protected:
|
||||
TensorShape m_kernelShape;
|
||||
TensorShape m_mapCount;
|
||||
|
@ -148,7 +158,7 @@ protected:
|
|||
TensorShape m_lowerPad;
TensorShape m_upperPad;
PoolKind m_poolKind;
bool m_transpose;
bool m_transpose; // means de-convolution (transposed convolution)
ImageLayoutKind m_imageLayout;

size_t m_maxTempMemSizeInSamples;

@@ -339,6 +349,10 @@ public:
size_t mapCount = m_mapCount.GetNumElements();
size_t weightCols = kW * kH * inDims.m_numChannels;

// if mapCount is 0 then take it from the input matrix
if (mapCount == 0)
mapCount = Input(0)->GetAsMatrixNumRows(); // (editor's fix: the original line discarded this return value)

// check/infer input [0] (weights)
// BUGBUG: For now, we treat the weights as a 2D matrix. They should be a tensor proper.
Input(0)->ValidateInferInputDimsFrom(TensorShape(mapCount, weightCols));

@@ -61,4 +61,109 @@ public:
template class SumColumnElementsNode<float>;
template class SumColumnElementsNode<double>;

// -----------------------------------------------------------------------
// (deprecated) PerDimMeanVarNormalizationNode (feature, mean, invStdDev)
// Computes
// output = (feature - mean) .* invStdDev
// where mean and invStdDev are meant to be single elements while feature
// is minibatch data.
// Deprecated since it can be trivially expressed in BrainScript.
// -----------------------------------------------------------------------

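// Editor's note: the BrainScript equivalent is a one-liner along the lines of
//   norm = (feature - mean) .* invStdDev
// which is why this node is deprecated.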
template <class ElemType>
class PerDimMeanVarNormalizationNode : public ComputationNode<ElemType>, public NumInputs<3>
{
typedef ComputationNode<ElemType> Base;
UsingComputationNodeMembersBoilerplate;
static const std::wstring TypeName()
{
return L"PerDimMeanVarNormalization";
}

public:
DeclareConstructorFromConfigWithNumInputs(PerDimMeanVarNormalizationNode);
PerDimMeanVarNormalizationNode(DEVICEID_TYPE deviceId, const wstring& name)
: Base(deviceId, name)
{
}

virtual void /*ComputationNode::*/ BackpropTo(const size_t /*inputIndex*/, const FrameRange&) override
{
InvalidArgument("PerDimMeanVarNormalizationNode should only be called in the evaluation stage. Is any of its descendants a learnable parameter that requires gradient?");
}

virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override
{
size_t rank = DetermineElementwiseTensorRank();
auto output = ValueTensorFor(rank, fr);
auto input = Input(0)->ValueTensorFor(rank, fr);
auto mean = Input(1)->ValueTensorFor(rank, fr.AllowBroadcast());
auto invStdDev = Input(2)->ValueTensorFor(rank, fr.AllowBroadcast());

output.AssignDifferenceOf(input, mean); // output = input - mean
output.AssignElementwiseProductOf(output, invStdDev); // output *= invStdDev
}

virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override
{
Base::Validate(isFinalValidationPass);
InferMBLayoutFromInputsForStandardCase(isFinalValidationPass);

Input(1)->ValidateInferInputDimsFrom(Input(0)->GetSampleLayout());
Input(2)->ValidateInferInputDimsFrom(Input(0)->GetSampleLayout());


#if 1
// support for legacy models when the mean and variance vectors were stored as column vectors (N,1)
// This code will copy the shape of Input(0) (source) to Input(1) and Input(2) (target) if:
// 1. The source is a 3-tensor with shape 1x1xM
// 2. The target is a vector (i.e., a 2-tensor with shape Nx1)
// 3. Both targets have the same number of elements
// 4. The number of elements in the target (N) is the same as the number of elements in the source (M)
// Note: This is somewhat ugly [Jasha Droppo].
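// Editor's illustration (not in the original diff): with a source sample layout of [1 x 1 x 9]
// and mean/invStdDev stored the legacy way as [9 x 1], conditions 1-4 hold and both targets are
// patched to [1 x 1 x 9]; a [3 x 3] target would fail condition 2 and be left untouched.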

auto dimsA = Input(0)->GetSampleLayout().GetDims();
auto dimsB = Input(1)->GetSampleLayout().GetDims();
auto dimsC = Input(2)->GetSampleLayout().GetDims();

if (
// Test condition 1.
(dimsA.size() == 3 && dimsA[0] == 1 && dimsA[1] == 1) &&
// Test condition 2.
(dimsB.size() == 2 && dimsB[1] == 1) &&
(dimsC.size() == 2 && dimsC[1] == 1) &&
// Test condition 3. and condition 4.
(dimsB[0] == dimsC[0] && dimsB[0] == dimsA[2])
)
{
// for error messages
string dimsBstring = string(Input(1)->GetSampleLayout());
string dimsCstring = string(Input(2)->GetSampleLayout());

// reshape Input(1)
Input(1)->SetDims(TensorShape(dimsA), false);
fprintf(stderr, "\n%ls %ls operation: For legacy compatibility, the sample layout of second input (%ls %ls operation) was patched to [%s] (from [%s])\n",
NodeName().c_str(), OperationName().c_str(), Input(1)->NodeName().c_str(), Input(1)->OperationName().c_str(), string(Input(1)->GetSampleLayout()).c_str(), dimsBstring.c_str());

// reshape Input(2)
Input(2)->SetDims(TensorShape(dimsA), false);
fprintf(stderr, "\n%ls %ls operation: For legacy compatibility, the sample layout of third input (%ls %ls operation) was patched to [%s] (from [%s])\n",
NodeName().c_str(), OperationName().c_str(), Input(2)->NodeName().c_str(), Input(2)->OperationName().c_str(), string(Input(2)->GetSampleLayout()).c_str(), dimsCstring.c_str());
}

#endif

if (isFinalValidationPass)
{
if (!Input(0)->GetSampleLayout().IsElementwiseCompatibleWith(Input(1)->GetSampleLayout()) || !Input(0)->GetSampleLayout().IsElementwiseCompatibleWith(Input(2)->GetSampleLayout()))
InvalidArgument("PerDimMeanVarNormalizationNode: All inputs should have the same sample layout.");
}

SetDims(Input(0));
}
};

template class PerDimMeanVarNormalizationNode<float>;
template class PerDimMeanVarNormalizationNode<double>;

}}}

@@ -18,47 +18,107 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// TODO: add -Node to the class name
// -----------------------------------------------------------------------

// BUGBUG: If called after random init, this will reset to 0.
// TODO: Need to remember the init parameters, and do it here.
template <class ElemType>
void LearnableParameter<ElemType>::InitShape(const TensorShape& shape)
{
SetDims(shape, false);
UpdateFunctionValuesSize(); // this allocates the matrix
Value().SetValue(0); // TODO: invalidate instead
Value().Invalidate();
}

// constructor from config
// Parameterization is a little wicked. An older version required specifying the type of initialization
// ("uniform|fixedValue|gaussian|fromFile|fromLiteral") and then a parameter with a matching name.
// Now the matching parameter alone is sufficient, which is less verbose.
// - init="uniform|gaussian" (random init, scaled by arg initValueScale)
// - init="zero"
// - initValue=scalar --> initialize from this value
// - initValue=array or nested array --> initialize from this value, infer dimensions --TODO: not implemented yet
// - initFromFilePath="..." --> read from a data file. This infers the dimensions from the file.
// deprecated:
// - init="fixedValue", value from 'value' --deprecated in favor of just specifying initValue
// - init="fromFile", value from 'initFromFilePath' --deprecated in favor of just specifying 'initFromFilePath'
// - init="fromLiteral", value from 'initFromLiteral' --deprecated in favor of initValue=array expression
// The forms that infer the dimensions have different BrainScript names. TODO: need one for fromFile
// TODO: All forms that require specified dimensions but contain zeroes (to be updated by graph)
// will need to do deferred initialization, or have a way to repeat it.
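// Editor's sketch of these forms in BrainScript (hypothetical dimensions and file name, for illustration only):
//   W = ParameterTensor {(100:200), init="uniform", initValueScale=1}  # deferred random init
//   b = ParameterTensor {100, initValue=0}                             # init from a scalar
//   E = ParameterTensor {(100:0), initFromFilePath="embedding.txt"}    # dims inferred (0) or taken from file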
template <class ElemType>
LearnableParameter<ElemType>::LearnableParameter(const ScriptableObjects::IConfigRecordPtr configp) :
LearnableParameter(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"shape"))
{
// TODO: Change dimensions to take a generic tensor instead. That will be a (minor) breaking change that will require fix-ups when converting from NDL to BrainScript.
AttachInputsFromConfig(configp, this->GetExpectedNumInputs());
// parameters[rows, [cols=1]] plus other optional parameters (learningRateMultiplier=[1|0|float], init=[uniform|gaussian|fixedvalue], initValueScale=[1|float], value=[0|float])
AttachInputsFromConfig(configp, this->GetExpectedNumInputs()); // (we have none; this checks that none are provided)
// Parameter{dims, other optional parameters: learningRateMultiplier=[1|0|float], init=[uniform|gaussian|], initValueScale=[1|float], initValue=[''|float], initFromFilePath=[''|string]}

// constant vs. parameter (with optional LR scaling)
if (configp->Exists(L"learningRateMultiplier"))
SetLearningRateMultiplier(configp->Get(L"learningRateMultiplier"));
else if (configp->Exists(L"needsGradient") || configp->Exists(L"needGradient") || configp->Exists(L"computeGradient"))
InvalidArgument("Deprecated parameter names needsGradient|needGradient|computeGradient are not supported in BrainScript. Use learningRateMultiplier instead.");

// initialization
wstring initString = configp->Get(L"init");
if (initString == L"fixedValue")
Value().SetValue((ElemType) configp->Get(L"value"));
else if (initString == L"uniform" || initString == L"gaussian")
wstring initFromFilePath = configp->Get(L"initFromFilePath");
let& initValue = configp->Get(L"initValue"); // may be empty string, scalar, or array
// infer the type of the initial value from what other optional args are given
if (initString.empty())
{
// TODO: add these options also to old NDL
if (!initFromFilePath.empty()) // 'initFromFilePath' given --> initialize from file
initString = L"fromFile"; // (note: this is only used internally; external use is deprecated)
else if (!initValue.Is<ScriptableObjects::String>()) // 'initValue' given (not an empty string) --> initialize from value
{
if (initValue.Is<ScriptableObjects::Double>())
initString = L"fromValue"; // (note: this is only used internally)
else if (initValue.Is<ScriptableObjects::ConfigArray>())
initString = L"fromValueArray"; // (note: this is only used internally)
else
InvalidArgument("'initValue' must be numerical");
}
else if (!initValue.AsRef<ScriptableObjects::String>().empty()) // it's a string: must be empty
InvalidArgument("LearnableParameter: 'initValue' must be an empty string or not a string.");
else // no pertinent optional arguments given: default to 'uniform'
initString = L"uniform"; // default is uniform
}
// deferred variants
// Deferred means that this kind of initialization is allowed when some dimensions are unspecified, and thus happens during Validate().
if (initString == L"uniform" || initString == L"gaussian") // random init
|
||||
{
|
||||
m_initString = initString;
|
||||
// TODO: add more randomization types, and use a more meaningful scaling
|
||||
// Keras uses "normal" instead of "gaussian". We can use that here too to denote the one with sane scaling, and deprecate "gaussian" with a warning.
|
||||
static unsigned long randomSeed = 1;
|
||||
int forcedRandomSeed = configp->Get(L"randomSeed"); // forcing a specific random seed is useful for testing to get repeatable initialization independent of evaluation order
|
||||
InitRandom((initString == L"uniform"), forcedRandomSeed < 0 ? randomSeed++ : (unsigned long) forcedRandomSeed, configp->Get(L"initValueScale"), configp->Get(L"initOnCPUOnly"));
|
||||
m_randomSeed = forcedRandomSeed < 0 ? randomSeed++ : (unsigned long)forcedRandomSeed;
|
||||
m_initValueScale = configp->Get(L"initValueScale");
|
||||
m_initOnCPUOnly = configp->Get(L"initOnCPUOnly");
|
||||
}
|
||||
else if (initString == L"fromFile")
|
||||
else if (initString == L"zero")
|
||||
{
|
||||
m_initString = L"fromValue";
|
||||
m_initValue = 0;
|
||||
}
|
||||
else if (initString == L"fromValue") // from 'initValue'
|
||||
{
|
||||
m_initString = initString;
|
||||
m_initValue = initValue;
|
||||
}
|
||||
// non-deferred variants
// For these, the dimensions are always known at this point, so we don't need/want to have to save all those parameters.
else if (initString == L"fromValueArray") // from 'initValue' which has array form
InvalidArgument("'initValue' for arrays not yet implemented"); // array not yet implemented
else if (initString == L"fromFile") // load from 'initFromFilePath'
{
wstring initFromFilePath = configp->Get(L"initFromFilePath");
if (initFromFilePath.empty())
RuntimeError("initFromFilePath parameter must be provided when using \"fromFile\" initialization method");
InitFromFile(initFromFilePath);
m_initString.clear();
}
else if (initString == L"fromLiteral")
// legacy
else if (initString == L"fixedValue") // deprecated. Use initValue=... instead
{
m_initString = L"fromValue";
m_initValue = (ElemType)configp->Get(L"value");
}
else if (initString == L"fromLiteral") // deprecated. Use initValue=array instead
{
wstring initFromLiteral = configp->Get(L"initFromLiteral");
if (initFromLiteral.empty())

@@ -66,9 +126,49 @@ LearnableParameter<ElemType>::LearnableParameter(const ScriptableObjects::IConfi
size_t numRows, numCols;
auto array = File::LoadMatrixFromStringLiteral<ElemType>(msra::strfun::utf8(initFromLiteral), numRows, numCols);
InitFromArray(array, numRows, numCols);
m_initString.clear();
}
else
RuntimeError("init must be one of the values of [ uniform | gaussian | fixedValue | fromFile ]");

// initialize
// This will be repeated if the matrix gets resized due to dimension inference.
LazyInitParameters();

if (!m_initString.empty())
fprintf(stderr, "%ls: Initializing Parameter[%s] as %ls later when dimensions are fully known.\n", NodeDescription().c_str(), string(GetSampleLayout()).c_str(), m_initString.c_str());
}

// variant of above from NDL. Must be called right after plain constructor.
// This overwrites any pending deferred initialization with a new one.
// Initialization is done immediately if all dimensions are already known, otherwise kept pending.
template <class ElemType>
void LearnableParameter<ElemType>::PostInitParameters(const wstring& initString, // "uniform"|"gaussian"|"fixedValue"
ElemType initValue, // scale | scale | value
unsigned long randomSeed /*= 0*/,
bool initOnCPUOnly /*= false*/)
{
if (initString == L"uniform" || initString == L"gaussian") // random init
{
m_initString = initString;
m_randomSeed = randomSeed;
m_initValueScale = initValue;
m_initOnCPUOnly = initOnCPUOnly;
}
else if (initString == L"fixedValue") // from constant value
{
m_initString = L"fromValue";
m_initValue = initValue;
}
else
LogicError("PostInitParameters: invalid init string '%ls'", initString.c_str()); // (editor's fix: report the offending argument, not m_initString)

// initialize
// This will be repeated if the matrix gets resized due to dimension inference.
LazyInitParameters();

if (!m_initString.empty())
fprintf(stderr, "%ls: Initializing Parameter[%s] as %ls later when dimensions are fully known.\n", NodeDescription().c_str(), string(GetSampleLayout()).c_str(), m_initString.c_str());
}

// initialize with random numbers

@@ -162,9 +262,25 @@ void LearnableParameter<ElemType>::InitFromArray(const std::vector<ElemType>& ar
VerifyDataSize(Value()); // sanity check
}

// TODO: Move this error check there, since this is called only from one place.
template <class ElemType>
void LearnableParameter<ElemType>::ReviseFromFile(const std::wstring& reviseFromFilePath)
{
try
{
InitFromFile(reviseFromFilePath);
}
catch (const std::exception & e)
{
RuntimeError("ReviseFromFile: Failed to reload %ls %ls operation from file %ls: %s", NodeName().c_str(), OperationName().c_str(), reviseFromFilePath.c_str(), e.what());
}
}

template <class ElemType>
void LearnableParameter<ElemType>::Save(File& fstream) const /*override*/
{
if (!m_initString.empty())
LogicError("LearnableParameter: Cannot Save() before deferred initialization has completed.");
Base::Save(fstream);
fstream << m_learningRateMultiplier;
m_sampleLayout.Save(fstream);

@@ -204,12 +320,31 @@ void LearnableParameter<ElemType>::Load(File& fstream, size_t modelVersion) /*ov
LoadValue(fstream);
SetDims(sampleLayout, false); // note: call this after LoadValue() since LoadValue() overwrites m_sampleLayout
VerifyDataSize(Value()); // sanity check

m_initString.clear(); // deferred initialization not possible after loading
}

template <class ElemType>
/*virtual*/ void LearnableParameter<ElemType>::CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const /*override*/
{
Base::CopyTo(nodeP, newName, flags);
if (flags & CopyNodeFlags::copyNodeValue)
{
auto node = dynamic_pointer_cast<LearnableParameter<ElemType>>(nodeP);
node->m_initString = m_initString;
node->m_randomSeed = m_randomSeed;
node->m_initValueScale = m_initValueScale;
node->m_initOnCPUOnly = m_initOnCPUOnly;
node->m_initValue = m_initValue;
}
}

// computation functions don't do anything for parameter nodes
template <class ElemType>
/*virtual*/ void LearnableParameter<ElemType>::UpdateFunctionMBSize() /*override*/
{
if (!m_initString.empty())
LogicError("LearnableParameter: Deferred initialization must be completed before the first call to UpdateFunctionMBSize().");
}

template <class ElemType>

@@ -226,18 +361,70 @@ template <class ElemType>
template <class ElemType>
/*virtual*/ void LearnableParameter<ElemType>::Validate(bool isFinalValidationPass) /*override*/
{
//fprintf(stderr, "Validate %ls: called in init state '%ls' with dims [%s]\n", NodeDescription().c_str(), m_initString.c_str(), string(GetSampleLayout()).c_str());
Base::Validate(isFinalValidationPass);
m_pMBLayout = nullptr; // this node does not hold mini-batch data

// lazy init if we got a dimension now
#if 0 // fake old buggy behavior before deferred initialization
if (isFinalValidationPass && !m_initString.empty() && (m_initString != L"fromValue" || m_initValue != 0))
{
fprintf(stderr, "Validate: deferred '%ls' initialization patched to fromValue 0 for back compat\n", m_initString.c_str());
m_initString = L"fromValue";
m_initValue = 0;
}
#endif
#if 0
// We call this here and in Validate(true), since we don't know which gets called first.
// TODO: Actually this should never be needed, because each time dimensions change, we init.
// So if we get here without fully-known dimensions, this call won't do anything either.
if (isFinalValidationPass)
LazyInitParameters();
#endif
}

// deferred initialization
// We support a feature that some dimensions can be specified as 0, and get inferred.
// This is only possible for initialization methods that do not come with their own dimensions
// (such as initialization from an array literal).
// When initialization succeeded (all dimensions known), the pending initialization is cleared.
// This is called from constructor and InferInputDimsFrom().
// BUGBUG: We cannot really enforce the calling sequence. Save() verifies that this has been cleared.
// Note that this may be called AFTER Validate(true) (still during validation, but after final validation of this node).
template <class ElemType>
void LearnableParameter<ElemType>::LazyInitParameters()
{
// if no lazy init pending then we are done
if (m_initString.empty())
return;
// if not all dimensions are known yet, we cannot proceed: keep it pending
if (GetSampleLayout().GetNumElements() == 0)
return;
// OK, proceed
if (m_initString == L"fromValue")
{
fprintf(stderr, "%ls: Initializing Parameter[%s] <- %f.\n", NodeDescription().c_str(), string(GetSampleLayout()).c_str(), m_initValue);
Value().SetValue(m_initValue);
}
else if (m_initString == L"uniform" || m_initString == L"gaussian")
{
fprintf(stderr, "%ls: Initializing Parameter[%s] <- %ls(seed=%d, scale=%f, onCPU=%s).\n", NodeDescription().c_str(), string(GetSampleLayout()).c_str(), m_initString.c_str(), (int)m_randomSeed, m_initValueScale, m_initOnCPUOnly ? "true" : "false");
InitRandom((m_initString == L"uniform"), m_randomSeed, m_initValueScale, m_initOnCPUOnly);
}
else
LogicError("LearnableParameter: Invalid value of m_initString '%ls' for deferred initialization for %ls.", m_initString.c_str(), NodeDescription().c_str());
// and remember that we are done
m_initString.clear();
}

// called from ComputationNode::ValidateInferInputDimsFrom()
// In case of an error, this function just backs out without updating.
// The caller must verify the dimensions.
// This is a bit weird since it is called after this node has been Validated once.
// BUGBUG: This will clear out any random initialization to 0. So currently this is not usable for most cases.
template <class ElemType>
void LearnableParameter<ElemType>::InferInputDimsFrom(const TensorShape& otherShape)
{
//fprintf(stderr, "InferInputDimsFrom %ls: called in init state '%ls' with dims [%s], offered new dims [%s]\n", NodeDescription().c_str(), m_initString.c_str(), string(GetSampleLayout()).c_str(), string(otherShape).c_str());
const auto& thisShape = GetSampleLayout();

// see where we stand with our shape

@@ -248,7 +435,10 @@ void LearnableParameter<ElemType>::InferInputDimsFrom(const TensorShape& otherSh
// infer at least one dimension
if (otherShape.GetRank() == 0 || otherShape.GetNumElements() == 0)
return; // LogicError("ValidateInferInputDimsFrom: Inferred dimensions must not be empty.");


if (m_initString.empty())
LogicError("InferInputDimsFrom: Attempted to infer dimensions with initialization already completed and no deferred initialization pending.");

// if no dimensions have been set at all, copy otherShape
// Don't verify dimensions in this case, because the node may have explicitly been defined as a vector of 0 elements.
bool hasAnyDim = false;

@@ -266,7 +456,20 @@ void LearnableParameter<ElemType>::InferInputDimsFrom(const TensorShape& otherSh
newDims[i] = otherShape[i];
InitShape(TensorShape(newDims));
}
fprintf(stderr, "%ls %ls operation: Tensor shape was inferred as [%s].\n", NodeName().c_str(), OperationName().c_str(), string(GetSampleLayout()).c_str());
fprintf(stderr, "%ls operation: Tensor shape was inferred as [%s].\n", NodeDescription().c_str(), string(GetSampleLayout()).c_str());

// initialize the values
// We call this here and in Validate(true), since we don't know which gets called first.
// Note: It seems that this is not necessary, and that Validate(true) is only called after inference.
#if 0 // fake old buggy behavior before deferred initialization
if (m_initString != L"fromValue" || m_initValue != 0)
{
fprintf(stderr, "InferInputDimsFrom: deferred '%ls' initialization patched to fromValue 0 for back compat\n", m_initString.c_str());
m_initString = L"fromValue";
m_initValue = 0;
}
#endif
LazyInitParameters();
}

template <class ElemType>

@@ -286,6 +489,12 @@ template <class ElemType>
PrintNodeValuesToFile(printValues, printMetadata, fstream);
}

template <class ElemType>
/*virtual*/ void LearnableParameter<ElemType>::FreezeParameters() /*override*/ // from IFreezable
{
SetLearningRateMultiplier(0);
}

template class LearnableParameter<float>;
template class LearnableParameter<double>;

@@ -21,7 +21,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// -----------------------------------------------------------------------

template <class ElemType>
class LearnableParameter : public ComputationNode<ElemType>, public NumInputs<0>
class LearnableParameter : public ComputationNode<ElemType>, public NumInputs<0>, public IFreezable
{
typedef ComputationNode<ElemType> Base; UsingComputationNodeMembersBoilerplate;
static const std::wstring TypeName() { return L"LearnableParameter"; }

@@ -29,69 +29,57 @@ class LearnableParameter : public ComputationNode<ElemType>, public NumInputs<0>
void InitShape(const TensorShape& shape);

public:
LearnableParameter(DEVICEID_TYPE deviceId, const wstring& name)
: Base(deviceId, name)
// this constructor is always run (all other constructors call this one)
LearnableParameter(DEVICEID_TYPE deviceId, const wstring& name) :
Base(deviceId, name)
{
SetLearningRateMultiplier(1.0f); // enable normal learning by default
MarkValueNonSharable();
m_initString = L"fromValue"; // default init is with 0; typically overwritten
m_initValue = 0;
}
LearnableParameter(DEVICEID_TYPE deviceId, const wstring& name, const TensorShape& shape)
: Base(deviceId, name)
LearnableParameter(DEVICEID_TYPE deviceId, const wstring& name, const TensorShape& shape) :
LearnableParameter(deviceId, name)
{
SetLearningRateMultiplier(1.0f);
MarkValueNonSharable();
InitShape(shape);
LazyInitParameters();
}
LearnableParameter(DEVICEID_TYPE deviceId, const wstring& name, size_t rows, size_t cols)
: LearnableParameter(deviceId, name, TensorShape(rows, cols))
LearnableParameter(DEVICEID_TYPE deviceId, const wstring& name, size_t rows, size_t cols) :
LearnableParameter(deviceId, name, TensorShape(rows, cols))
{
}
LearnableParameter(const ScriptableObjects::IConfigRecordPtr configp);

// initialize with random numbers
// if 'initOnCPUOnly' then always init on CPU, making initialization consistent across both (for testing)
void InitRandom(const bool uniformInit, const unsigned long randomSeed, const ElemType initValueScale, bool initOnCPUOnly);
// initialize after plain constructor; for use by NDL
void PostInitParameters(const std::wstring& initString, // "uniform"|"gaussian"|"fixedValue"
ElemType initValue, // scale | scale | value
unsigned long randomSeed = 0,
bool initOnCPUOnly = false);

// initialize by reading a matrix from a text file
void InitFromFile(const std::wstring& initFromFilePath);

private:
// initialize with random numbers
// If 'initOnCPUOnly' then always init on CPU, making initialization consistent across both (for testing).
void InitRandom(const bool uniformInit, const unsigned long randomSeed, const ElemType initValueScale, bool initOnCPUOnly);

// helper to initialize from a matrix read from a text file or a string literal
void InitFromArray(const std::vector<ElemType>& array, size_t numRows, size_t numCols);

// deferred initialization
void LazyInitParameters();

public:

// reload parameters from file
// This is called from MEL.
// TODO: Move this error check there, since this is called only from one place.
void ReviseFromFile(const std::wstring& reviseFromFilePath)
{
#if 1
try
{
InitFromFile(reviseFromFilePath);
}
catch(const std::exception & e)
{
RuntimeError("ReviseFromFile: Failed to reload %ls %ls operation from file %ls: %s", NodeName().c_str(), OperationName().c_str(), reviseFromFilePath.c_str(), e.what());
}
#else
size_t numRows, numCols;
auto array = File::LoadMatrixFromTextFile<ElemType>(reviseFromFilePath, numRows, numCols);
size_t nRows, nCols;
DetermineDataSize(nRows, nCols); // BUGBUG: private

if (numRows != nRows || numCols != nCols)
{
RuntimeError("Error in ReviseFromFile for node %ls using file %ls: original size (%d x %d) vs current size (%d x %d)",
m_nodeName.c_str(), reviseFromFilePath.c_str(), (int) nRows, (int) nCols, (int) numRows, (int) numCols);
}

Value().SetValue(numRows, numCols, m_deviceId, array.data(), matrixFlagNormal);
VerifyDataSize(Value()); // sanity check
#endif
}
void ReviseFromFile(const std::wstring& reviseFromFilePath);

virtual void Save(File& fstream) const override;
virtual void Load(File& fstream, size_t modelVersion) override;

virtual void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override;

// computation functions don't do anything for parameter nodes
virtual void UpdateFunctionMBSize() override;
virtual void /*ComputationNode::*/ ForwardProp(const FrameRange&) override;

@@ -106,6 +94,17 @@ public:
void InferInputDimsFrom(const TensorShape& otherShape);

virtual void DumpNodeInfo(const bool printValues, const bool printMetadata, File& fstream) const override;

// called from CloneFunction(..., parameters="constant")
virtual void FreezeParameters() override; // from IFreezable

private:
// init parameters for deferred initialization (which happens in Validate())
std::wstring m_initString; // if non-empty then deferred initialization is needed. Gets cleared upon completion of deferred init.
unsigned long m_randomSeed;
ElemType m_initValueScale;
bool m_initOnCPUOnly;
ElemType m_initValue;
};

// -----------------------------------------------------------------------

@@ -162,7 +161,7 @@ class InputValueBase : public ComputationNode<ElemType>, public NumInputs<0>, pu
typedef ComputationNode<ElemType> Base;
UsingComputationNodeMembers;

void Init(const TensorShape& sampleLayout, bool isSparse, const std::wstring axisName)
void Init(const TensorShape& sampleLayout, bool isSparse, const std::wstring axisName, float learningRateMultiplier = 0)
{
m_isSparse = isSparse;
MarkValueNonSharable();

@@ -171,7 +170,7 @@ class InputValueBase : public ComputationNode<ElemType>, public NumInputs<0>, pu

SetDims(sampleLayout, HasMBLayout()); // also called when reloading a file. Then we have an MBLayout, otherwise not yet
UpdateFunctionValuesSize(); // we must allocate the matrix so that the readers get objects with valid row dimensions (some readers expect that)
SetLearningRateMultiplier(0);
SetLearningRateMultiplier(learningRateMultiplier);
m_dynamicAxisNodeName = axisName;
}

@@ -225,9 +224,9 @@ protected:
Init(ImageDimensions::AsTensorShape(configp->Get(L"imageWidth"), configp->Get(L"imageHeight"), configp->Get(L"imageChannels"), ImageLayoutKindFrom(configp->Get(L"imageLayout"))), isSparse, axisName);
}

public:
virtual const std::wstring GetRequestedDynamicAxis() const { return m_dynamicAxisNodeName; }

public:
virtual void Save(File& fstream) const override
{
Base::Save(fstream);

@@ -239,6 +238,8 @@ public:
unsigned int nrAxes = 1;
fstream << nrAxes;
fstream << m_dynamicAxisNodeName;

fstream << m_learningRateMultiplier;
}

virtual void Load(File& fstream, size_t modelVersion) override

@@ -268,7 +269,12 @@ public:
}
else
m_dynamicAxisNodeName = L""; // Use default
Init(sampleLayout, m_isSparse, m_dynamicAxisNodeName);

float learningRateMultiplier = 0;
if (modelVersion >= CNTK_MODEL_VERSION_10)
fstream >> learningRateMultiplier;

Init(sampleLayout, m_isSparse, m_dynamicAxisNodeName, learningRateMultiplier);
}

// InputValue must not resize its inputs because that might destroy it. It should already have the correct size.

@@ -463,6 +463,8 @@ public:
Base::AllocateGradientMatricesForInputs(matrixPool);
}

size_t OutputRank() const { return m_outputRank; }

private:
size_t m_outputRank;
};

@@ -376,117 +376,12 @@ private:
template class InvStdDevNode<float>;
template class InvStdDevNode<double>;

// -----------------------------------------------------------------------
// PerDimMeanVarNormalizationNode (feature, mean, invStdDev)
// Computes
// output = (feature - mean) .* invStdDev
// where mean and invStdDev are meant to be single elements while feature
// is minibatch data.
// TODO: Why do we need this? Why not use Plus and ElementTimes?
// -----------------------------------------------------------------------

template <class ElemType>
class PerDimMeanVarNormalizationNode : public ComputationNode<ElemType>, public NumInputs<3>
{
typedef ComputationNode<ElemType> Base;
UsingComputationNodeMembersBoilerplate;
static const std::wstring TypeName()
{
return L"PerDimMeanVarNormalization";
}

public:
DeclareConstructorFromConfigWithNumInputs(PerDimMeanVarNormalizationNode);
PerDimMeanVarNormalizationNode(DEVICEID_TYPE deviceId, const wstring& name)
: Base(deviceId, name)
{
}

virtual void /*ComputationNode::*/ BackpropTo(const size_t /*inputIndex*/, const FrameRange&) override
{
InvalidArgument("PerDimMeanVarNormalizationNode should only be called in the evaluation stage. Is any of its descendants a learnable parameter that requires gradient?");
}

virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override
{
size_t rank = DetermineElementwiseTensorRank();
auto output = ValueTensorFor(rank, fr);
auto input = Input(0)->ValueTensorFor(rank, fr);
auto mean = Input(1)->ValueTensorFor(rank, fr.AllowBroadcast());
auto invStdDev = Input(2)->ValueTensorFor(rank, fr.AllowBroadcast());

output.AssignDifferenceOf(input, mean); // output = input - mean
output.AssignElementwiseProductOf(output, invStdDev); // output *= invStdDev
}

virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override
{
Base::Validate(isFinalValidationPass);
InferMBLayoutFromInputsForStandardCase(isFinalValidationPass);

Input(1)->ValidateInferInputDimsFrom(Input(0)->GetSampleLayout());
Input(2)->ValidateInferInputDimsFrom(Input(0)->GetSampleLayout());


#if 1
// support for legacy models when the mean and variance vectors were stored as column vectors (N,1)
// This code will copy the shape of Input(0) (source) to Input(1) and Input(2) (target) if:
// 1. The source is a 3-tensor with shape 1x1xM
// 2. The target is a vector (i.e., a 2-tensor with shape Nx1)
// 3. Both targets have the same number of elements
// 4. The number of elements in the target (N) is the same as the number of elements in the source (M)
// Note: This is somewhat ugly [Jasha Droppo].

auto dimsA = Input(0)->GetSampleLayout().GetDims();
auto dimsB = Input(1)->GetSampleLayout().GetDims();
auto dimsC = Input(2)->GetSampleLayout().GetDims();

if (
// Test condition 1.
(dimsA.size() == 3 && dimsA[0] == 1 && dimsA[1] == 1) &&
// Test condition 2.
(dimsB.size() == 2 && dimsB[1] == 1) &&
(dimsC.size() == 2 && dimsC[1] == 1) &&
// Test condition 3. and condition 4.
(dimsB[0] == dimsC[0] && dimsB[0] == dimsA[2])
)
{
// for error messages
string dimsBstring = string(Input(1)->GetSampleLayout());
string dimsCstring = string(Input(2)->GetSampleLayout());

// reshape Input(1)
Input(1)->SetDims(TensorShape(dimsA), false);
fprintf(stderr, "\n%ls %ls operation: For legacy compatibility, the sample layout of second input (%ls %ls operation) was patched to [%s] (from [%s])\n",
NodeName().c_str(), OperationName().c_str(), Input(1)->NodeName().c_str(), Input(1)->OperationName().c_str(), string(Input(1)->GetSampleLayout()).c_str(), dimsBstring.c_str());

// reshape Input(2)
Input(2)->SetDims(TensorShape(dimsA), false);
fprintf(stderr, "\n%ls %ls operation: For legacy compatibility, the sample layout of third input (%ls %ls operation) was patched to [%s] (from [%s])\n",
NodeName().c_str(), OperationName().c_str(), Input(2)->NodeName().c_str(), Input(2)->OperationName().c_str(), string(Input(2)->GetSampleLayout()).c_str(), dimsCstring.c_str());
}

#endif

if (isFinalValidationPass)
{
if (!Input(0)->GetSampleLayout().IsElementwiseCompatibleWith(Input(1)->GetSampleLayout()) || !Input(0)->GetSampleLayout().IsElementwiseCompatibleWith(Input(2)->GetSampleLayout()))
InvalidArgument("PerDimMeanVarNormalizationNode: All inputs should have the same sample layout.");
}

SetDims(Input(0));
}
};

template class PerDimMeanVarNormalizationNode<float>;
template class PerDimMeanVarNormalizationNode<double>;

// -----------------------------------------------------------------------
// PerDimMeanVarDeNormalizationNode (feature, mean, invStdDev)
// Computes
// output = feature ./ invStdDev + mean
// with parameters the same as PerDimMeanVarNormalizationNode.
// TODO: Why do we need this? Why not use Plus and ElementDividedBy?
// TODO: Deprecate like PerDimMeanVarNormalizationNode as soon as we have a test case. Or just delete it.
// -----------------------------------------------------------------------

template <class ElemType>

@@ -464,6 +464,9 @@ public:
LogicError("Unrecognized direction in DelayedValueNodeBase");
|
||||
}
|
||||
|
||||
int TimeStep() const { return m_timeStep; }
|
||||
ElemType InitialActivationValue() const { return m_initialActivationValue; }
|
||||
|
||||
protected:
|
||||
ElemType m_initialActivationValue; // starting value for hidden activation vector at boundary
|
||||
Matrix<ElemType> m_delayedValue; // saves the activation of the previous step that this node points to
|
||||
|
|
|
@ -34,9 +34,9 @@ template <class ElemType>
|
|||
if (flags & CopyNodeFlags::copyNodeValue)
{
auto node = dynamic_pointer_cast<ReduceElementsNode<ElemType>>(nodeP);
node->m_axis = m_axis;
node->m_operation = m_operation;
node->m_op = m_op;
node->m_axis = m_axis;
node->m_operation = m_operation;
node->m_reductionOp = m_reductionOp;
}
}

@@ -64,7 +64,7 @@ template <class ElemType>
auto input = Input(0)->ValueTensorFor(rank, fr);

// the actual operation is a Copy with reduction, where the magic is in the reduction op
result.DoUnaryOpOf(0, input, 1, ElementWiseOperator::opCopy, m_op);
result.DoUnaryOpOf(0, input, 1, ElementWiseOperator::opCopy, m_reductionOp);
// note: we can implement "Mean" by passing 1/dim for alpha
}

@@ -79,13 +79,46 @@ template <class ElemType>
auto sliceInputGrad = Input(0)->GradientTensorFor(rank, fr); // ...to this one

// gradients are not as simple as passing an op-code, unfortunately
switch (m_op)
switch (m_reductionOp)
{
case ElementWiseOperator::opSum:
// "Sum": broadcast the gradient
sliceInputGrad.AddCopyOf(sliceOutputGrad);
break;

case ElementWiseOperator::opLogSum:
{
auto input = Input(inputIndex)->ValueTensorFor(rank, fr);
auto output = ValueTensorFor(rank, fr.AllowBroadcast());
// Let: f(x, y, z) = log(exp x + exp y + exp z)
// For the derivative we get:
// df / dx = exp(x)/exp(f)
// = exp(x - f)
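// Quick sanity check (editor's note, not in the original diff): for f(x, y) with x = y = 0,
// f = log 2 and df/dx = exp(0 - log 2) = 1/2, which is exactly the softmax weight of each input.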
sliceInputGrad.AddElementwiseProductWithExpOfDiffOf(sliceOutputGrad, input, output);
}
break;

case ElementWiseOperator::opMin:
case ElementWiseOperator::opMax:
auto input = Input(inputIndex)->ValueTensorFor(rank, fr);
auto output = ValueTensorFor(rank, fr.AllowBroadcast());

// POTENTIAL PROBLEM:
// For ReduceMin/Max there are combinations of input values where the gradient is not defined because the function has an edge at these points.
// E.g. for ReduceMin this is the case when the minimum input value is attained by several inputs at the same time.
// In these cases there is no correct gradient. The question is if this could lead to any problems.
// Let's look at two scenarios where this might happen:
//
// * Scenario 1: The input comes from a layer of nodes like e.g. ReLU and some of them might operate in the regime where they clip to a constant value.
// In this case it's not a problem that the input gradient is kind of bad as the derivative of the affected input nodes will be zero anyway.
//
// * Scenario 2: The input data is directly coming from training data. Here bad gradients don't matter as we wouldn't want to propagate gradients to the training data.
//
// So as we don't have a better solution yet and it probably has no impact, let's stay with the current solution.
// Also note that for Clip, Min, Max and ReLU we have the same kind of problem.
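// Editor's illustration: for a max-reduction over {2, 5, 5}, AddCopyIfEqualOf routes the incoming
// gradient to every element equal to the output (both 5s here), so tied positions each receive the
// full gradient rather than a share of it.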
sliceInputGrad.AddCopyIfEqualOf(input, output, sliceOutputGrad);
break;

// more coming

// "LogPlus": softmax

@@ -93,18 +126,18 @@ template <class ElemType>
// df / dx_i = 1 / (sum_j exp x_j) * exp x_i = (Softmax(x))_i = exp(x_i - ReduceLogPlus(x))
// targetGradient = gradientFromTop .* Exp (inputValue - outputValue) --TODO: verify
// i.e. compute difference of input and output, then Exp in-place. No, that would need temp memory. So it needs its own opcode AddScaledExpOfDiff(). Ternary.

// "Max": Copy the gradient only to the max value. targetGradient += gradientFromTop .* (outputValue == inputValue). Needs its own opcode. --TODO: verify
}
}

template <class ElemType>
/*virtual*/ bool ReduceElementsNode<ElemType>::OutputUsedInComputingInputNodesGradients() const /*override*/
{
switch (m_op)
switch (m_reductionOp)
{
case ElementWiseOperator::opSum: return false;
// will be different e.g. for LogPlus, Max, and Min
case ElementWiseOperator::opSum: return false;
case ElementWiseOperator::opLogSum: return true;
case ElementWiseOperator::opMin: return true;
case ElementWiseOperator::opMax: return true;
}
LogicError("Should not get here.");
}

@@ -112,25 +145,31 @@ template <class ElemType>
template <class ElemType>
/*virtual*/ bool ReduceElementsNode<ElemType>::InputUsedInComputingInputNodesGradients(size_t inputIndex) const /*override*/
{
switch (m_op)
switch (m_reductionOp)
{
case ElementWiseOperator::opSum: return false;
// will be different for LogPlus, Max, and Min
case ElementWiseOperator::opSum: return false;
case ElementWiseOperator::opLogSum: return true;
case ElementWiseOperator::opMin: return true;
case ElementWiseOperator::opMax: return true;
}
LogicError("Should not get here.");
}

// map the operation specific as a string to an ElementWiseOperator to pass to
// map the operation specified as a string to an ElementWiseOperator value.
template <class ElemType>
void ReduceElementsNode<ElemType>::ValidateOp()
{
#if 1 // legacy with initial experiments, delete this soon
if (m_operation == L"Plus") m_op = ElementWiseOperator::opSum;
if (m_operation == L"Plus") m_reductionOp = ElementWiseOperator::opSum;
else
#endif
if (m_operation == L"Sum") m_op = ElementWiseOperator::opSum;
if (m_operation == L"Sum") m_reductionOp = ElementWiseOperator::opSum;
else if (m_operation == L"LogSum") m_reductionOp = ElementWiseOperator::opLogSum;
else if (m_operation == L"Min") m_reductionOp = ElementWiseOperator::opMin;
else if (m_operation == L"Max") m_reductionOp = ElementWiseOperator::opMax;

// more here
else InvalidArgument("%ls was given an invalid operation code '%ls'. Allowed are: 'Sum'. And a few more soon.", NodeDescription().c_str(), m_operation.c_str());
else InvalidArgument("%ls was given an invalid operation code '%ls'. Allowed are: 'Sum', 'LogSum', 'Min', 'Max'.", NodeDescription().c_str(), m_operation.c_str());
}

template <class ElemType>

@@ -196,7 +196,7 @@ class ReduceElementsNode : public ComputationNode<ElemType>, public NumInputs<1>
void ValidateOp();
public:
ReduceElementsNode(DEVICEID_TYPE deviceId, const wstring& name, const std::wstring& operation = std::wstring(), int axis = 0) :
Base(deviceId, name), m_operation(operation), m_axis(axis), m_op((ElementWiseOperator)-1/*invalid*/)
Base(deviceId, name), m_operation(operation), m_axis(axis), m_reductionOp((ElementWiseOperator)-1/*invalid*/)
{
if (!m_operation.empty()) // verify validity already here out of courtesy (would otherwise be caught in Validate())
ValidateOp();

@@ -220,7 +220,7 @@ public:
private:
int m_axis;
std::wstring m_operation; // the operation as a string, e.g. "Sum", see ValidateOp()
ElementWiseOperator m_op; // the operation mapped to our internal opCode
ElementWiseOperator m_reductionOp; // the reduction operation mapped to our internal opCode
};

// -----------------------------------------------------------------------

@@ -28,7 +28,7 @@ TraceNode<ElemType>::TraceNode(const ScriptableObjects::IConfigRecordPtr configp
m_message = (const std::wstring&)configp->Get(L"say");
m_logFirst = configp->Get(L"logFirst");
m_logFrequency = configp->Get(L"logFrequency");
m_logGradientToo = false; // configp->Get(L"logGradientToo"); not yet implemented
m_logGradientToo = configp->Get(L"logGradientToo");
m_formattingOptions = WriteFormattingOptions(*configp);
m_onlyUpToRow = configp->Get(L"onlyUpToRow");
m_onlyUpToT = configp->Get(L"onlyUpToT");
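// Rough BrainScript usage sketch (editor's illustration; option names taken from the Get() calls above):
//   t = Trace (z, say='z', logFirst=10, logFrequency=100, logGradientToo=true, onlyUpToRow=9, onlyUpToT=8)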

@@ -75,7 +75,31 @@ template <class ElemType>
auto result = ValueTensorFor(rank, fr);
auto input = Input(0)->ValueTensorFor(rank, fr);
result.AssignCopyOf(input);
// log the content

// do the tracing
Log(fr, false/*means log value*/);
}

template <class ElemType>
/*virtual*/ void TraceNode<ElemType>::BackpropTo(const size_t inputIndex, const FrameRange& fr) /*override*/
{
assert(inputIndex == 0); inputIndex;

size_t rank = DetermineElementwiseTensorRank();
auto sliceOutputGrad = GradientTensorFor(rank, fr); // propagate from this one...
auto sliceInputGrad = Input(0)->GradientTensorFor(rank, fr); // ...to this one

sliceInputGrad.AddCopyOf(sliceOutputGrad);

// do the tracing
if (m_logGradientToo)
Log(fr, true/*means log gradient*/);
}

// log value or gradient
template <class ElemType>
/*virtual*/ void TraceNode<ElemType>::Log(const FrameRange& fr, bool logGradientInstead) const
{
if (m_numMBsRun == 1)
{
const auto prologue = m_formattingOptions.Processed(NodeName(), m_formattingOptions.prologue, m_numMBsRun);

@@ -94,30 +118,18 @@ template <class ElemType>
let timeRange = fr.GetTimeRange();
fprintf(stderr, "------- Trace["); // --- for better visual separability from actual content
if (fr.IsAllFrames())
fprintf(stderr, "*");
else if (timeRange.second == timeRange.first+1)
fprintf(stderr, "%d", (int)timeRange.first);
;
else if (timeRange.second == timeRange.first + 1)
fprintf(stderr, "%d", (int)timeRange.first);
else if (timeRange.second > timeRange.first + 1)
fprintf(stderr, "%d..%d", (int)timeRange.first, (int)timeRange.second-1);
fprintf(stderr, "] %ls --> %s\n", m_message.c_str(), Input(0)->FormatOperationPrototype("").c_str());
fprintf(stderr, "] %ls %s--> %s\n", m_message.c_str(), logGradientInstead ? "(gradient) " : "", Input(0)->FormatOperationPrototype("").c_str());
Input(0)->WriteMinibatchWithFormatting(stderr, fr, m_onlyUpToRow, m_onlyUpToT, m_formattingOptions.transpose, m_formattingOptions.isCategoryLabel, m_formattingOptions.isSparse, m_labelMapping,
sequenceSeparator, sequencePrologue, sequenceEpilogue, elementSeparator, sampleSeparator,
valueFormatString, /*outputGradient=*/false);
valueFormatString, logGradientInstead);
}
}

template <class ElemType>
/*virtual*/ void TraceNode<ElemType>::BackpropTo(const size_t inputIndex, const FrameRange& fr) /*override*/
{
assert(inputIndex == 0); inputIndex;

size_t rank = DetermineElementwiseTensorRank();
auto sliceOutputGrad = GradientTensorFor(rank, fr); // propagate from this one...
auto sliceInputGrad = Input(0)->GradientTensorFor(rank, fr); // ...to this one

sliceInputGrad.AddCopyOf(sliceOutputGrad);
}

template <class ElemType>
/*virtual*/ void TraceNode<ElemType>::Validate(bool isFinalValidationPass) // override
{

@@ -47,6 +47,9 @@ public:
virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; }
virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; }

private:
void Log(const FrameRange& fr, bool logGradientInstead) const;

private:
// configuration
std::wstring m_message;

@@ -1534,8 +1534,8 @@ template class DropoutNode<float>;
template class DropoutNode<double>;

// -----------------------------------------------------------------------
// BatchNormalizationNode (input, scale, bias, runMean, runInvStdDev, spatial,
// normalizationTimeConstant = 0, blendTimeConstant = 0,
// BatchNormalizationNode (input, scale, bias, runMean, runInvStdDev,
// spatial, normalizationTimeConstant = 0, blendTimeConstant = 0,
// epsilon = 0.00001,
// useCntkEngine = true, imageLayout = 'cudnn')
//

@@ -1553,51 +1553,48 @@ template class DropoutNode<double>;
// where gamma and beta are trainable parameters (represented as LearnableParameter).
//
// * input is the input of the batch normalization node
// * scale is a LearnableParameter that stores scale vector(gamma term in the equation above).
// * bias is a LearnableParameter that stores bias vector(beta term). scale and bias must have the same dimensions which must be equal
// * scale is a LearnableParameter that stores scale vector (gamma term in the equation above).
// * bias is a LearnableParameter that stores bias vector (beta term). scale and bias must have the same dimensions which must be equal
// to the input dimensions in case of spatial = false or number of output convolution feature maps in case of spatial = true.
// * runMean is the running mean which is used during evaluation phase and might be used during training as well.
// It is represented as a LearnableParameter with the same dimensions as scale and bias.
// * runInvStdDev is the running inverse square root of variance (so InvStdDev = 1 / sqrt(var + epsilon)).
// It is represented as a LearnableParameter with the same dimensions as scale and bias.
// * spatial is a flag that specifies whether to compute mean / var for each feature in a minibatch independently or, in case of convolutional layers, per feature map.
// TODO: This must be configured in a generic fashion where tensor axes are chosen along which parameters are tied.
// * normalizationTimeConstant is the time constant which is used to compute running average of mean and variance.
// Value 0 (default) means there will be no exponential smoothing and running mean / variance will always have values computed for the last seen mininbatch.
// Value 1#INF (infinity)means running values are "frozen" (i.e.will not be updated).
// Value 0 (default) means there will be no exponential smoothing and running mean/variance will always have values computed for the last seen minibatch.
// Value 1#INF (infinity) means running values are "frozen" (i.e. will not be updated).
// * blendTimeConstant is the time constant which allows to specify how much of running mean / var should be "blended" into mean / var of the current minibatch.
// Value 0 (default) means no blending will happen and only the current minibatch statistics will be used.
// Value 1#INF (infinity)means only running mean / var will be used(this is used, for example, in evaluation phase).
// Value 1#INF (infinity) means only running mean / var will be used (this is used, for example, in evaluation phase).
// * epsilon is a conditioner constant used in computing InvStdDev
// * useCntkEngine is a boolean flag that specifies which batch normalization implementation to use : CNTK or cuDNN - based.
// * imageLayout is the image layout.Only cudnn is supported.
// * useCntkEngine is a boolean flag that specifies which batch normalization implementation to use: CNTK or cuDNN-based.
// * imageLayout is the image layout. Only cudnn is supported at present.
|
||||
// -----------------------------------------------------------------------
|
||||
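For orientation, the normalization that these parameters describe can be sketched as the standard batch-normalization formula (the actual equation is defined earlier in this file and not shown in this hunk; mu and sigma^2 stand for whichever mean/variance is actually used, i.e. minibatch, running, or blended statistics):

    y = \gamma \odot \frac{x - \mu}{\sqrt{\sigma^2 + \epsilon}} + \beta

where runInvStdDev caches the factor 1 / sqrt(sigma^2 + epsilon), so evaluation only needs a subtract and a multiply.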
template <class ElemType>
class BatchNormalizationNode : public ComputationNode<ElemType>, public NumInputs<5>
class BatchNormalizationNode : public ComputationNodeNonLooping<ElemType>, public NumInputs<5>, public IFreezable
{
    typedef ComputationNode<ElemType> Base;
    UsingComputationNodeMembersBoilerplate;
    static const std::wstring TypeName()
    {
        return L"BatchNormalization";
    }
    typedef ComputationNodeNonLooping<ElemType> Base; UsingComputationNodeMembersBoilerplate;
    static const std::wstring TypeName() { return L"BatchNormalization"; }

public:
    BatchNormalizationNode(DEVICEID_TYPE deviceId, const wstring& name)
        : Base(deviceId, name), m_spatial(false), m_normTimeConst(0), m_blendTimeConst(0), m_epsilon(0), m_useCntkEngine(true),
    BatchNormalizationNode(DEVICEID_TYPE deviceId, const wstring& name) :
        Base(deviceId, name), m_spatial(false), m_normTimeConst(0), m_blendTimeConst(0), m_epsilon(0), m_useCntkEngine(true),
        m_mbCount(0), m_imageLayoutKind(ImageLayoutKind::CHW)
    {
    }
    BatchNormalizationNode(DEVICEID_TYPE deviceId, const wstring& name, bool spatial, double normalizationTimeConstant, double blendTimeConstant,
                           double epsilon, bool useCntkEngine, ImageLayoutKind imageLayoutKind)
        : Base(deviceId, name), m_spatial(spatial), m_normTimeConst(normalizationTimeConstant), m_blendTimeConst(blendTimeConstant),
          m_epsilon(epsilon), m_useCntkEngine(useCntkEngine), m_imageLayoutKind(imageLayoutKind), m_mbCount(0)
                           double epsilon, bool useCntkEngine, ImageLayoutKind imageLayoutKind) :
        Base(deviceId, name), m_spatial(spatial), m_normTimeConst(normalizationTimeConstant), m_blendTimeConst(blendTimeConstant),
        m_epsilon(epsilon), m_useCntkEngine(useCntkEngine), m_imageLayoutKind(imageLayoutKind), m_mbCount(0)
    {
    }
    BatchNormalizationNode(const ScriptableObjects::IConfigRecordPtr configp)
        : BatchNormalizationNode(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"spatial"),
                                 configp->Get(L"normalizationTimeConstant"), configp->Get(L"blendTimeConstant"),
                                 configp->Get(L"epsilon"), configp->Get(L"useCntkEngine"),
                                 ImageLayoutKindFrom(configp->Get(L"imageLayout")))
    BatchNormalizationNode(const ScriptableObjects::IConfigRecordPtr configp) :
        BatchNormalizationNode(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"spatial"),
                               configp->Get(L"normalizationTimeConstant"), configp->Get(L"blendTimeConstant"),
                               configp->Get(L"epsilon"), configp->Get(L"useCntkEngine"),
                               ImageLayoutKindFrom(configp->Get(L"imageLayout")))
    {
        AttachInputsFromConfig(configp, this->GetExpectedNumInputs());
    }

@ -1689,46 +1686,110 @@ public:
        }
    }

    void BackpropTo(const size_t inputIndex, const FrameRange& fr) override
private: // time-constant conversions

    // map time constants to exp avg factor
    // This is the factor for the current MB's estimate (1-factor is used for the previous value of the running stats).
    double ComputeExpAvgFactor() const
    {
        // in inference mode, only use long-term mean and do not update running estimates
        if (!Environment().IsTraining())
            return 0; // (m_normTimeConst == infinity) no new contribution from current minibatch

        // REVIEW alexeyk: hack, m_normTimeConst < 0 is used to denote corpus-level statistics (without forgetting factor).
        if (m_normTimeConst < 0)
            return 1.0 / (1.0 + m_mbCount); // (this is the hack case) TODO: verify this formula; shouldn't we use #samples instead of MB count?

        // Convert to per-minibatch factor. The limit, positive infinity, means that running mean/var parameters are "frozen",
        // that is, do not require updates.
        // The code below special-cases two boundary cases, but those are just the limit cases of the main formula.
        double numSamples = (double)GetMBLayout()->GetActualNumSamples();
        if (!isfinite(m_normTimeConst))                      // infinite
            return 0;                                        // no new contribution from current minibatch (infinitely long memory)
        else if (m_normTimeConst > 0)                        // not zero
            return 1.0 - exp(-numSamples / m_normTimeConst); // interpolate expAvgFactor * MB stats + (1-expAvgFactor) * prev running stats
        else                                                 // zero
            return 1.0;                                      // don't use running stats at all
    }

    // map sample count to blend factor
    // This is the interpolation weight for the running statistics (the current MB statistics are weighted with 1-this).
    double ComputeBlendFactor() const
    {
        // in inference mode, only use long-term mean and do not update running estimates
        if (!Environment().IsTraining())
            return 1.0; // (m_blendTimeConst == infinity) estimate is taken 100% from the long-term running estimate

        // convert to blend factor (= weight for running stats)
        // The code below special-cases two boundary cases, but those are just the limit cases of the main formula.
        double numSamples = (double)GetMBLayout()->GetActualNumSamples();
        if (!isfinite(m_blendTimeConst))                               // infinite weight for prior stats
            return 1.0;                                                // only use running statistics
        else if (m_blendTimeConst > 0)                                 // not zero
            return m_blendTimeConst / (m_blendTimeConst + numSamples); // interpolate blendFactor * running stats + (1-blendFactor) * MB stats
        else                                                           // zero
            return 0;                                                  // no weight for prior stats, only use MB stats
    }
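To make the two conversions concrete, here is a small self-contained sketch (illustrative values only; this is not repository code) of what they produce for a 256-sample minibatch:

    #include <cmath>
    #include <cstdio>

    int main()
    {
        double numSamples     = 256;  // samples in the current minibatch (assumed)
        double normTimeConst  = 5000; // normalizationTimeConstant
        double blendTimeConst = 0;    // blendTimeConstant (default)

        // weight of the current MB statistics in the running estimate
        double expAvgFactor = 1.0 - std::exp(-numSamples / normTimeConst);    // ~0.05
        // weight of the running statistics when normalizing the current MB
        double blendFactor = blendTimeConst / (blendTimeConst + numSamples);  // 0: use MB stats only
        std::printf("expAvgFactor=%g blendFactor=%g\n", expAvgFactor, blendFactor);
    }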
public:

    // Note: This function assumes that inputIndex=0 is called before the others.
    // BUGBUG: The node should not make assumptions in which order the inputs' derivatives are computed. It currently assumes to start with 0.
    // BUGBUG: If the input has no learnables (e.g. using BN instead of corpus mean/var norm), this will not be called for inputIndex=0 at all.
    virtual void BackpropToNonLooping(size_t inputIndex) override
    {
        FrameRange fr(Input(0)->GetMBLayout());

        if (inputIndex == 0) // derivative with respect to the input.
        {
            auto sliceOutputGrad = GradientFor(fr);
            auto sliceInputValue = Input(0)->ValueFor(fr);
            const Matrix<ElemType>& scale = Input(1)->Value();
            const Matrix<ElemType>& bias = Input(2)->Value();
            const Matrix<ElemType>& runMean = Input(3)->Value();
            const Matrix<ElemType>& runInvStdDev = Input(4)->Value();

            auto sliceInputGrad = Input(0)->GradientFor(fr);
            m_dScale->Resize(scale);
            // The mean used in Forward() is either saveMean or runMean.
            // This is decided by the engine, which communicates back the decision by returning
            // an empty saveMean in case runMean should be used. Likewise for stddev.
            let& actualMean = !m_saveMean->IsEmpty() ? *m_saveMean : runMean; // empty if only the running mean is used
            let& actualInvStdDev = !m_saveInvStdDev->IsEmpty() ? *m_saveInvStdDev : runInvStdDev;
            m_dScale->Resize(scale); // gradients for scale and bias get stored here
            m_dBias->Resize(bias);

            double blendFactor = ComputeBlendFactor(); // interpolation weight for the running statistics (the current MB statistics are weighted with 1-this)

            // Compute all derivatives in one step. Save derivatives with respect to scale and bias in temp matrices.
            m_bnEng->Backward(sliceInputValue, sliceOutputGrad, sliceInputGrad, scale,
                              *m_saveMean, *m_saveInvStdDev, *m_dScale, *m_dBias);
            m_bnEng->Backward(sliceInputValue, sliceOutputGrad, // (in)  input from below, gradient from above
                              sliceInputGrad,                   // (out) gradient for data input goes here
                              scale,                            // (in)  out of scale and bias, only scale is needed in gradient propagation
                              blendFactor,                      // (in)  smoothing weight for running stats (1=use only running stats)
                              actualMean, actualInvStdDev,      // (in)  actual mean/stddev values used in ForwardProp()
                              *m_dScale, *m_dBias);             // (out) gradients for scale and bias
        }
        else if (inputIndex == 1) // derivative with respect to the scale
        {
            // Derivative with respect to the scale was precomputed during input derivative computation.
            Matrix<ElemType>& grad = Input(1)->Gradient();
            grad.SetValue(grad.GetNumRows(), grad.GetNumCols(), grad.GetDeviceId(), m_dScale->Data());
            // BUGBUG: ^^ This should add the gradient, not overwrite it.
        }
        else if (inputIndex == 2) // derivative with respect to the bias
        {
            // Derivative with respect to the bias was precomputed during input derivative computation.
            Matrix<ElemType>& grad = Input(2)->Gradient();
            grad.SetValue(grad.GetNumRows(), grad.GetNumCols(), grad.GetDeviceId(), m_dBias->Data());
            // BUGBUG: ^^ Also here, this should add the gradient, not overwrite it.
        }
        // No derivatives with respect to running mean and InvStdDev.
    }
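Both BUGBUG notes mark the same issue: SetValue() overwrites any gradient already accumulated by other consumers of the parameter. A sketch of the accumulating variant the comments ask for (a hypothetical fix, assuming the usual c += alpha * a semantics of Matrix<ElemType>::ScaleAndAdd; not what the code currently does):

    // accumulate rather than overwrite, so other parents' contributions survive:
    Matrix<ElemType>::ScaleAndAdd((ElemType)1, *m_dScale, Input(1)->Gradient());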
    virtual bool OutputUsedInComputingInputNodesGradients() const override
    {
        // The BatchNormalizationNode does not require its output value for computing
        // the gradients of its input nodes
        return false;
    }
    virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; }

    void ForwardProp(const FrameRange& fr) override
    virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override
    {
        FrameRange fr(Input(0)->GetMBLayout());

        Matrix<ElemType> sliceInputValue = Input(0)->ValueFor(fr);

        const Matrix<ElemType>& scale = Input(1)->Value();

@ -1744,42 +1805,16 @@ public:
        Matrix<ElemType> sliceOutputValue = ValueFor(fr);

        double expAvgFactor;
        double blendFactor;
        if (!Environment().IsTraining())
        {
            expAvgFactor = 0;
            blendFactor = 1.0;
        // determine the factors from the time constants
        double expAvgFactor = ComputeExpAvgFactor(); // weight for the new MB statistics in the running estimate. The previous value of the running statistics is kept with weight (1-this)
        double blendFactor  = ComputeBlendFactor();  // interpolation weight for the running statistics (the current MB statistics are weighted with 1-this)

            m_saveMean->Resize(0, 0);
            m_saveInvStdDev->Resize(0, 0);
        }
        else
        {
            double numSamples = (double)GetMBLayout()->GetActualNumSamples();
            if (m_normTimeConst > 0)
            {
                // Convert to per-minibatch factor. Treat positivie infinity as if running mean/var parameters are "frozen"
                // that is, do not require updates.
                expAvgFactor = !isfinite(m_normTimeConst) ? 0 : (1.0 - exp(-numSamples / m_normTimeConst));
            }
            else
            {
                // REVIEW alexeyk: hack, m_normTimeConst < 0 is used to compute CMA.
                expAvgFactor = (m_normTimeConst < 0) ? (1.0 / (1.0 + m_mbCount)) : 1.0;
            }

            if (!isfinite(m_blendTimeConst))
                blendFactor = 1.0;
            else
                blendFactor = m_blendTimeConst > 0 ? (m_blendTimeConst / (m_blendTimeConst + numSamples)) : 0;

            m_saveMean->Resize(runMean);
            m_saveInvStdDev->Resize(runMean);
        }

        m_bnEng->Forward(sliceInputValue, scale, bias, expAvgFactor, blendFactor, runMean, runInvStdDev,
                         sliceOutputValue, m_epsilon, *m_saveMean, *m_saveInvStdDev);
        m_bnEng->Forward(/*in=*/ sliceInputValue, scale, bias, // (in)
                         expAvgFactor, blendFactor,
                         runMean, runInvStdDev,                // (in/out) running estimates, updated from the current MB mean/stddev
                         /*out=*/ sliceOutputValue,            // (out) batch-normalized output value
                         m_epsilon,
                         *m_saveMean, *m_saveInvStdDev);       // (out) actual interpolated mean/stddev values. Note: unused/empty for blendFactor==1 for CNTK engine

        m_mbCount++;
    }
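In effect (assuming the engine implements the blending the argument comments describe), the statistics used to normalize the current minibatch are the interpolation

    \mu_{used} = blendFactor \cdot \mu_{run} + (1 - blendFactor) \cdot \mu_{MB}

while the running estimate is updated as \mu_{run} \leftarrow expAvgFactor \cdot \mu_{MB} + (1 - expAvgFactor) \cdot \mu_{run}, and likewise for the variance terms.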
@ -1820,25 +1855,25 @@ public:
    void RequestMatricesBeforeForwardProp(MatrixPool& matrixPool) override
    {
        Base::RequestMatricesBeforeForwardProp(matrixPool);
        RequestMatrixFromPool(m_saveMean, matrixPool);
        RequestMatrixFromPool(m_saveInvStdDev, matrixPool);
    }

    void RequestMatricesBeforeBackprop(MatrixPool& matrixPool) override
    {
        Base::RequestMatricesBeforeBackprop(matrixPool);
        RequestMatrixFromPool(m_dScale, matrixPool);
        RequestMatrixFromPool(m_dBias, matrixPool);
    }

    void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool) override
    {
        Base::ReleaseMatricesAfterBackprop(matrixPool);
        ReleaseMatrixToPool(m_saveMean, matrixPool);
        ReleaseMatrixToPool(m_saveInvStdDev, matrixPool);
        ReleaseMatrixToPool(m_dScale, matrixPool);
        ReleaseMatrixToPool(m_dBias, matrixPool);
    }

    void SetNormalizationTimeConstants(double normalizationTimeConstant, double prevNormalizationTimeConstant,
                                       double blendTimeConstant, double prevBlendTimeConstant)
@ -1851,6 +1886,20 @@ public:
        m_blendTimeConst = blendTimeConstant;
    }

    // called from CloneFunction(..., parameters="constant")
    // Once called, this node is put into inference mode.
    virtual void FreezeParameters() override // from IFreezable
    {
        m_normTimeConst  = std::numeric_limits<double>::infinity();
        m_blendTimeConst = std::numeric_limits<double>::infinity();
    }
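    // (Note: freezing relies on the limit behavior of the time-constant conversions above. With both
    // constants at infinity, ComputeExpAvgFactor() returns 0 and ComputeBlendFactor() returns 1,
    // i.e. the running statistics are no longer updated and are used exclusively, which is exactly
    // the inference-mode setting.)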
    double NormalizationTimeConstant() const { return m_normTimeConst; }
    double BlendTimeConstant() const { return m_blendTimeConst; }
    bool Spatial() const { return m_spatial; }
    double Epsilon() const { return m_epsilon; }
    bool UseCNTKEngine() const { return m_useCntkEngine; }

private:
    // Old versioning - do not use. Do not remove until we're sure there are no old models around.
    struct VersionInfo
@ -1865,36 +1914,51 @@ private:
    VersionInfo m_version;

private:
    // --- configuration parameters

    // Determines whether to use per-activation (used after non-convolutional layers like fully connected)
    // or spatial (used after convolutional layers).
    // TODO: This should not be a config option, but rather inferred from dimensions of the Parameters.
    bool m_spatial;
    // Time constant for running mean and variance.

    // Time constant for estimating the running mean and variance.
    // This is the time constant of a low-pass filter.
    // If 0, running mean and variance just remember the last minibatch.
    // If infinity, running mean and variance are not updated, like in inference mode.
    double m_normTimeConst;
    // Time constant for blending running mean/var and current minibatch mean/var.
    // The main idea is to represent current minibatch statistics as MAP estimate, linear interpolation
    // of smoothed and minibatch statistics.

    // Equivalent sample count for blending running mean/var and current minibatch mean/var.
    // Roughly, this specifies how many samples the running statistics are "worth",
    // relative to the current minibatch statistics.
    // If 0, only use the current MB statistics. If infinity, use only the running mean, like in inference mode.
    // The main idea is to estimate the mean/variance as a MAP estimate using the running mean/var as a prior.
    // This should make the method more robust to the case of very small minibatches,
    // and also provides a meaningful interpretation of inference mode, where only the prior is used.
    // Effectively, this ends up in a linear interpolation of running and minibatch statistics.
    // The idea is due to Frank Seide et al.
    // It should also work well in data parallelism scenario
    // as opposed to plain vanilla BN implementation which would require aggregation of statistics
    // from all nodes.
    // It should also work well in a data parallelism scenario, as opposed to a plain vanilla BN implementation
    // which would require aggregation of statistics from all nodes.
    // REVIEW alexeyk: if this works, document it properly in Wiki.
    double m_blendTimeConst;
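A compact restatement of the MAP/blending idea in the comment above (this is a restatement, not code from the repository): with blend time constant T_b acting as an equivalent sample count and N samples in the minibatch,

    \hat{\mu} = \frac{T_b \, \mu_{run} + N \, \mu_{MB}}{T_b + N} = blendFactor \cdot \mu_{run} + (1 - blendFactor) \cdot \mu_{MB}, \qquad blendFactor = \frac{T_b}{T_b + N}

which is exactly the weight ComputeBlendFactor() computes.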
    // Epsilon used to compute inverse std deviation.
    double m_epsilon;
    // Whether to use CNTK or cuDNN BN implementation.
    bool m_useCntkEngine;
    // Layout (e.g. CHW).
    ImageLayoutKind m_imageLayoutKind;

    // --- working variables

    // Minibatch count, used to compute cumulative moving average.
    size_t m_mbCount;

    // Stores pre-computed on forward pass mean values that are used in gradient computation.
    // Interpolated actual mean/stddev values. Pre-computed on forward pass, also used in gradient computation.
    shared_ptr<Matrix<ElemType>> m_saveMean;
    // Stores pre-computed on forward pass InvStdDev values that are used in gradient computation.
    shared_ptr<Matrix<ElemType>> m_saveInvStdDev;
    // Stores scale derivatives
    // Temp buffer for scale and bias derivatives. Only used in BackpropTo(), carrying info from first call to subsequent calls.
    // Not used for blendFactor=1 in CNTK engine.
    shared_ptr<Matrix<ElemType>> m_dScale;
    // Stores bias derivatives.
    shared_ptr<Matrix<ElemType>> m_dBias;

    std::unique_ptr<BatchNormEngine<ElemType>> m_bnEng;

@ -321,15 +321,17 @@ void CNTKEvalExtended<ElemType>::ForwardPassT(const std::vector<ValueBuffer<Elem
        RuntimeError("Expected %d outputs, but got %d.", (int)m_outputNodes.size(), (int)outputs.size());

    size_t i = 0;
    for (auto& input : m_inputMatrices)
    for (auto& inputNode : m_inputNodes)
    {
        // const cast: The matrix class takes this over without copying and could theoretically change the contents,
        // though it doesn't in this case.
        auto& buffer = const_cast<ValueBuffer<ElemType, ValueContainer>&>(inputs[i]);
        shared_ptr<Matrix<ElemType>> matrix = dynamic_pointer_cast<Matrix<ElemType>>(input.second.matrix);
        auto matrix = dynamic_pointer_cast<Matrix<ElemType>>(inputNode->ValuePtr());
        auto type = matrix->GetMatrixType();
        size_t numRows = input.second.sampleLayout.GetNumElements();
        size_t numRows = inputNode->GetSampleLayout().GetNumElements();

        if (buffer.m_buffer.data() == nullptr)
            RuntimeError("Input %ls: Buffer is not allocated.", m_inputNodes[i]->GetName().c_str());
        if (type == MatrixType::DENSE)
        {
            if (buffer.m_buffer.size() % numRows != 0)
@ -340,8 +342,12 @@ void CNTKEvalExtended<ElemType>::ForwardPassT(const std::vector<ValueBuffer<Elem
        }
        else if (type == MatrixType::SPARSE)
        {
            if (buffer.m_colIndices.data() == nullptr)
                RuntimeError("Input %ls: Due to sparse input format, expected colIndices array, but was nullptr.", m_inputNodes[i]->GetName().c_str());
            if (buffer.m_indices.data() == nullptr)
                RuntimeError("Input %ls: Due to sparse input format, expected Indices array, but was nullptr.", m_inputNodes[i]->GetName().c_str());
            if (buffer.m_colIndices.size() < 2)
                RuntimeError("Input %ls: Expected at least one element.", m_inputNodes[i]->GetName().c_str());
                RuntimeError("Input %ls: Expected at least one element (2 entries in colIndices array).", m_inputNodes[i]->GetName().c_str());
            if (buffer.m_colIndices[0] != 0)
                RuntimeError("Input %ls: First element of column indices must be 0", m_inputNodes[i]->GetName().c_str());
            if (buffer.m_colIndices[buffer.m_colIndices.size() - 1] != buffer.m_indices.size())
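For illustration, a buffer that passes these checks, in the CSC-style layout the code expects (values and dimensions are made up; the field names follow the ValueBuffer members used above): one input with three sparse columns and four nonzeros would carry

    m_buffer     = { 1.0, 2.0, 3.0, 4.0 };  // nonzero values
    m_indices    = { 0, 4, 2, 1 };          // row index of each nonzero
    m_colIndices = { 0, 2, 3, 4 };          // starts at 0; column j spans [m_colIndices[j], m_colIndices[j+1]); last entry == m_indices.size()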
@ -352,8 +358,8 @@ void CNTKEvalExtended<ElemType>::ForwardPassT(const std::vector<ValueBuffer<Elem

        int numCols = type == MatrixType::DENSE ? buffer.m_buffer.size() / numRows : buffer.m_colIndices.size() - 1;
        assert(numCols >= 1);
        input.second.pMBLayout->Init(1, numCols);
        input.second.pMBLayout->AddSequence(0, 0, 0, numCols);
        inputNode->GetMBLayout()->Init(1, numCols);
        inputNode->GetMBLayout()->AddSequence(0, 0, 0, numCols);

        if (type == MatrixType::DENSE)
            matrix->SetValue(numRows, numCols, matrix->GetDeviceId(), buffer.m_buffer.data(), matrixFlagNormal);

@ -14,6 +14,11 @@
#include <msclr\marshal_cppstd.h>

#include "CNTKException.h"
#pragma warning(push)
#pragma warning(disable : 4793) // Function compiled as native
#include "Basics.h"
#include "ScriptableObjects.h"
#pragma warning(pop)
#include "EvalCommon.h"
#include "Eval.h"

@ -250,7 +255,14 @@ public:
            outputNodeNames.push_back(context.marshal_as<std::wstring>(output));
        }

        m_eval->StartForwardEvaluation(outputNodeNames);
        try
        {
            m_eval->StartForwardEvaluation(outputNodeNames);
        }
        catch (const exception& ex)
        {
            throw GetCustomException(ex);
        }
    }

    //
@ -354,6 +366,11 @@ private:
        {
            return gcnew CNTKBadAllocException(gcnew System::String(ex.what()));
        }
        else if (dynamic_cast<const ScriptableObjects::ScriptingException*>(&ex) != nullptr) // Includes derived classes
        {
            const auto& err = dynamic_cast<const ScriptableObjects::ScriptingException&>(ex);
            return gcnew CNTKLogicErrorException(gcnew System::String(wstrprintf(L"%ls\n%ls", utf16(err.what()).c_str(), err.GetError(L"").c_str()).c_str()), nullptr);
        }
        else
        {
            return gcnew CNTKException(gcnew System::String(ex.what()));

@ -56,6 +56,8 @@
    </ClCompile>
    <Link>
      <AdditionalLibraryDirectories>$(OutDir)</AdditionalLibraryDirectories>
      <AdditionalDependencies>EvalDLL.lib;Common.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
      <DelayLoadDLLs>EvalDll.dll</DelayLoadDLLs>
    </Link>
  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="$(DebugBuild)">

@ -66,10 +68,6 @@
    </ClCompile>
    <Link>
      <GenerateDebugInformation>true</GenerateDebugInformation>
      <DelayLoadDLLs>
      </DelayLoadDLLs>
      <AdditionalDependencies Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;EvalDLL.lib;%(AdditionalDependencies)</AdditionalDependencies>
      <AdditionalDependencies Condition="'$(Configuration)|$(Platform)'=='Debug_CpuOnly|x64'">kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;EvalDLL.lib;%(AdditionalDependencies)</AdditionalDependencies>
    </Link>
  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="$(ReleaseBuild)">

@ -79,10 +77,6 @@
    </ClCompile>
    <Link>
      <GenerateDebugInformation>true</GenerateDebugInformation>
      <DelayLoadDLLs>
      </DelayLoadDLLs>
      <AdditionalDependencies Condition="'$(Configuration)|$(Platform)'=='Release|x64'">kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;EvalDLL.lib;%(AdditionalDependencies)</AdditionalDependencies>
      <AdditionalDependencies Condition="'$(Configuration)|$(Platform)'=='Release_CpuOnly|x64'">kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;EvalDLL.lib;%(AdditionalDependencies)</AdditionalDependencies>
    </Link>
  </ItemDefinitionGroup>
  <ItemGroup>

@ -25,8 +25,6 @@ void BatchNormEngine<ElemType>::Forward(const Mat& in, const Mat& scale, const M
        assert(m_inOutT.GetNumElements() == bias.GetNumRows());
        assert(m_inOutT.GetNumElements() == runMean.GetNumRows());
        assert(m_inOutT.GetNumElements() == runInvStdDev.GetNumRows());
        assert(saveMean.GetNumElements() == 0 || m_inOutT.GetNumElements() == saveMean.GetNumRows());
        assert(saveInvStdDev.GetNumElements() == 0 || m_inOutT.GetNumElements() == saveInvStdDev.GetNumRows());
    }
    else
    {

@ -34,26 +32,35 @@ void BatchNormEngine<ElemType>::Forward(const Mat& in, const Mat& scale, const M
        assert((m_inOutT.GetNumElements() % bias.GetNumRows()) == 0);
        assert((m_inOutT.GetNumElements() % runMean.GetNumRows()) == 0);
        assert((m_inOutT.GetNumElements() % runInvStdDev.GetNumRows()) == 0);
        assert(saveMean.GetNumElements() == 0 || (m_inOutT.GetNumElements() % saveMean.GetNumRows()) == 0);
        assert(saveInvStdDev.GetNumElements() == 0 || (m_inOutT.GetNumElements() % saveInvStdDev.GetNumRows()) == 0);
    }
    assert(scale.GetNumCols() == 1);
    assert(bias.GetNumCols() == 1);
    assert(runMean.GetNumCols() == 1);
    assert(runInvStdDev.GetNumCols() == 1);
    assert(saveMean.GetNumElements() == 0 || saveMean.GetNumCols() == 1);
    assert(saveInvStdDev.GetNumElements() == 0 || saveInvStdDev.GetNumCols() == 1);

    EnsureCompatible();
    ForwardCore(in, scale, bias, expAvgFactor, blendFactor, runMean, runInvStdDev, out, epsilon, saveMean, saveInvStdDev);

    if (!m_spatial)
    {
        assert(saveMean.GetNumElements() == 0 || m_inOutT.GetNumElements() == saveMean.GetNumRows());
        assert(saveInvStdDev.GetNumElements() == 0 || m_inOutT.GetNumElements() == saveInvStdDev.GetNumRows());
    }
    else
    {
        assert(saveMean.GetNumElements() == 0 || (m_inOutT.GetNumElements() % saveMean.GetNumRows()) == 0);
        assert(saveInvStdDev.GetNumElements() == 0 || (m_inOutT.GetNumElements() % saveInvStdDev.GetNumRows()) == 0);
    }
    assert(saveMean.GetNumElements() == 0 || saveMean.GetNumCols() == 1);
    assert(saveInvStdDev.GetNumElements() == 0 || saveInvStdDev.GetNumCols() == 1);
}

template <class ElemType>
void BatchNormEngine<ElemType>::Backward(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale,
void BatchNormEngine<ElemType>::Backward(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor,
                                         const Mat& saveMean, const Mat& saveInvStdDev, Mat& scaleGrad, Mat& biasGrad)
{
    EnsureCompatible();
    BackwardCore(in, srcGrad, grad, scale, saveMean, saveInvStdDev, scaleGrad, biasGrad);
    BackwardCore(in, srcGrad, grad, scale, blendFactor, saveMean, saveInvStdDev, scaleGrad, biasGrad);
}

template <class ElemType>
@ -88,10 +95,10 @@ protected:
        in.BatchNormalizationForward(scale, bias, expAvgFactor, blendFactor, runMean, runInvStdDev, out, epsilon, saveMean, saveInvStdDev);
    }

    void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, const Mat& saveMean, const Mat& saveInvStdDev,
    void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, const Mat& saveMean, const Mat& saveInvStdDev,
                      Mat& scaleGrad, Mat& biasGrad) override
    {
        srcGrad.BatchNormalizationBackward(in, grad, scale, saveMean, saveInvStdDev, scaleGrad, biasGrad);
        srcGrad.BatchNormalizationBackward(in, grad, scale, blendFactor, saveMean, saveInvStdDev, scaleGrad, biasGrad);
    }
};

@ -128,4 +135,4 @@ std::unique_ptr<BatchNormEngine<ElemType>> BatchNormEngine<ElemType>::Create(DEV
template class BatchNormEngine<float>;
template class BatchNormEngine<double>;

} } }
}}}

@ -37,7 +37,7 @@ public:
    void Forward(const Mat& in, const Mat& scale, const Mat& bias, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runInvStdDev,
                 Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev);

    void Backward(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, const Mat& saveMean, const Mat& saveInvStdDev,
    void Backward(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, const Mat& saveMean, const Mat& saveInvStdDev,
                  Mat& scaleGrad, Mat& biasGrad);

    static std::unique_ptr<BatchNormEngine<ElemType>> Create(DEVICEID_TYPE deviceId, const TensorShape& inOutT,

@ -55,10 +55,11 @@ protected:
    virtual void EnsureCompatible() = 0;

    // saveMean/saveInvStdDev return the actual mean/stddev used for normalization, except for blendFactor=1, where these are unused and untouched
    virtual void ForwardCore(const Mat& in, const Mat& scale, const Mat& bias, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runInvStdDev,
                             Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev) = 0;

    virtual void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, const Mat& saveMean, const Mat& saveInvStdDev,
    virtual void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, const Mat& saveMean, const Mat& saveInvStdDev,
                              Mat& scaleGrad, Mat& biasGrad) = 0;

protected:

@ -70,4 +71,4 @@ protected:

#pragma warning(pop)

} } }
}}}

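A minimal sketch of how a caller drives this interface end to end (hypothetical driver code; the Create parameters beyond deviceId and inOutT, and all variable names, are assumptions, since the rest of the signature is cut off in this hunk):

    auto engine = BatchNormEngine<float>::Create(deviceId, inOutT, spatial, imageLayoutKind);
    engine->Forward(in, scale, bias, expAvgFactor, blendFactor, runMean, runInvStdDev,
                    out, /*epsilon=*/1e-5, saveMean, saveInvStdDev);
    // saveMean/saveInvStdDev may come back empty; fall back to the running stats, as the node does:
    const Mat& mean   = saveMean.GetNumElements()      > 0 ? saveMean      : runMean;
    const Mat& stddev = saveInvStdDev.GetNumElements() > 0 ? saveInvStdDev : runInvStdDev;
    engine->Backward(in, srcGrad, grad, scale, blendFactor, mean, stddev, scaleGrad, biasGrad);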
@ -9,6 +9,7 @@
#include <emmintrin.h>
#include <tmmintrin.h>
#include <immintrin.h>
#include <smmintrin.h>
#include <assert.h>
#include <cstdint>
#include <iostream>

@ -32,8 +32,10 @@
#include <vld.h>
#endif

#pragma warning(disable : 4100) // unreferenced formal parameter; "struct TensorOpReduction<ElemType, OPFN, typename ReductionOp, N, -1>" triggers this
#pragma warning(disable : 4127) // conditional expression is constant; "if (sizeof(ElemType)==sizeof(float))" triggers this
#pragma warning(disable : 4702) // unreachable code; triggered for unknown reasons
#pragma warning(disable : 4244) // conversion from 'double' to 'float'

#ifdef USE_ACML
// Download ACML 5.3.1 (e.g., acml5.3.1-ifort64.exe) or above
@ -4418,13 +4420,16 @@ void CPUMatrix<ElemType>::BatchNormalizationForward(const CPUMatrix<ElemType>& s
                                                    CPUMatrix<ElemType>& runMean, CPUMatrix<ElemType>& runInvStdDev, CPUMatrix<ElemType>& out, double epsilon,
                                                    CPUMatrix<ElemType>& saveMean, CPUMatrix<ElemType>& saveInvStdDev) const
{
    UNUSED(epsilon); UNUSED(saveMean); UNUSED(saveInvStdDev);
    UNUSED(epsilon);

    assert((GetNumRows() % scale.GetNumRows()) == 0);

    if (expAvgFactor != 0 || blendFactor != 1)
        RuntimeError("Batch normalization training on CPU is not yet implemented.");

    saveMean.Resize(0, 0); // only doing inference: these two are not produced
    saveInvStdDev.Resize(0, 0);

    bool spatial = GetNumRows() != scale.GetNumRows();
    if (spatial)
    {
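(Given the guard above, the CPU implementation is inference-only: callers must arrive with expAvgFactor == 0 and blendFactor == 1, which is precisely what ComputeExpAvgFactor() and ComputeBlendFactor() return when Environment().IsTraining() is false.)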
@ -4453,10 +4458,11 @@ void CPUMatrix<ElemType>::BatchNormalizationForward(const CPUMatrix<ElemType>& s
}

template <class ElemType>
void CPUMatrix<ElemType>::BatchNormalizationBackward(const CPUMatrix<ElemType>& in, CPUMatrix<ElemType>& grad, const CPUMatrix<ElemType>& scale, const CPUMatrix<ElemType>& saveMean, const CPUMatrix<ElemType>& saveInvStdDev,
void CPUMatrix<ElemType>::BatchNormalizationBackward(const CPUMatrix<ElemType>& in, CPUMatrix<ElemType>& grad, const CPUMatrix<ElemType>& scale, double blendFactor,
                                                     const CPUMatrix<ElemType>& saveMean, const CPUMatrix<ElemType>& saveInvStdDev,
                                                     CPUMatrix<ElemType>& scaleGrad, CPUMatrix<ElemType>& biasGrad) const
{
    UNUSED(in); UNUSED(grad); UNUSED(scale); UNUSED(saveMean); UNUSED(saveInvStdDev); UNUSED(scaleGrad); UNUSED(biasGrad);
    UNUSED(in); UNUSED(grad); UNUSED(scale); UNUSED(blendFactor); UNUSED(saveMean); UNUSED(saveInvStdDev); UNUSED(scaleGrad); UNUSED(biasGrad);
    RuntimeError("Batch normalization training on CPU is not yet implemented.");
}

@ -6042,35 +6048,38 @@ int CPUMatrix<ElemType>::SetNumThreads(int numThreads)

// perform loop over reduction index m
// This function is declared inside a wrapper struct to allow partial specialization (m = -1).
template <class ElemType, typename OPFN, size_t N, int m>
template <class ElemType, typename OPFN, typename ReductionOp, size_t N, int m>
struct TensorOpReduction
{
    // reduction case (non-reduction case is specialized)
    static inline ElemType Loop(array<ElemType*, N> pointers, const OPFN& opfn,
    static inline ElemType Loop(array<ElemType*, N> pointers, const OPFN& opfn, const ReductionOp& reductionOp,
                                const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides)
    {
        array<ptrdiff_t, N - 1> strides;   // N-1 because last one is the result pointer, which is unused in reduction
        for (size_t i = 0; i < N - 1; i++) // N = a small constant, this will be unrolled
            strides[i] = reducingStrides[i][(size_t) m];
        double /*ElemType*/ aggregate = 0;
        for (size_t dim = reducingOpDims[(size_t) m]; dim-- > 0;)

        double aggregate = TensorOpReduction<ElemType, OPFN, ReductionOp, N, m - 1>::Loop(pointers, opfn, reductionOp, reducingOpDims, reducingStrides);
        for (size_t dim = reducingOpDims[(size_t)m] - 1; dim-- > 0;)
        {
            // need to descend into one loop deeper
            aggregate += TensorOpReduction<ElemType, OPFN, N, m - 1>::Loop(pointers, opfn, reducingOpDims, reducingStrides);
            // advance the pointers
            for (size_t i = 0; i < N - 1; i++)
                pointers[i] += strides[i]; // note: last pointer (result) is unused and untouched here

            // need to descend into one loop deeper
            aggregate = reductionOp(aggregate, TensorOpReduction<ElemType, OPFN, ReductionOp, N, m - 1>::Loop(pointers, opfn, reductionOp, reducingOpDims, reducingStrides));
        }
        return (ElemType) aggregate;
        // Actually it would be nicer to return double, but we keep ElemType so that tests don't return different numbers than the previous implementation.
        return static_cast<ElemType>(aggregate);
    }
};

// perform loop over reduction index m
// This is the specialized version for m = -1, which terminates the recursion.
template <class ElemType, typename OPFN, size_t N>
struct TensorOpReduction<ElemType, OPFN, N, -1>
template <class ElemType, typename OPFN, typename ReductionOp, size_t N>
struct TensorOpReduction<ElemType, OPFN, ReductionOp, N, -1>
{
    static inline ElemType Loop(array<ElemType*, N> pointers, const OPFN& opfn,
    static inline ElemType Loop(array<ElemType*, N> pointers, const OPFN& opfn, const ReductionOp& reductionOp,
                                const SmallVector<size_t>&, const array<SmallVector<ptrdiff_t>, N>&)
    {
        return opfn(pointers); // finally we are doing some work!!!
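The recursion is easier to see stripped of the tensor machinery. A sketch (not repository code) of the same seed-then-fold pattern over a single strided dimension:

    #include <cstddef>

    // Reduce dim elements at stride 'stride': seed the aggregate with the first
    // element, then fold the remaining dim-1 through reductionOp, exactly as the
    // rewritten Loop() above seeds from the recursive call before its loop.
    template <typename Reduce>
    double ReduceDim(const double* p, ptrdiff_t stride, size_t dim, const Reduce& reductionOp)
    {
        double aggregate = *p;           // dim >= 1 assumed
        for (size_t d = 1; d < dim; d++) // remaining dim-1 elements
        {
            p += stride;
            aggregate = reductionOp(aggregate, *p);
        }
        return aggregate;
    }

Seeding with the first element (instead of starting from 0 and using +=) is what makes non-additive reductions such as min and max come out right.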
@ -6082,10 +6091,10 @@ struct TensorOpReduction<ElemType, OPFN, N, -1>
// -----------------------------------------------------------------------

// perform loop over regular index k and reducing index m for N operands (counting the output)
template <class ElemType, typename OPFN, size_t N, bool vectorizable, int m, int k>
template <class ElemType, typename OPFN, typename ReductionOp, size_t N, bool vectorizable, int m, int k>
struct TensorOpIteration
{
    static inline void Loop(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, const OPFN& opfn,
    static inline void Loop(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, const OPFN& opfn, const ReductionOp& reductionOp,
                            const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, N>& regularStrides,
                            const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides)
    {
@ -6096,7 +6105,7 @@ struct TensorOpIteration
        for (size_t dim = regularOpDims[(size_t) k]; dim-- > 0;)
        {
            // need to descend into one loop deeper
            TensorOpIteration<ElemType, OPFN, N, vectorizable, m, k - 1>::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
            TensorOpIteration<ElemType, OPFN, ReductionOp, N, vectorizable, m, k - 1>::Loop(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
            // advance the pointers
            for (size_t i = 0; i < N; i++)
                pointers[i] += strides[i];

@ -6106,10 +6115,10 @@ struct TensorOpIteration

// Special version for innermost loop with strides all being 1 and no further reduction. Compiler can use SSE.
// This is a very common case, e.g. adding vectors or computing the Sigmoid.
template <class ElemType, typename OPFN>
struct TensorOpIteration<ElemType, OPFN, 3, true /*vectorizable*/, -1 /*no reduction*/, 0 /*innermost loop*/>
template <class ElemType, typename OPFN, typename ReductionOp>
struct TensorOpIteration<ElemType, OPFN, ReductionOp, 3, true /*vectorizable*/, -1 /*no reduction*/, 0 /*innermost loop*/>
{
    static inline void Loop(ElemType beta, array<ElemType*, 3> pointers, ElemType alpha, const OPFN& opfn,
    static inline void Loop(ElemType beta, array<ElemType*, 3> pointers, ElemType alpha, const OPFN& opfn, const ReductionOp& reductionOp,
                            const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 3>& regularStrides,
                            const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 3>& reducingStrides)
    {
@ -6121,25 +6130,25 @@ struct TensorOpIteration<ElemType, OPFN, 3, true /*vectorizable*/, -1 /*no reduc
        if (beta != 0)
#pragma omp parallel for
            for (int k = 0; k < (int) K; k++)
                TensorOpIteration<ElemType, OPFN, 3, true /*vectorizable*/, -1 /*no reduction*/, -1 /*scalar*/>::Loop(beta, array<ElemType*, 3>{pa + k, pb + k, pc + k}, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
                TensorOpIteration<ElemType, OPFN, ReductionOp, 3, true /*vectorizable*/, -1 /*no reduction*/, -1 /*scalar*/>::Loop(beta, array<ElemType*, 3>{pa + k, pb + k, pc + k}, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
        else if (alpha != 1)
#pragma omp parallel for
            for (int k = 0; k < (int) K; k++)
                TensorOpIteration<ElemType, OPFN, 3, true /*vectorizable*/, -1 /*no reduction*/, -1 /*scalar*/>::Loop(0, array<ElemType*, 3>{pa + k, pb + k, pc + k}, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
                TensorOpIteration<ElemType, OPFN, ReductionOp, 3, true /*vectorizable*/, -1 /*no reduction*/, -1 /*scalar*/>::Loop(0, array<ElemType*, 3>{pa + k, pb + k, pc + k}, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
        else
#pragma omp parallel for
            for (int k = 0; k < (int) K; k++)
                TensorOpIteration<ElemType, OPFN, 3, true /*vectorizable*/, -1 /*no reduction*/, -1 /*scalar*/>::Loop(0, array<ElemType*, 3>{pa + k, pb + k, pc + k}, 1, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
                TensorOpIteration<ElemType, OPFN, ReductionOp, 3, true /*vectorizable*/, -1 /*no reduction*/, -1 /*scalar*/>::Loop(0, array<ElemType*, 3>{pa + k, pb + k, pc + k}, 1, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
        // TODO: According to Amit, the VS compiler is not able to vectorize into lambdas. Solution: change the lambda to take an N, or to implement the loop inside (with 1 element by default).
        // TODO: The signedness of k (required for omp) causes an extra sign-extend.
        // TODO: OMP adds LOTS of overhead. Do we need a guard, a min size when to use it?
    }
};
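// (The three-way branch above exists so that the common cases reach the scalar
// specialization with the literal constants 0 and 1: conceptually each output element is
// out = beta * out + alpha * reduce(op(inputs)), and passing beta=0 / alpha=1 lets the
// compiler drop the read-modify-write of the output and the scaling multiply.)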
// and unary
template <class ElemType, typename OPFN>
struct TensorOpIteration<ElemType, OPFN, 2, true /*vectorizable*/, -1 /*no reduction*/, 0 /*innermost loop*/>
template <class ElemType, typename OPFN, typename ReductionOp>
struct TensorOpIteration<ElemType, OPFN, ReductionOp, 2, true /*vectorizable*/, -1 /*no reduction*/, 0 /*innermost loop*/>
{
    static inline void Loop(ElemType beta, array<ElemType*, 2> pointers, ElemType alpha, const OPFN& opfn,
    static inline void Loop(ElemType beta, array<ElemType*, 2> pointers, ElemType alpha, const OPFN& opfn, const ReductionOp& reductionOp,
                            const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
                            const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides)
    {
@ -6150,27 +6159,27 @@ struct TensorOpIteration<ElemType, OPFN, 2, true /*vectorizable*/, -1 /*no reduc
        if (beta != 0)
#pragma omp parallel for
            for (int k = 0; k < (int) K; k++)
                TensorOpIteration<ElemType, OPFN, 2, true /*vectorizable*/, -1 /*no reduction*/, -1 /*scalar*/>::Loop(beta, array<ElemType*, 2>{pa + k, pb + k}, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
                TensorOpIteration<ElemType, OPFN, ReductionOp, 2, true /*vectorizable*/, -1 /*no reduction*/, -1 /*scalar*/>::Loop(beta, array<ElemType*, 2>{pa + k, pb + k}, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
        else if (alpha != 1)
#pragma omp parallel for
            for (int k = 0; k < (int) K; k++)
                TensorOpIteration<ElemType, OPFN, 2, true /*vectorizable*/, -1 /*no reduction*/, -1 /*scalar*/>::Loop(0, array<ElemType*, 2>{pa + k, pb + k}, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
                TensorOpIteration<ElemType, OPFN, ReductionOp, 2, true /*vectorizable*/, -1 /*no reduction*/, -1 /*scalar*/>::Loop(0, array<ElemType*, 2>{pa + k, pb + k}, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
        else
#pragma omp parallel for
            for (int k = 0; k < (int) K; k++)
                TensorOpIteration<ElemType, OPFN, 2, true /*vectorizable*/, -1 /*no reduction*/, -1 /*scalar*/>::Loop(0, array<ElemType*, 2>{pa + k, pb + k}, 1, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
                TensorOpIteration<ElemType, OPFN, ReductionOp, 2, true /*vectorizable*/, -1 /*no reduction*/, -1 /*scalar*/>::Loop(0, array<ElemType*, 2>{pa + k, pb + k}, 1, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
    }
};

template <class ElemType, typename OPFN, size_t N, bool vectorizable, int m>
struct TensorOpIteration<ElemType, OPFN, N, vectorizable, m, -1>
template <class ElemType, typename OPFN, typename ReductionOp, size_t N, bool vectorizable, int m>
struct TensorOpIteration<ElemType, OPFN, ReductionOp, N, vectorizable, m, -1>
{
    static inline void Loop(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, const OPFN& opfn,
    static inline void Loop(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, const OPFN& opfn, const ReductionOp& reductionOp,
                            const SmallVector<size_t>&, const array<SmallVector<ptrdiff_t>, N>&,
                            const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides)
    {
        // we are at element level for the result: perform the op (there may still be reduction)
        ElemType val = TensorOpReduction<ElemType, OPFN, N, m>::Loop(pointers, opfn, reducingOpDims, reducingStrides);
        ElemType val = TensorOpReduction<ElemType, OPFN, ReductionOp, N, m>::Loop(pointers, opfn, reductionOp, reducingOpDims, reducingStrides);
        // scale
        val *= alpha;
        // combine with previous value in target matrix, then write it out
@ -6188,8 +6197,8 @@ struct TensorOpIteration<ElemType, OPFN, N, vectorizable, m, -1>
// -----------------------------------------------------------------------

// tensor operation with k+1 dimensions (-1 means scalar)
template <class ElemType, typename OPFN, size_t N, int k>
static void TensorOpWithRegularLoop(ElemType beta, const array<ElemType*, N>& pointers, ElemType alpha, const OPFN& opfn,
template <class ElemType, typename OPFN, typename ReductionOp, size_t N, int k>
static void TensorOpWithRegularLoop(ElemType beta, const array<ElemType*, N>& pointers, ElemType alpha, const OPFN& opfn, ReductionOp reductionOp,
                                    const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, N>& regularStrides,
                                    const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides)
{

@ -6197,9 +6206,9 @@ static void TensorOpWithRegularLoop(ElemType beta, const array<ElemType*, N>& po
    switch (dims)
    {
    case 2:
        return TensorOpIteration<ElemType, OPFN, N, false /*vectorizable*/, 1, k>::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
        return TensorOpIteration<ElemType, OPFN, ReductionOp, N, false /*vectorizable*/, 1, k>::Loop(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
    case 1:
        return TensorOpIteration<ElemType, OPFN, N, false /*vectorizable*/, 0, k>::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
        return TensorOpIteration<ElemType, OPFN, ReductionOp, N, false /*vectorizable*/, 0, k>::Loop(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
    case 0:
    {
        // if all leading dimensions are 1, we can let the compiler do some unrolling
@ -6207,9 +6216,9 @@ static void TensorOpWithRegularLoop(ElemType beta, const array<ElemType*, N>& po
        for (size_t i = 0; i < N; i++)
            leadingAllOne &= k >= 0 && regularStrides[i][0] == 1;
        if (leadingAllOne) // special version that uses a hard-coded increment of 1 for all leading dimensions
            return TensorOpIteration<ElemType, OPFN, N, true /*vectorizable*/, -1, k>::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
            return TensorOpIteration<ElemType, OPFN, ReductionOp, N, true /*vectorizable*/, -1, k>::Loop(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
        else
            return TensorOpIteration<ElemType, OPFN, N, false /*vectorizable*/, -1, k>::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
            return TensorOpIteration<ElemType, OPFN, ReductionOp, N, false /*vectorizable*/, -1, k>::Loop(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
    }
    default:
        LogicError("TensorOp: %d non-flattened reduction dimensions are not supported.", (int) dims);

@ -6218,11 +6227,11 @@ static void TensorOpWithRegularLoop(ElemType beta, const array<ElemType*, N>& po

// tensor operation, generalized in number of arguments, operation already provided as a lambda
// This function now expands into different k.
template <class ElemType, typename OPFN, size_t N>
static void TensorOpWithFn(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, const OPFN& opfn,
                           const array<size_t, N>& offsets,
                           const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, N>& regularStrides,
                           const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides)
template <class ElemType, typename OPFN, typename ReductionOp, size_t N>
static void TensorOpWithFnAndReduction(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, const OPFN& opfn, const ReductionOp& reductionOp,
                                       const array<size_t, N>& offsets,
                                       const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, N>& regularStrides,
                                       const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides)
{
    for (size_t i = 0; i < N; i++) // N = a small constant, this will be unrolled
        pointers[i] += offsets[i];
@ -6230,17 +6239,50 @@ static void TensorOpWithFn(ElemType beta, array<ElemType*, N> pointers, ElemType
    switch (dims)
    {
    case 4:
        return TensorOpWithRegularLoop<ElemType, OPFN, N, 3>(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
        return TensorOpWithRegularLoop<ElemType, OPFN, ReductionOp, N, 3>(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
    case 3:
        return TensorOpWithRegularLoop<ElemType, OPFN, N, 2>(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
        return TensorOpWithRegularLoop<ElemType, OPFN, ReductionOp, N, 2>(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
    case 2:
        return TensorOpWithRegularLoop<ElemType, OPFN, N, 1>(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
        return TensorOpWithRegularLoop<ElemType, OPFN, ReductionOp, N, 1>(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
    case 1:
        return TensorOpWithRegularLoop<ElemType, OPFN, N, 0>(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
        return TensorOpWithRegularLoop<ElemType, OPFN, ReductionOp, N, 0>(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
    case 0:
        return TensorOpWithRegularLoop<ElemType, OPFN, N, -1>(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
        return TensorOpWithRegularLoop<ElemType, OPFN, ReductionOp, N, -1>(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
    default:
        LogicError("TensorOp: %d non-flattened input dimensions are not supported.", (int) dims);
        LogicError("TensorOp: %d non-flattened input dimensions are not supported.", (int)dims);
    }
}

// tensor operation, generalized in number of arguments, operation already provided as a lambda
// This function now expands into different reductionOps.
template <class ElemType, typename OPFN, size_t N>
static void TensorOpWithFn(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, const OPFN& opfn, ElementWiseOperator reductionOp,
                           const array<size_t, N>& offsets,
                           const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, N>& regularStrides,
                           const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides)
{
    // BUGBUG: Always using 'double' as the type of the aggregator, even for ElemType==float. Reason: otherwise some e2e tests would fail, as historically we
    //         used double for the sum aggregator. But:
    //         * for min and max reductions this is meaningless.
    //         * It is not consistent with what we do on the GPU, where we aggregate on ElemType.
    //         * It costs performance.
    //         TODO: adapt e2e tests to run with an aggregator of type ElemType.
#define CaseTensorOpWithFnAndReduction(oper)                                                  \
    case ElementWiseOperator::op##oper:                                                       \
        return TensorOpWithFnAndReduction(beta, pointers, alpha, opfn, [](double a, double b) \
                                          {                                                   \
                                              return Op##oper(a, b);                          \
                                          },                                                  \
                                          offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides)

    switch (reductionOp)
    {
        CaseTensorOpWithFnAndReduction(Sum);
        CaseTensorOpWithFnAndReduction(LogSum);
        CaseTensorOpWithFnAndReduction(Min);
        CaseTensorOpWithFnAndReduction(Max);
    default:
        LogicError("Specified ElementWiseOperator op %d not supported as reduction operation.", (int)reductionOp);
    }
}

@ -6256,8 +6298,11 @@ void CPUMatrix<ElemType>::TensorOp(ElemType beta, const CPUMatrix<ElemType>& a,
                                   const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
                                   const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides)
{
    if (reductionOp != ElementWiseOperator::opSum) // TODO: enable the reduction ops
        InvalidArgument("TensorOp: Unary reduction operations other than opSum not yet implemented.");
    if (reductionOp != ElementWiseOperator::opSum &&
        reductionOp != ElementWiseOperator::opLogSum &&
        reductionOp != ElementWiseOperator::opMin &&
        reductionOp != ElementWiseOperator::opMax)
        InvalidArgument("TensorOp: Unary reduction operations other than opMax, opMin, opSum, and opLogSum are not implemented.");

// TODO: Change the lambda to take a pointer and a number of elements, so that we can pass it 1 or 4 elements, in order for it to SSE-vectorize.
#define CaseUnaryTensorOp(oper) \

@ -6266,7 +6311,7 @@ void CPUMatrix<ElemType>::TensorOp(ElemType beta, const CPUMatrix<ElemType>& a,
        { \
            return Op##oper((*(pp[0]))); \
        }, \
        offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides)
        reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides)

    array<ElemType*, 2> pointers = {a.Data(), Data()};
    switch (op)
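Taken together with the dispatch above, the CPU path can now be asked for log-sum, min, and max reductions, not just sum. A hypothetical call shape (argument names follow the signatures above; this is not repository code):

    // out = (reduce with opLogSum over the reducing dimensions of Sigmoid(a)), scaled by alpha:
    out.TensorOp(/*beta=*/0, a, /*alpha=*/1,
                 ElementWiseOperator::opSigmoid, // elementwise op, expanded via CaseUnaryTensorOp
                 ElementWiseOperator::opLogSum,  // reduction op, newly accepted by the guard above
                 offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);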
@ -6294,7 +6339,7 @@ void CPUMatrix<ElemType>::TensorOp(ElemType beta, const CPUMatrix<ElemType>& a,
        { \
            return Op##oper((*(pp[0])), (*(pp[1]))); \
        }, \
        offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides)
        reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides)

    array<ElemType*, 3> pointers = {a.Data(), b.Data(), Data()};
    switch (op)

@ -6322,7 +6367,7 @@ void CPUMatrix<ElemType>::TensorOp(ElemType beta, const CPUMatrix<ElemType>& a,
        { \
            return Op##oper((*(pp[0])), (*(pp[1])), (*(pp[2]))); \
        }, \
        offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides)
        reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides)

    array<ElemType*, 4> pointers = {a.Data(), b.Data(), c.Data(), Data()};
    switch (op)
@ -6359,11 +6404,33 @@ template void CPUMatrix<char>::SetValue(CPUMatrix<char> const&);
template void CPUMatrix<char>::RequireSize(const size_t numRows, const size_t numCols, bool growOnly);
template void CPUMatrix<char>::Resize(const size_t numRows, const size_t numCols, bool growOnly);
template char* CPUMatrix<char>::CopyToArray(void) const;

template void CPUMatrix<char>::CopySection(size_t numRows, size_t numCols, char* dst, size_t colStride) const;
template void CPUMatrix<char>::Reshape(const size_t, const size_t);

// Support <short>
template CPUMatrix<short>::CPUMatrix(const size_t numRows, const size_t numCols);
template CPUMatrix<short>::CPUMatrix(const size_t numRows, const size_t numCols, short* pArray, const size_t matrixFlags);
template CPUMatrix<short>::CPUMatrix();
template CPUMatrix<short>::CPUMatrix(CPUMatrix<short> const&);
template CPUMatrix<short>::CPUMatrix(CPUMatrix<short>&&);
template size_t CPUMatrix<short>::LocateElement(size_t, size_t) const;
template CPUMatrix<short>::~CPUMatrix();
template CPUMatrix<short> CPUMatrix<short>::ColumnSlice(size_t startColumn, size_t numCols) const;
template CPUMatrix<short>& CPUMatrix<short>::operator=(CPUMatrix<short>&&);
template void CPUMatrix<short>::SetValue(const short);
template void CPUMatrix<short>::SetValue(const size_t numRows, const size_t numCols, short* pArray, size_t matrixFlags);
template void CPUMatrix<short>::SetValue(CPUMatrix<short> const&);
//template void CPUMatrix<short>::SetValue(GPUMatrix<short> const&);
//template void CPUMatrix<short>::SetValue(CPUSparseMatrix<short> const&);
//template void CPUMatrix<short>::SetValue(GPUSparseMatrix<short> const&);
template void CPUMatrix<short>::RequireSize(const size_t numRows, const size_t numCols, bool growOnly);
template void CPUMatrix<short>::Resize(const size_t numRows, const size_t numCols, bool growOnly);
template short* CPUMatrix<short>::CopyToArray(void) const;
template void CPUMatrix<short>::CopySection(size_t numRows, size_t numCols, short* dst, size_t colStride) const;
template void CPUMatrix<short>::Reshape(const size_t, const size_t);

template CPUMatrix<int>::CPUMatrix(const size_t, const size_t, int*, const size_t);
template CPUMatrix<int>::~CPUMatrix();

}}}

@ -375,7 +375,7 @@ public:
|
|||
|
||||
void BatchNormalizationForward(const CPUMatrix<ElemType>& scale, const CPUMatrix<ElemType>& bias, double expAvgFactor, double blendFactor, CPUMatrix<ElemType>& runMean, CPUMatrix<ElemType>& runInvStdDev,
|
||||
CPUMatrix<ElemType>& out, double epsilon, CPUMatrix<ElemType>& saveMean, CPUMatrix<ElemType>& saveInvStdDev) const;
|
||||
void BatchNormalizationBackward(const CPUMatrix<ElemType>& in, CPUMatrix<ElemType>& grad, const CPUMatrix<ElemType>& scale, const CPUMatrix<ElemType>& saveMean, const CPUMatrix<ElemType>& saveInvStdDev,
|
||||
void BatchNormalizationBackward(const CPUMatrix<ElemType>& in, CPUMatrix<ElemType>& grad, const CPUMatrix<ElemType>& scale, double blendFactor, const CPUMatrix<ElemType>& saveMean, const CPUMatrix<ElemType>& saveInvStdDev,
|
||||
CPUMatrix<ElemType>& scaleGrad, CPUMatrix<ElemType>& biasGrad) const;
|
||||
|
||||
public:
|
||||
|
|
|
@ -781,6 +781,7 @@ void CPUSparseMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPU
|
|||
}
|
||||
}
|
||||
|
||||
// TODO: Implement CSR as a transposition of b, like we do for GPU.
|
||||
if (rhs.GetFormat() != matrixFormatSparseCSC)
|
||||
NOT_IMPLEMENTED;
|
||||
|
||||
|
@ -820,13 +821,42 @@ void CPUSparseMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPU
|
|||
}
|
||||
}
|
||||
}
|
||||
// the transposeA case is copy-paste from above with rows/cols of lhs swapped
|
||||
else if (transposeA && !transposeB)
|
||||
{
|
||||
NOT_IMPLEMENTED;
|
||||
for (size_t j = 0; j < rhs.GetNumCols(); j++)
|
||||
{
|
||||
size_t start = rhs.SecondaryIndexLocation()[j]; // ColLocation
|
||||
size_t end = rhs.SecondaryIndexLocation()[j + 1];
|
||||
for (size_t p = start; p < end; p++)
|
||||
{
|
||||
size_t i = rhs.MajorIndexLocation()[p]; // RowLocation
|
||||
ElemType val = rhs.Buffer()[p];
|
||||
|
||||
for (size_t h = 0; h < lhs.GetNumCols(); h++)
|
||||
{
|
||||
c(h, j) += alpha * lhs(i, h) * val;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
else if (transposeA && transposeB)
|
||||
{
|
||||
NOT_IMPLEMENTED;
|
||||
for (size_t j = 0; j < rhs.GetNumCols(); j++)
|
||||
{
|
||||
size_t start = rhs.SecondaryIndexLocation()[j];
|
||||
size_t end = rhs.SecondaryIndexLocation()[j + 1];
|
||||
|
||||
for (size_t p = start; p < end; p++)
|
||||
{
|
||||
size_t i = rhs.MajorIndexLocation()[p];
|
||||
ElemType val = rhs.Buffer()[p];
|
||||
for (size_t h = 0; h < lhs.GetNumCols(); h++)
|
||||
{
|
||||
c(h, i) += alpha * lhs(j, h) * val;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
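Aside: the loops above walk a CSC (compressed sparse column) matrix: SecondaryIndexLocation() holds the per-column offsets and MajorIndexLocation() the row index of each nonzero. A standalone sketch of the same traversal over hypothetical plain arrays (not CNTK's accessors):

#include <cstdio>
int main()
{
    // 3x3 CSC example, nonzeros stored column by column:
    //     [10  0 20]
    // A = [ 0 30  0]
    //     [40  0 50]
    double val[]    = {10, 40, 30, 20, 50}; // nonzero values
    int    rowIdx[] = {0, 2, 1, 0, 2};      // row of each nonzero (MajorIndexLocation)
    int    colPtr[] = {0, 2, 3, 5};         // start of each column (SecondaryIndexLocation)
    for (int j = 0; j < 3; j++)
        for (int p = colPtr[j]; p < colPtr[j + 1]; p++)
            std::printf("A(%d,%d) = %g\n", rowIdx[p], j, val[p]);
    return 0;
}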
@ -1475,6 +1505,29 @@ template CPUMatrix<char> CPUSparseMatrix<char>::CopyColumnSliceToDense(size_t st
template void CPUSparseMatrix<char>::AssignColumnSliceToDense(CPUMatrix<char>&, size_t startColumn, size_t numCols) const;
template CPUSparseMatrix<char>& CPUSparseMatrix<char>::operator=(const CPUSparseMatrix<char>& deepCopyFrom);

// Support <short>
template CPUSparseMatrix<short>::CPUSparseMatrix(const MatrixFormat format, const size_t numRows, const size_t numCols, const size_t size);
template CPUSparseMatrix<short>::CPUSparseMatrix(MatrixFormat);
template CPUSparseMatrix<short>::CPUSparseMatrix(CPUSparseMatrix<short> const&);
template CPUSparseMatrix<short>::CPUSparseMatrix(CPUSparseMatrix<short>&&);
template CPUSparseMatrix<short>& CPUSparseMatrix<short>::operator=(CPUSparseMatrix<short>&& moveFrom);
template void CPUSparseMatrix<short>::SetValue(size_t, size_t, short);
//template void CPUSparseMatrix<short>::SetValue(CPUMatrix<short> const&);
//template void CPUSparseMatrix<short>::SetValue(GPUMatrix<short> const&);
template void CPUSparseMatrix<short>::SetValue(CPUSparseMatrix<short> const&);
//template void CPUSparseMatrix<short>::SetValue(GPUSparseMatrix<short> const&);
template short* CPUSparseMatrix<short>::Data() const;
template short* CPUSparseMatrix<short>::Data();
template void CPUSparseMatrix<short>::Reset(void);
template void CPUSparseMatrix<short>::Resize(const size_t, const size_t, const size_t, const bool);
template void CPUSparseMatrix<short>::RequireSizeAndAllocate(const size_t, const size_t, const size_t, const bool, bool);
template void CPUSparseMatrix<short>::RequireSizeAndAllocate(const size_t, const size_t, const size_t, const MatrixFormat, const bool, bool);
template CPUSparseMatrix<short>::~CPUSparseMatrix();
template CPUSparseMatrix<short> CPUSparseMatrix<short>::ColumnSlice(size_t startColumn, size_t numCols) const;
template CPUMatrix<short> CPUSparseMatrix<short>::CopyColumnSliceToDense(size_t startColumn, size_t numCols) const;
template void CPUSparseMatrix<short>::AssignColumnSliceToDense(CPUMatrix<short>&, size_t startColumn, size_t numCols) const;
template CPUSparseMatrix<short>& CPUSparseMatrix<short>::operator=(const CPUSparseMatrix<short>& deepCopyFrom);

template CPUSparseMatrix<int>::CPUSparseMatrix(const MatrixFormat, const size_t, const size_t, const size_t);
template CPUSparseMatrix<int>::~CPUSparseMatrix();

@ -110,11 +110,11 @@ __device__ __forceinline__ T Shuffle(T input, int srcLane)
{
#ifdef __CUDA_ARCH__
// shfl is supported only on Kepler+
static_assert(__CUDA_ARCH__ >= 300, "CNTK only supports only Kepler GPU architecture or newer");
static_assert(__CUDA_ARCH__ >= 300, "CNTK only supports Kepler GPU architecture or newer.");
return cub::ShuffleIndex(input, srcLane);
#else
assert(false);
return input;
return input; // keep compiler happy
#endif
}
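Aside: Shuffle() above wraps cub::ShuffleIndex; Kepler's shfl instructions let the threads of a warp exchange register values without shared memory. A hedged sketch of a warp-wide sum built on shuffles, using the modern *_sync intrinsic rather than the cub call used here:

__device__ float WarpSum(float v) // assumes a full 32-thread warp
{
    for (int offset = 16; offset > 0; offset /= 2)
        v += __shfl_down_sync(0xffffffff, v, offset); // add the value held 'offset' lanes up
    return v; // lane 0 now holds the warp total
}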
@ -163,8 +163,12 @@ void Call(size_t vectorSize, Targs... args)
// As a result, each block has 2 * blockDim.x (mean and inverse stddev) values to write at the end.
//
template <int BlockDimX, int BlockDimY, int U, typename ElemType>
__global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize, const ElemType* x, double expAvgFactor, ElemType* runMean, ElemType* runInvStdDev,
double epsilon, ElemType* xMean, ElemType* xInvStdDev)
__global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize,
const ElemType* x, // (in) input data
double expAvgFactor,
ElemType* runMean, ElemType* runInvStdDev, // (in/out) running mean/stddev, gets updated with current minibatch
double epsilon,
ElemType* xMean, ElemType* xInvStdDev) // (out) this minibatch's mean
{
static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32).");

@ -182,9 +186,12 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize, con
return;
assert(irowSrcBase + U <= vectorSize);

// --- estimate this minibatch's mean/stddev

// first estimate mean over all data for this thread
int n = 0;
ElemType mean[U];
ElemType m2[U];
ElemType mean[U]; // this thread's part of the mean vector (stored as a normalized mean also during accumulation)
ElemType m2[U]; // likewise for stdev
#pragma unroll
for (int k = 0; k < U; k++)
{

@ -207,12 +214,13 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize, con
ElemType d = curVal[k] - mean[k];
// REVIEW alexeyk: we enabled fast CUDA math in CNTK so division below will be approximate, is this a problem?
// Using precise math slows down the code by about 40%.
mean[k] += d / n;
mean[k] += d / n; // mean_n = [mean_{n-1} * (n-1) + curVal] / n = mean_{n-1} *n/n - mean_{n-1} / n + curVal / n
m2[k] += d * (curVal[k] - mean[k]);
}
psrc += vectorSize * BlockDimY;
}

// now reduce minibatch mean/stddev across threads
const int tid = threadIdx.y * BlockDimX + threadIdx.x;
const int laneId = tid & 0x1f;
// First, reduce within warp using shuffle.
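Aside: the accumulation above is Welford's online algorithm: each sample moves the running mean by d/n and accumulates squared deviations via d * (x - mean). A standalone scalar sketch of the same recurrence:

#include <cmath>
#include <cstdio>
int main()
{
    double data[] = {1.0, 2.0, 4.0, 7.0};
    double mean = 0, m2 = 0;
    int n = 0;
    for (double x : data)
    {
        n++;
        double d = x - mean;
        mean += d / n;        // running mean
        m2 += d * (x - mean); // running sum of squared deviations
    }
    // here mean = 3.5, m2 = 21, population variance = m2 / n = 5.25
    std::printf("mean=%g variance=%g invStdDev=%g\n", mean, m2 / n, 1.0 / std::sqrt(m2 / n));
    return 0;
}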
@ -259,6 +267,8 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize, con
}
__syncthreads();

// --- final reduction and update of running mean/stddev

// Accumulate and write final results.
// REVIEW alexeyk: see if atomicAdd can be used instead, do perf comparison.
if (threadIdx.y == 0)

@ -283,7 +293,10 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize, con
size_t idxDstBase = (blockIdx.x * BlockDimX + threadIdx.x) * U;
// Store mean and running mean.
StoreValues<U>(mean, xMean + idxDstBase);
if (expAvgFactor == 1)
// at this point, minibatch mean has been saved into xMean[]

// accumulate running mean
if (expAvgFactor == 1) // 100% comes from current minibatch, nothing from history
StoreValues<U>(mean, runMean + idxDstBase);
else
{

@ -294,6 +307,8 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize, con
run[k] = expAvgFactor * mean[k] + (1.0 - expAvgFactor) * run[k];
StoreValues<U>(run, runMean + idxDstBase);
}
// at this point, runMean[] has been updated

// Store inv std dev and its running version.
#pragma unroll
for (int k = 0; k < U; k++)

@ -301,6 +316,8 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize, con
m2[k] = Operations::RSqrt(static_cast<ElemType>(m2[k] / batchSize + epsilon));
}
StoreValues<U>(m2, xInvStdDev + idxDstBase);
// at this point, minibatch stddev has been saved into xInvStdDev[]

if (expAvgFactor == 1)
StoreValues<U>(m2, runInvStdDev + idxDstBase);
else

@ -312,6 +329,7 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize, con
run[k] = expAvgFactor * m2[k] + (1.0 - expAvgFactor) * run[k];
StoreValues<U>(run, runInvStdDev + idxDstBase);
}
// at this point, runInvStdDev[] has been updated
}
}
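Aside: the running-statistics update above is an exponential moving average; expAvgFactor == 1 discards all history and expAvgFactor == 0 leaves it unchanged. A scalar sketch:

// run_t = f * batchStat_t + (1 - f) * run_{t-1}, with 0 <= f <= 1
double UpdateRunningStat(double run, double batchStat, double expAvgFactor)
{
    return expAvgFactor * batchStat + (1.0 - expAvgFactor) * run;
}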
@ -467,8 +485,13 @@ template <int U>
struct ComputeBatchMeanAndInvStdDev
{
template <typename ElemType>
static void Call(size_t vectorSize, size_t batchSize, const ElemType* x, double expAvgFactor, ElemType* runMean, ElemType* runInvStdDev,
double epsilon, ElemType* xMean, ElemType* xInvStdDev, cudaStream_t stream)
static void Call(size_t vectorSize, size_t batchSize,
const ElemType* x, // (in) input data
double expAvgFactor,
ElemType* runMean, ElemType* runInvStdDev, // (in/out) running mean/stddev, gets updated with current minibatch
double epsilon,
ElemType* xMean, ElemType* xInvStdDev, // (out) actual interpolated mean/stddev that are used to normalize. Returned since needed in backprop.
cudaStream_t stream)
{
assert((vectorSize % U) == 0);

@ -594,8 +617,11 @@ template <int U>
struct NormalizeBatchTraining
{
template <typename ElemType>
static void Call(size_t vectorSize, size_t spatialSize, size_t batchSize, bool spatial, const ElemType* x, ElemType* y,
const ElemType* bnScale, const ElemType* bnBias, const ElemType* batchMean, const ElemType* batchInvStdDev, cudaStream_t stream)
static void Call(size_t vectorSize, size_t spatialSize, size_t batchSize, bool spatial,
const ElemType* x, ElemType* y, // (in, out) data to normalize -> normalized data
const ElemType* bnScale, const ElemType* bnBias, // (in) scale/bias to denormalize with
const ElemType* batchMean, const ElemType* batchInvStdDev, // (in) actual mean/stddev to normalize with
cudaStream_t stream)
{
assert((vectorSize % U) == 0);

@ -839,7 +865,7 @@ struct ComputeSpatialScaleAndBiasGradients
{
template <typename ElemType>
static void Call(size_t vectorSize, size_t spatialSize, size_t batchSize, const ElemType* x, const ElemType* dy,
ElemType* dScale, ElemType* dBias, const ElemType* saveMean, const ElemType* saveInvStdDev, cudaStream_t stream)
ElemType* dScale, ElemType* dBias, const ElemType* saveMean, const ElemType* saveInvStdDev, cudaStream_t stream)
{
assert((spatialSize % U) == 0);
assert((vectorSize % spatialSize) == 0);

@ -854,9 +880,10 @@ struct ComputeSpatialScaleAndBiasGradients
}
};

// mbStatsWeight is the weight with which current MB's stats were used (0 means not at all, locked model).
template <int BlockDimX, int BlockDimY, bool Spatial, int U, typename ElemType>
__global__ void kBackpropagateBatchNormGradients(int vectorSize, int spatialSize, int batchSize, const ElemType* x, const ElemType* dy, ElemType* dx,
const ElemType* bnScale, const ElemType* dScale, const ElemType* dBias,
const ElemType* bnScale, ElemType mbStatsWeight, const ElemType* dScale, const ElemType* dBias,
const ElemType* saveMean, const ElemType* saveInvStdDev)
{
static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");

@ -917,18 +944,29 @@ __global__ void kBackpropagateBatchNormGradients(int vectorSize, int spatialSize
LoadValues<U>(pdy, dyCur);
LoadValues<U>(pdx, dxCur);
// From the BN paper, dL/dxi is a sum of three terms: dL/dxi = t1 + t2 + t3
// After simplification, they become the following:
// 1. t1 = scale * dL/dyi * invStdDev
// 2. t2 = (-scale / m) * invStdDev * xHat * dL/dScale
// 3. t3 = (-scale / m) * invStdDev * dL/dBias (for this one note that Sum(xHat) == 0)
// The formulas for dBias and dScale happen to occur as subexpressions in this gradient as well.
// Leveraging this, this gradient can be simplified to:
// t1 = scale * dL/dyi * invStdDev
// t2 = mbStatsWeight * (-scale / m) * invStdDev * xHat * dL/dScale
// t3 = mbStatsWeight * (-scale / m) * invStdDev * dL/dBias (for this one note that Sum(xHat) == 0)
// with
// dBias = Reduce(dy)
// dScale = Reduce(dy * xHat)
// Simplifying this a bit more, we get the formula below.
ElemType val[U];
int m = Spatial ? batchSize * spatialSize : batchSize;
#pragma unroll
for (int k = 0; k < U; k++)
{
ElemType xNorm = (xCur[k] - mean[k]) * invStdDev[k];
val[k] = dxCur[k] + (scale[k] * invStdDev[k]) * (dyCur[k] - (xNorm * ds[k] + db[k]) / m);
ElemType xNorm = (xCur[k] - mean[k]) * invStdDev[k]; // xHat
// scale * invStdDev * (
// dL/dyi
// - mbStatsWeight * (xHat * dL/dScale + dL/dBias) / m
// )
val[k] = dxCur[k] // (adding to gradient)
+ (scale[k] * invStdDev[k]) * (
dyCur[k]
- mbStatsWeight * (xNorm * ds[k] + db[k]) / m);
}
StoreValues<U>(val, pdx);
}
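Aside: per element, the fused expression above computes dL/dx = dxPrev + scale * invStdDev * (dL/dy - mbStatsWeight * (xHat * dScale + dBias) / m). A standalone scalar restatement of the loop body:

// One element of the batch-norm input gradient, as accumulated in the kernel above.
template <class T>
T BatchNormInputGrad(T dxPrev, T x, T dy, T mean, T invStdDev, T scale,
                     T dScale, T dBias, T mbStatsWeight, int m)
{
    T xHat = (x - mean) * invStdDev; // normalized input
    return dxPrev + (scale * invStdDev) * (dy - mbStatsWeight * (xHat * dScale + dBias) / m);
}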
@ -939,25 +977,26 @@ struct BackpropagateBatchNormGradients
{
template <typename ElemType>
static void Call(size_t vectorSize, size_t spatialSize, size_t batchSize, bool spatial, const ElemType* x, const ElemType* dy, ElemType* dx,
const ElemType* bnScale, const ElemType* dScale, const ElemType* dBias, const ElemType* saveMean, const ElemType* saveInvStdDev, cudaStream_t stream)
const ElemType* bnScale, ElemType mbStatsWeight, const ElemType* dScale,
const ElemType* dBias, const ElemType* saveMean, const ElemType* saveInvStdDev, cudaStream_t stream)
{
assert((vectorSize % U) == 0);
const int BlockDimX = 32 / U;
const int BlockDimY = 4 * U;
auto bdim = dim3(BlockDimX, BlockDimY);
auto gdim = dim3(static_cast<unsigned int>(RoundUpToMultiple(vectorSize, BlockDimX * U)),
static_cast<unsigned int>(RoundUpToMultiple(batchSize, BlockDimY)));
static_cast<unsigned int>(RoundUpToMultiple(batchSize, BlockDimY)));
if (spatial)
{
kBackpropagateBatchNormGradients<BlockDimX, BlockDimY, true, U><<<gdim, bdim, 0, stream>>>(
static_cast<int>(vectorSize), static_cast<int>(spatialSize), static_cast<int>(batchSize), x, dy, dx, bnScale, dScale, dBias, saveMean, saveInvStdDev);
kBackpropagateBatchNormGradients<BlockDimX, BlockDimY, true/*spatial*/, U><<<gdim, bdim, 0, stream>>>(
static_cast<int>(vectorSize), static_cast<int>(spatialSize), static_cast<int>(batchSize), x, dy, dx, bnScale, mbStatsWeight, dScale, dBias, saveMean, saveInvStdDev);
}
else
{
kBackpropagateBatchNormGradients<BlockDimX, BlockDimY, false, U><<<gdim, bdim, 0, stream>>>(
static_cast<int>(vectorSize), static_cast<int>(spatialSize), static_cast<int>(batchSize), x, dy, dx, bnScale, dScale, dBias, saveMean, saveInvStdDev);
kBackpropagateBatchNormGradients<BlockDimX, BlockDimY, false/*not spatial*/, U><<<gdim, bdim, 0, stream>>>(
static_cast<int>(vectorSize), static_cast<int>(spatialSize), static_cast<int>(batchSize), x, dy, dx, bnScale, mbStatsWeight, dScale, dBias, saveMean, saveInvStdDev);
}
}
};

} } }
}}}

@ -96,7 +96,8 @@ enum ElementWiseOperator
opCond /*a ? b : c*/,
opClip, /*clip a within interval b..c*/
opElementwiseProductWithLogSumDerivative,
opCopyIfEqual
opCopyIfEqual,
opElementwiseProductWithExpOfDiff, /* a * exp(b - c) */
// Note: not all that's implemented in CNTK ComputationNodes has an opcode yet.
};
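Aside: the new opcode's semantics are exactly its comment, elementwise a * exp(b - c). A standalone scalar sketch (illustration only, not CNTK's op table):

#include <cmath>
template <class T>
T ElementwiseProductWithExpOfDiff(T a, T b, T c) { return a * std::exp(b - c); }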
@ -157,7 +158,8 @@ enum ElementWiseOperator
Macro(Cond); \
Macro(CopyIfEqual); \
Macro(Clip); \
Macro(ElementwiseProductWithLogSumDerivative);
Macro(ElementwiseProductWithLogSumDerivative); \
Macro(ElementwiseProductWithExpOfDiff);

// -----------------------------------------------------------------------
// various enums to describe

@ -53,32 +53,37 @@ protected:
cudnnBatchNormMode_t mode = m_spatial ? CUDNN_BATCHNORM_SPATIAL : CUDNN_BATCHNORM_PER_ACTIVATION;
// cuDNN will fail with BAD_PARAM if epsilon < CUDNN_BN_MIN_EPSILON.
epsilon = max(epsilon, CUDNN_BN_MIN_EPSILON);
// expAvgFactor == 0 && blendFactor == 1 means we are in eval mode.
// expAvgFactor == 0 && blendFactor == 1 means we are in inference mode.
if (expAvgFactor == 0 && blendFactor == 1)
{
saveMean.Resize(0, 0); // (these are not produced in this case)
saveInvStdDev.Resize(0, 0);
CUDNN_CALL(cudnnBatchNormalizationForwardInference(*m_cudnn, mode, &C::One, &C::Zero, m_inOutCuDnnT, ptr(in), m_inOutCuDnnT, ptr(out),
m_scaleBiasCuDnnT, ptr(scale), ptr(bias), ptr(runMean), ptr(runInvStdDev), epsilon));
m_scaleBiasCuDnnT, ptr(scale), ptr(bias), ptr(runMean), ptr(runInvStdDev), epsilon));
}
else
{
saveMean.Resize(runMean);
saveInvStdDev.Resize(runMean);
CUDNN_CALL(cudnnBatchNormalizationForwardTraining(*m_cudnn, mode, &C::One, &C::Zero, m_inOutCuDnnT, ptr(in),
m_inOutCuDnnT, ptr(out), m_scaleBiasCuDnnT, ptr(scale), ptr(bias), expAvgFactor, ptr(runMean), ptr(runInvStdDev),
epsilon, ptr(saveMean), ptr(saveInvStdDev)));
m_inOutCuDnnT, ptr(out), m_scaleBiasCuDnnT, ptr(scale), ptr(bias), expAvgFactor, ptr(runMean), ptr(runInvStdDev),
epsilon, ptr(saveMean), ptr(saveInvStdDev)));
}
}
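Aside: both cuDNN paths above apply the same per-element transform, y = bnScale * (x - mean) * invStdDev + bnBias; inference mode merely substitutes the running statistics for minibatch statistics. A scalar sketch:

template <class T>
T BatchNormForwardElem(T x, T mean, T invStdDev, T scale, T bias)
{
    return scale * (x - mean) * invStdDev + bias; // normalize, then denormalize with learned scale/bias
}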
void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, const Mat& saveMean, const Mat& saveInvStdDev,
void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, const Mat& saveMean, const Mat& saveInvStdDev,
Mat& scaleGrad, Mat& biasGrad) override
{
UNUSED(blendFactor); // BUGBUG: It should be used.
m_inOutCuDnnT.UpdateBatchSize(srcGrad.GetNumCols());
cudnnBatchNormMode_t mode = m_spatial ? CUDNN_BATCHNORM_SPATIAL : CUDNN_BATCHNORM_PER_ACTIVATION;
// REVIEW alexeyk: remove once Philly is upgraded to prod version. Also change betaParamDiff to 1 and update CNTK BN engine.
#if CUDNN_MAJOR >= 5 || (CUDNN_MAJOR == 4 && CUDNN_PATCHLEVEL >= 7)
CUDNN_CALL(cudnnBatchNormalizationBackward(*m_cudnn, mode, &C::One, &C::One, &C::One, &C::Zero, m_inOutCuDnnT, ptr(in), m_inOutCuDnnT, ptr(srcGrad), m_inOutCuDnnT, ptr(grad),
m_scaleBiasCuDnnT, ptr(scale), ptr(scaleGrad), ptr(biasGrad), CUDNN_BN_MIN_EPSILON, ptr(saveMean), ptr(saveInvStdDev)));
m_scaleBiasCuDnnT, ptr(scale), ptr(scaleGrad), ptr(biasGrad), CUDNN_BN_MIN_EPSILON, ptr(saveMean), ptr(saveInvStdDev)));
#else
CUDNN_CALL(cudnnBatchNormalizationBackward(*m_cudnn, mode, &C::One, &C::One, m_inOutCuDnnT, ptr(in), m_inOutCuDnnT, ptr(srcGrad), m_inOutCuDnnT, ptr(grad),
m_scaleBiasCuDnnT, ptr(scale), ptr(scaleGrad), ptr(biasGrad), CUDNN_BN_MIN_EPSILON, ptr(saveMean), ptr(saveInvStdDev)));
CUDNN_CALL(cudnnBatchNormalizationBackward(*m_cudnn, mode, &C::One, &C::One, m_inOutCuDnnT, ptr(in), m_inOutCuDnnT, ptr(srcGrad), m_inOutCuDnnT, ptr(grad),
m_scaleBiasCuDnnT, ptr(scale), ptr(scaleGrad), ptr(biasGrad), CUDNN_BN_MIN_EPSILON, ptr(saveMean), ptr(saveInvStdDev)));
#endif
}

@ -116,6 +116,44 @@ const char* CudaErrString<curandStatus>(curandStatus)

namespace Microsoft { namespace MSR { namespace CNTK {

/*static*/ bool SyncGuard::s_isSyncEnabled = false;

/*static*/ void SyncGuard::EnableSync()
{
s_isSyncEnabled = true;
}

SyncGuard::SyncGuard(bool forceSync /*= false*/)
: m_forceSync(forceSync)
{
m_done = nullptr;
if (m_forceSync || s_isSyncEnabled)
{
CUDA_CALL(cudaGetLastError());
CUDA_CALL(cudaEventCreate(&m_done));
}
}

SyncGuard::~SyncGuard()
{
if (m_forceSync || s_isSyncEnabled)
{
// The regular use of this destructor is to synchronize the GPU, but also
// to check for errors. So this destructor is where CUDA errors would be thrown.
// If this destructor runs during stack unwinding, then a different error has
// already happened that should be reported; so we only clean up the resource.
if (std::uncaught_exception())
cudaEventDestroy(m_done);
else
{
// failures in a prior launch might be reported here
CUDA_CALL(cudaEventRecord(m_done));
CUDA_CALL(cudaEventSynchronize(m_done));
CUDA_CALL(cudaEventDestroy(m_done));
}
}
}
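Aside: typical use of the guard implemented above is RAII around a kernel launch; when sync is enabled, destruction records and waits on the event so that launch failures surface at a known point. A hedged usage sketch (the kernel name is hypothetical):

template <class ElemType>
void LaunchWithSync(ElemType* data, int n)
{
    SyncGuard syncGuard; // syncs and error-checks at scope exit when enabled
    _someKernel<ElemType><<<1, 512, 0, t_stream>>>(data, n); // hypothetical kernel
}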
template <typename AllocatedElemType>
AllocatedElemType* TracingGPUMemoryAllocator::Allocate(int deviceId, size_t numRows, size_t numCols)
{

@ -1911,7 +1949,8 @@ void GPUMatrix<ElemType>::AssignNoiseContrastiveEstimation(const GPUMatrix<ElemT
while (p / 2 > width)
p = p / 2;

_computeNceOutput<ElemType><<<GetNumElements() / 2, p>>>(
// note: kernel has hard-coded dimension of 512
_computeNceOutputMax512Threads<ElemType> << <GetNumElements() / 2, p >> >(
Data(),
sampleCount,
m_numRows / 2,

@ -1925,7 +1964,8 @@ void GPUMatrix<ElemType>::AssignNoiseContrastiveEstimation(const GPUMatrix<ElemT
while (p / 2 > GetNumElements() / 2)
p = p / 2;
// summing up objective must be done in one block
_assignNoiseContrastiveEstimation<ElemType><<<1, p>>>(
// note: kernel has hard-coded dimension of 512
_assignNoiseContrastiveEstimationMax512Threads<ElemType> << <1, p >> >(
Data(),
sampleCount,
m_numRows / 2,

@ -1970,7 +2010,8 @@ void GPUMatrix<ElemType>::AssignSoftmaxSum(const GPUMatrix<ElemType>& a, GPUMatr
while (p / 2 > width)
p = p / 2;

_assignSoftmaxSum<ElemType><<<1, p>>>(
// note: kernel has hard-coded dimension of 512
_assignSoftmaxSumMax512Threads<ElemType> << <1, p >> >(
my_a.Data(),
width,
Data(),

@ -2046,7 +2087,8 @@ GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignLogSoftmaxOf(const GPUMatrix<Ele
CUDA_LONG N = (CUDA_LONG) GetNumCols();
CUDA_LONG M = (CUDA_LONG) GetNumRows();
SyncGuard syncGuard;
_assignColumnwiseLogSoftmaxOf<<<N, 512, 0, t_stream>>>(a.Data(), Data(), N, M);
// note: kernel uses hard-coded thread dimension
_assignColumnwiseLogSoftmaxOf512Threads<<<N, 512, 0, t_stream>>>(a.Data(), Data(), N, M);
}
else
{

@ -2072,7 +2114,8 @@ GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignHardmaxOf(const GPUMatrix<ElemTy
CUDA_LONG N = (CUDA_LONG) GetNumCols();
CUDA_LONG M = (CUDA_LONG) GetNumRows();
SyncGuard syncGuard;
_assignColumnwiseHardmaxOf<<<N, 512, 0, t_stream>>>(a.Data(), Data(), N, M);
// note: kernel uses hard-coded thread dimension
_assignColumnwiseHardmaxOf512Threads << <N, 512, 0, t_stream >> >(a.Data(), Data(), N, M);
}
else
{

@ -2224,7 +2267,8 @@ ElemType GPUMatrix<ElemType>::SumOfElements() const
ElemType h_sum;

// WARNING: THIS kernel is not the most efficient way!
_reductionSum<ElemType><<<1, 1024, 0, t_stream>>>(Data(), d_sum, (CUDA_LONG) GetNumElements());
// note: kernel has hard-coded dimension of 1024
_reductionSum1024Threads<ElemType> << <1, 1024, 0, t_stream >> >(Data(), d_sum, (CUDA_LONG)GetNumElements());
CUDA_CALL(cudaMemcpy(&h_sum, d_sum, sizeof(ElemType), cudaMemcpyDeviceToHost));
TracingGPUMemoryAllocator::Free<ElemType>(GetComputeDeviceId(), d_sum);
return h_sum;
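Aside: the renamed kernels encode their thread-count assumption in the name because a block reduction written for a fixed block size is only correct when launched with exactly that many threads. A standalone sketch (not CNTK's _reductionSum) of such a fixed-size block reduction:

template <class ElemType, int BlockSize>
__global__ void BlockReduceSum(const ElemType* in, ElemType* out, int n)
{
    __shared__ ElemType partial[BlockSize];
    ElemType sum = 0;
    for (int i = threadIdx.x; i < n; i += BlockSize) // stride over the input within one block
        sum += in[i];
    partial[threadIdx.x] = sum;
    __syncthreads();
    for (int stride = BlockSize / 2; stride > 0; stride /= 2) // tree reduction in shared memory
    {
        if (threadIdx.x < stride)
            partial[threadIdx.x] += partial[threadIdx.x + stride];
        __syncthreads();
    }
    if (threadIdx.x == 0)
        out[0] = partial[0];
    // must be launched as, e.g., BlockReduceSum<float, 1024><<<1, 1024>>>(in, out, n);
}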
@ -2241,7 +2285,8 @@ GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignSumOfElements(const GPUMatrix<El
PrepareDevice();
SyncGuard syncGuard;
// WARNING: THIS kernel is not the most efficient way!
_reductionSumAndAssign<ElemType><<<1, 1024>>>(Data(), a.Data(), (CUDA_LONG) a.GetNumElements(), (CUDA_LONG) GetNumElements());
// note: kernel has hard-coded dimension of 1024
_reductionSumAndAssign1024Threads<ElemType> << <1, 1024 >> >(Data(), a.Data(), (CUDA_LONG)a.GetNumElements(), (CUDA_LONG)GetNumElements());
return (*this);
}

@ -2253,7 +2298,8 @@ DeviceBoundNumber<ElemType> GPUMatrix<ElemType>::Sum_AsDeviceBoundNum() const
ElemType* d_sum = TracingGPUMemoryAllocator::Allocate<ElemType>(GetComputeDeviceId(), 1);

// WARNING: THIS kernel is not the most efficient way!
_reductionSum<ElemType><<<1, 1024, 0, t_stream>>>(Data(), d_sum, (CUDA_LONG) GetNumElements());
// note: kernel has hard-coded dimension of 1024
_reductionSum1024Threads<ElemType> << <1, 1024, 0, t_stream >> >(Data(), d_sum, (CUDA_LONG)GetNumElements());
DeviceBoundNumber<ElemType> result;
result.ShallowCopyFrom(d_sum, GetComputeDeviceId());
return result;

@ -2555,7 +2601,8 @@ ElemType GPUMatrix<ElemType>::FrobeniusNorm() const

ElemType h_sum = 0;
// WARNING: THIS kernel is not the most efficient way!
_reductionSum2<ElemType><<<1, 1024, 0, t_stream>>>(Data(), d_sum, (CUDA_LONG) GetNumElements(), true);
// note: kernel has hard-coded dimension of 1024
_reductionSum21024Threads<ElemType> << <1, 1024, 0, t_stream >> >(Data(), d_sum, (CUDA_LONG)GetNumElements(), true);
CUDA_CALL(cudaMemcpy(&h_sum, d_sum, sizeof(ElemType), cudaMemcpyDeviceToHost));
TracingGPUMemoryAllocator::Free<ElemType>(GetComputeDeviceId(), d_sum);

@ -2572,7 +2619,8 @@ GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignFrobeniusNormOf(const GPUMatrix<

PrepareDevice();
// WARNING: THIS kernel is not the most efficient way!
_reductionSum2<ElemType><<<1, 1024, 0, t_stream>>>(a.Data(), Data(), (CUDA_LONG) a.GetNumElements(), true);
// note: kernel has hard-coded dimension of 1024
_reductionSum21024Threads<ElemType> << <1, 1024, 0, t_stream >> >(a.Data(), Data(), (CUDA_LONG)a.GetNumElements(), true);

return *this;
}

@ -2581,13 +2629,14 @@ template <class ElemType>
ElemType GPUMatrix<ElemType>::MatrixNormInf() const
{
if (IsEmpty())
LogicError("MatrixNorm1: Matrix is empty.");
LogicError("MatrixNormInf: Matrix is empty.");

ElemType* d_maxAbs = TracingGPUMemoryAllocator::Allocate<ElemType>(GetComputeDeviceId(), 1);

ElemType h_maxAbs = 0;
// WARNING: THIS kernel is not the most efficient way!
_reductionMatrixNormInf<ElemType><<<1, 1024, 0, t_stream>>>(Data(), d_maxAbs, (CUDA_LONG) GetNumElements());
// note: kernel has hard-coded dimension of 1024
_reductionMatrixNormInf1024Threads<ElemType> << <1, 1024, 0, t_stream >> >(Data(), d_maxAbs, (CUDA_LONG)GetNumElements());
CUDA_CALL(cudaMemcpy(&h_maxAbs, d_maxAbs, sizeof(ElemType), cudaMemcpyDeviceToHost));
TracingGPUMemoryAllocator::Free<ElemType>(GetComputeDeviceId(), d_maxAbs);
return h_maxAbs;

@ -2610,7 +2659,8 @@ ElemType GPUMatrix<ElemType>::MatrixNorm0() const
ElemType* d_nz = TracingGPUMemoryAllocator::Allocate<ElemType>(GetComputeDeviceId(), 1);
ElemType h_nz = 0;
// WARNING: THIS kernel is not the most efficient way!
_reductionMatrixNorm0<ElemType><<<1, 1024, 0, t_stream>>>(Data(), d_nz, (CUDA_LONG) GetNumElements());
// note: kernel has hard-coded dimension of 1024
_reductionMatrixNorm01024Threads<ElemType> << <1, 1024, 0, t_stream >> >(Data(), d_nz, (CUDA_LONG)GetNumElements());
CUDA_CALL(cudaMemcpy(&h_nz, d_nz, sizeof(ElemType), cudaMemcpyDeviceToHost));
TracingGPUMemoryAllocator::Free<ElemType>(GetComputeDeviceId(), d_nz);
return h_nz;

@ -2667,7 +2717,8 @@ void GPUMatrix<ElemType>::VectorMax(GPUMatrix<ElemType>& maxIndexes, GPUMatrix<E
maxIndexes.RequireSize(1, n);

int blocksPerGrid = n; // we'll have 1 block processing 1 column
_vectorMaxMinReduce<ElemType, true><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(us.Data(), maxIndexes.Data(), maxValues.Data(), m, n);
// note: kernel has hard-coded dimension of 512
_vectorMaxMinReduce512Threads<ElemType, true><<<blocksPerGrid, 512, 0, t_stream>>>(us.Data(), maxIndexes.Data(), maxValues.Data(), m, n);

/*int blocksPerGrid=(int)ceil(1.0*n/GridDim::maxThreadsPerBlock);
_vectorMax<ElemType><<<blocksPerGrid,GridDim::maxThreadsPerBlock,0,t_stream>>>(us.Data(),maxIndexes.Data(),maxValues.Data(),m,n,isColWise);*/

@ -2793,7 +2844,8 @@ void GPUMatrix<ElemType>::VectorMin(GPUMatrix<ElemType>& minIndexes, GPUMatrix<E
minIndexes.RequireSize(1, n);

int blocksPerGrid = n; // we'll have 1 block processing 1 column
_vectorMaxMinReduce<ElemType, false><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(us.Data(), minIndexes.Data(), minValues.Data(), m, n);
// note: kernel has hard-coded dimension of 512
_vectorMaxMinReduce512Threads<ElemType, false> << <blocksPerGrid, 512, 0, t_stream >> >(us.Data(), minIndexes.Data(), minValues.Data(), m, n);

/*
int blocksPerGrid=(int)ceil(1.0*n/GridDim::maxThreadsPerBlock);

@ -2823,8 +2875,9 @@ GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignNumOfDiff(const GPUMatrix<ElemTy
if (!searchInCol)
{
// int blocksPerGrid=(int)ceil(1.0*a.GetNumElements()/GridDim::maxThreadsPerBlock);
// _assignNumOfDiff<ElemType><<<blocksPerGrid,GridDim::maxThreadsPerBlock,0,t_stream>>>(a.Data(), b.Data(), Data(), a.GetNumElements());
_assignNumOfDiff<ElemType><<<1, 1024, 0, t_stream>>>(a.Data(), b.Data(), Data(), (CUDA_LONG) a.GetNumElements());
// _assignNumOfDiff1024Threads<ElemType><<<blocksPerGrid,GridDim::maxThreadsPerBlock,0,t_stream>>>(a.Data(), b.Data(), Data(), a.GetNumElements());
// note: kernel has hard-coded dimension of 1024
_assignNumOfDiff1024Threads<ElemType> << <1, 1024, 0, t_stream >> >(a.Data(), b.Data(), Data(), (CUDA_LONG)a.GetNumElements());
}
else
{

@ -3107,6 +3160,7 @@ void GPUMatrix<ElemType>::AveragePoolingBackward(const GPUMatrix<int>& mpRowCol,
Data(), (int)GetNumRows(), grad.Data(), (int)grad.GetNumRows());
}

// returns saveMean/saveInvStdDev which are the actual values used to perform the normalization, except for blendFactor 1, in which case they are unused and set to empty
template <class ElemType>
void GPUMatrix<ElemType>::BatchNormalizationForward(const GPUMatrix<ElemType>& scale, const GPUMatrix<ElemType>& bias, double expAvgFactor, double blendFactor,
GPUMatrix<ElemType>& runMean, GPUMatrix<ElemType>& runInvStdDev, GPUMatrix<ElemType>& out, double epsilon,

@ -3122,10 +3176,13 @@ void GPUMatrix<ElemType>::BatchNormalizationForward(const GPUMatrix<ElemType>& s
assert(0 < vectorSize && vectorSize <= std::numeric_limits<int>::max());
assert(0 < batchSize && batchSize <= std::numeric_limits<int>::max());

// --- compute data mean/stddev (into saveMean/saveInvStdDev) and update running mean/stddev
SyncGuard syncGuard;
// If expAvgFactor == 0 && blendFactor == 1 then we don't need to compute current minibatch statistics.
if (expAvgFactor > 0 || blendFactor < 1)
{
saveMean.RequireSize(runMean);
saveInvStdDev.RequireSize(runMean);
if (spatial)
{
Call<ComputeSpatialBatchMeanAndInvStdDev, ElemType>(spatialSize, vectorSize, spatialSize, batchSize, Data(),

@ -3139,35 +3196,50 @@ void GPUMatrix<ElemType>::BatchNormalizationForward(const GPUMatrix<ElemType>& s
saveMean.Data(), saveInvStdDev.Data(), GetStream());
}
}
else // not computing new statistics
{
saveMean.RequireSize(0, 0);
saveInvStdDev.RequireSize(0, 0);
}

// --- apply MAP estimates of mean/stddev (interpolation of data and running mean/stddev) to data
// When:
// blendFactor == 1 - use running mean/var instead of the current minibatch mean/var.
// blendFactor == 1 - use running mean/var instead of the current minibatch mean/var. Note: saveMean/saveInvStdDev are NOT produced.
// 0 < blendFactor < 1 - blend running mean/var with mean/var of the current minibatch: saveMean = (1 - blendFactor) * saveMean + blendFactor * runMean
// blendFactor == 0 - use mean/var of the current minibatch.
if (blendFactor < 1)
{
// non-zero blendFactor: interpolate minibatch mean/stddev in-place with running mean/stddev
if (blendFactor > 0)
{
// REVIEW alexeyk: can be rolled into NormalizeBatchTraining to save bandwidth.
// TODO: add a 'beta' parameter to ScaleAndAdd()
Scale((ElemType)(1 - blendFactor), saveMean);
ScaleAndAdd((ElemType)blendFactor, runMean, saveMean);
ScaleAndAdd((ElemType)blendFactor, /*in*/ runMean, /*in/out*/ saveMean);
Scale((ElemType)(1 - blendFactor), saveInvStdDev);
ScaleAndAdd((ElemType)blendFactor, runInvStdDev, saveInvStdDev);
}
Call<NormalizeBatchTraining, ElemType>(spatial ? spatialSize : vectorSize, vectorSize, spatialSize, batchSize,
spatial, Data(), out.Data(), scale.Data(), bias.Data(),
saveMean.Data(), saveInvStdDev.Data(), GetStream());
// normalize
Call<NormalizeBatchTraining, ElemType>(spatial ? spatialSize : vectorSize, vectorSize, spatialSize, batchSize, spatial,
Data(), out.Data(), // (in, out) data to be normalized -> normalized data
scale.Data(), bias.Data(), // (in) scale/bias to denormalize with
/*(in)*/saveMean.Data(), saveInvStdDev.Data(), // (in) actual mean/stddev to normalize with
GetStream());
}
else
else // blendFactor == 1: use running mean/stddev only
{
Call<NormalizeBatchTraining, ElemType>(spatial ? spatialSize : vectorSize, vectorSize, spatialSize, batchSize,
spatial, Data(), out.Data(), scale.Data(), bias.Data(),
Call<NormalizeBatchTraining, ElemType>(spatial ? spatialSize : vectorSize, vectorSize, spatialSize, batchSize, spatial,
Data(), out.Data(),
scale.Data(), bias.Data(),
runMean.Data(), runInvStdDev.Data(), GetStream());
// CNTK engine returns saveMean and saveInvStdDev empty, but the cuDNN engine does not.
}
}
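Aside: the Scale/ScaleAndAdd pair above forms saveMean = (1 - blendFactor) * batchMean + blendFactor * runMean (and likewise for saveInvStdDev) before normalizing. A small standalone numeric check:

#include <cstdio>
int main()
{
    double batchMean = 2.0, runMean = 4.0, blendFactor = 0.25;
    // Scale((1 - blendFactor), saveMean); ScaleAndAdd(blendFactor, runMean, saveMean);
    double saveMean = (1 - blendFactor) * batchMean + blendFactor * runMean;
    std::printf("saveMean = %g\n", saveMean); // 0.75 * 2 + 0.25 * 4 = 2.5
    return 0;
}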
// saveMean/saveInvStdDev are the interpolated mean/stddev as used in ForwardProp().
// For blendFactor=1, they are not used and can be uninitialized or empty.
template <class ElemType>
void GPUMatrix<ElemType>::BatchNormalizationBackward(const GPUMatrix<ElemType>& in, GPUMatrix<ElemType>& grad, const GPUMatrix<ElemType>& scale,
void GPUMatrix<ElemType>::BatchNormalizationBackward(const GPUMatrix<ElemType>& in, GPUMatrix<ElemType>& grad, const GPUMatrix<ElemType>& scale, double blendFactor,
const GPUMatrix<ElemType>& saveMean, const GPUMatrix<ElemType>& saveInvStdDev,
GPUMatrix<ElemType>& scaleGrad, GPUMatrix<ElemType>& biasGrad) const
{

@ -3192,8 +3264,9 @@ void GPUMatrix<ElemType>::BatchNormalizationBackward(const GPUMatrix<ElemType>&
Call<ComputeScaleAndBiasGradients, ElemType>(vectorSize, vectorSize, batchSize, in.Data(), Data(), scaleGrad.Data(), biasGrad.Data(),
saveMean.Data(), saveInvStdDev.Data(), GetStream());
}
ElemType mbStatsWeight = (ElemType)(1 - blendFactor); // weight for contribution from actual MB stats (0 if none, e.g. locked BN node)
Call<BackpropagateBatchNormGradients, ElemType>(spatial ? spatialSize : vectorSize, vectorSize, spatialSize, batchSize, spatial,
in.Data(), Data(), grad.Data(), scale.Data(), scaleGrad.Data(), biasGrad.Data(), saveMean.Data(), saveInvStdDev.Data(), GetStream());
in.Data(), Data(), grad.Data(), scale.Data(), mbStatsWeight, scaleGrad.Data(), biasGrad.Data(), saveMean.Data(), saveInvStdDev.Data(), GetStream());
}

#pragma region Static BLAS Functions

@ -3990,7 +4063,8 @@ ElemType GPUMatrix<ElemType>::GetLearnRateForBlock_Helper(const GPUMatrix<ElemTy
}
// d_res[0] should now contain inner product of matrices
// Compute squared Frobenius norms (squared sums of elements)
_lrHelper<ElemType><<<1, 512, 0, t_stream>>>(Gradients.Data(), SmoothedGradients.Data(), (CUDA_LONG) Gradients.GetNumElements(), d_res);
// note: kernel has hard-coded dimension of 512
_lrHelper512Threads<ElemType> << <1, 512, 0, t_stream >> >(Gradients.Data(), SmoothedGradients.Data(), (CUDA_LONG)Gradients.GetNumElements(), d_res);
ElemType res;
CUDA_CALL(cudaMemcpy(&res, d_res, sizeof(ElemType), cudaMemcpyDeviceToHost));
TracingGPUMemoryAllocator::Free<ElemType>(Gradients.GetComputeDeviceId(), d_res);

@ -4214,16 +4288,21 @@ void GPUMatrix<ElemType>::RCRFBackwardCompute(
ElemType* d_zeta = TracingGPUMemoryAllocator::Allocate<ElemType>(alpha.GetComputeDeviceId(), iNumLab);

CUDA_LONG N = iNumLab;
int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock);
// TODO: change all three '512' to 'GridDim::maxThreadsPerBlock' (not doing this now since I cannot test it)
int blocksPerGrid = (int) ceil(1.0 * N / 512);
size_t szMemSize;
for (int t = iNumPos - 1; t >= 0; t--)
{
szMemSize = sizeof(ElemType) * iNumLab;
_rcrfBackwardComputeZeta<ElemType><<<blocksPerGrid, GridDim::maxThreadsPerBlock, szMemSize>>>(t, iNumPos, alpha.Data(), d_zeta, pair_scores.Data(), iNumLab, shift);
// This function assumes iNumLab <= 1024 and that shared memory == total (!) number of threads == iNumLab.
assert(iNumLab <= 1024);
_rcrfBackwardComputeZetaMax1024Labels<ElemType> << <blocksPerGrid, 512, szMemSize >> >(t, iNumPos, alpha.Data(), d_zeta, pair_scores.Data(), iNumLab, shift);
szMemSize = iNumLab * 3;
szMemSize *= sizeof(ElemType);
_rcrfBackwardCompute<ElemType><<<blocksPerGrid, GridDim::maxThreadsPerBlock, szMemSize>>>(t, iNumPos, alpha.Data(), beta.Data(),
d_zeta, pair_scores.Data(), iNumLab, shift);
// This function assumes iNumLab <= 1024 and that shared memory == total (!) number of threads == 3 * iNumLab.
assert(iNumLab <= 1024);
_rcrfBackwardComputeMax1024Labels<ElemType> << <blocksPerGrid, 512, szMemSize >> >(t, iNumPos, alpha.Data(), beta.Data(),
d_zeta, pair_scores.Data(), iNumLab, shift);
}
/*
error = cudaGetErrorString(cudaPeekAtLastError());

@ -4255,16 +4334,22 @@ void GPUMatrix<ElemType>::RCRFTransGrdCompute(const GPUMatrix<ElemType>& lbls,
ElemType* d_zeta = TracingGPUMemoryAllocator::Allocate<ElemType>(alpha.GetComputeDeviceId(), iNumLab);

CUDA_LONG N = iNumLab;
int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock);
// TODO: change all three '512' to 'GridDim::maxThreadsPerBlock' (not doing this now since I cannot test it)
int blocksPerGrid = (int)ceil(1.0 * N / 512);
size_t szMemSize;
for (int t = 0; t < iNumPos; t++)
{
szMemSize = sizeof(ElemType) * iNumLab;
_rcrfTransGrdComputeZeta<ElemType><<<blocksPerGrid, GridDim::maxThreadsPerBlock, szMemSize>>>(t - 1, iNumPos, alpha.Data(), d_zeta, pair_scores.Data(), iNumLab, startLbl, shift);
// This function assumes iNumLab <= 1024 and that shared memory == total (!) number of threads == iNumLab.
assert(iNumLab <= 1024);
// BUGBUG: This is launched with 512 threads per block, but allocates shared mem as if there is only one block. Likewise for all 4 of these functions.
_rcrfTransGrdComputeZetaMax1024Labels<ElemType> << <blocksPerGrid, 512, szMemSize >> >(t - 1, iNumPos, alpha.Data(), d_zeta, pair_scores.Data(), iNumLab, startLbl, shift);
szMemSize = iNumLab * 3;
szMemSize *= sizeof(ElemType);
_rcrfTransGrdCompute<ElemType><<<blocksPerGrid, GridDim::maxThreadsPerBlock, szMemSize>>>(t, startLbl, alpha.Data(), beta.Data(),
d_zeta, pair_scores.Data(), lbls.Data(), grd.Data(), iNumPos, iNumLab, shift);
// This function assumes iNumLab <= 1024 and that shared memory == total (!) number of threads == iNumLab.
assert(iNumLab <= 1024);
_rcrfTransGrdComputeMax1024Labels<ElemType> << <blocksPerGrid, 512, szMemSize >> >(t, startLbl, alpha.Data(), beta.Data(),
d_zeta, pair_scores.Data(), lbls.Data(), grd.Data(), iNumPos, iNumLab, shift);
}
TracingGPUMemoryAllocator::Free<ElemType>(alpha.GetComputeDeviceId(), d_zeta);
};
@ -4278,11 +4363,16 @@ void GPUMatrix<ElemType>::RCRFTransGrdCompute(const GPUMatrix<ElemType>& lbls,
template <class ElemType>
static shared_ptr<GPUMatrix<ElemType>> GetOnesVector(size_t N, DEVICEID_TYPE deviceId)
{
// using an array of shared_ptrs because those are thread-safe. The objects themselves are immutable.
// And using a plain array so this will never get freed, avoiding free-after-DLL-unload issues.
static shared_ptr<GPUMatrix<ElemType>> onesCache[32]; // cache of objects
if (deviceId >= _countof(onesCache))
LogicError("GetOnesVector: onesCache[] too small (%d entries), increase (you need %d) and recompile.", (int) _countof(onesCache), (int) deviceId + 1);
// using a dynamically allocated array so this will never get freed, avoiding free-after-DLL-unload issues.
// and using shared_ptrs since we don't want to leak more than CacheSize elements
// when using a plain array we would have to control lifetime of the object and destructor would be called for every element in the array at the end
const int CacheSize = 32;
static shared_ptr<GPUMatrix<ElemType>> * onesCache = new shared_ptr<GPUMatrix<ElemType>>[CacheSize]; // cache of objects

if (deviceId >= CacheSize){
LogicError("GetOnesVector: onesCache[] too small (%d entries), increase (you need %d) and recompile.", CacheSize, (int)deviceId + 1);
}

auto p = onesCache[deviceId];
if (!p || p->GetNumRows() < N) // must (re-)allocate
{

@ -4300,8 +4390,11 @@ void GPUMatrix<ElemType>::TensorOp(ElemType beta, const GPUMatrix<ElemType>& a,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides)
{
if (reductionOp != ElementWiseOperator::opSum) // TODO: enable the reduction ops
InvalidArgument("TensorOp: Unary reduction operations other than opSum not yet implemented.");
if (reductionOp != ElementWiseOperator::opSum &&
reductionOp != ElementWiseOperator::opLogSum &&
reductionOp != ElementWiseOperator::opMin &&
reductionOp != ElementWiseOperator::opMax)
InvalidArgument("TensorOp: Unary reduction operations other than opMax, opMin, opSum, and opLogSum are not implemented.");

a.PrepareDevice();
if (a.GetComputeDeviceId() != GetComputeDeviceId())

@ -4322,10 +4415,11 @@ void GPUMatrix<ElemType>::TensorOp(ElemType beta, const GPUMatrix<ElemType>& a,
return LaunchUnaryTensorOp<ElemType>(beta, a.Data()+ offsets[0], Data()+ offsets[1], alpha, op, regularOpDims[0]);
}

// special case: reducing a matrix onto a column vector; can be done with SGEMM
// special case: sum-reducing a matrix onto a column vector; can be done with SGEMM
// Note: A minor risk is that with this, our own reduction function will rarely be used.
// That function was tested to give the same results with 'double', and nearly the same with 'float' (different summation order matters).
else if (op == ElementWiseOperator::opCopy && // we are just adding to target without any further operation
reductionOp == ElementWiseOperator::opSum &&
#ifdef _DEBUG
sizeof(ElemType) == sizeof(float) && // in debug don't shortcut 'double' so we have some test of our own codepath
#endif

@ -4348,7 +4442,7 @@ void GPUMatrix<ElemType>::TensorOp(ElemType beta, const GPUMatrix<ElemType>& a,

// regular case
else
return TensorOpN<ElemType, 2>(beta, array<ElemType*, 2>{a.Data(), Data()}, alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
return TensorOpN<ElemType, 2>(beta, array<ElemType*, 2>{a.Data(), Data()}, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
}

// perform binary operation 'op' on a and b giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides

@ -4365,7 +4459,7 @@ void GPUMatrix<ElemType>::TensorOp(ElemType beta, const GPUMatrix<ElemType>& a,
if (a.GetComputeDeviceId() != GetComputeDeviceId() || b.GetComputeDeviceId() != GetComputeDeviceId())
InvalidArgument("All matrices must be on the same GPU");

return TensorOpN<ElemType, 3>(beta, array<ElemType*, 3>{a.Data(), b.Data(), Data()}, alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
return TensorOpN<ElemType, 3>(beta, array<ElemType*, 3>{a.Data(), b.Data(), Data()}, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
}

// perform ternary operation 'op' on a, b, and c giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides
@ -4381,7 +4475,7 @@ void GPUMatrix<ElemType>::TensorOp(ElemType beta, const GPUMatrix<ElemType>& a,
a.PrepareDevice();
if (a.GetComputeDeviceId() != GetComputeDeviceId() || b.GetComputeDeviceId() != GetComputeDeviceId() || c.GetComputeDeviceId() != GetComputeDeviceId())
InvalidArgument("All matrices must be on the same GPU");
return TensorOpN<ElemType, 4>(beta, array<ElemType*, 4>{a.Data(), b.Data(), c.Data(), Data()}, alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
return TensorOpN<ElemType, 4>(beta, array<ElemType*, 4>{a.Data(), b.Data(), c.Data(), Data()}, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
}

// =======================================================================

@ -4420,24 +4514,50 @@ template void GPUMatrix<char>::SetValue(const size_t numRows, const size_t numCo
template void GPUMatrix<char>::SetValue(GPUMatrix<char> const&);
//template void GPUMatrix<char>::SetValue(CPUSparseMatrix<char> const&);
//template void GPUMatrix<char>::SetValue(GPUSparseMatrix<char> const&);

template void GPUMatrix<char>::CopySection(size_t numRows, size_t numCols, char* dst, size_t colStride) const;
template void GPUMatrix<char>::Reshape(const size_t, const size_t);
template GPUMatrix<char>& GPUMatrix<char>::operator*=(char);
template DEVICEID_TYPE GPUMatrix<char>::PrepareDevice(DEVICEID_TYPE deviceId) const;

// Support <short>
template GPUMatrix<short>::GPUMatrix(const size_t numRows, const size_t numCols, int deviceId);
template GPUMatrix<short>::GPUMatrix(const size_t numRows, const size_t numCols, int deviceId, short* pArray, const size_t matrixFlags);
template GPUMatrix<short>::GPUMatrix(const GPUMatrix<short>&);
template GPUMatrix<short>::GPUMatrix(GPUMatrix<short>&&);
template short* GPUMatrix<short>::CopyToArray() const;
template void GPUMatrix<short>::ChangeDeviceTo(int);
template void GPUMatrix<short>::Resize(size_t, size_t, bool);
template void GPUMatrix<short>::RequireSize(size_t, size_t, bool);

template GPUMatrix<short>::~GPUMatrix();
template GPUMatrix<short> GPUMatrix<short>::ColumnSlice(size_t startColumn, size_t numCols) const;
template GPUMatrix<short>& GPUMatrix<short>::operator=(GPUMatrix<short>&&);
template GPUMatrix<short>::GPUMatrix(int);
template void GPUMatrix<short>::SetValue(const short);
template void GPUMatrix<short>::SetValue(const size_t numRows, const size_t numCols, int deviceId, short* pArray, size_t matrixFlags);
//template void GPUMatrix<short>::SetValue(CPUMatrix<short> const&);
template void GPUMatrix<short>::SetValue(GPUMatrix<short> const&);
//template void GPUMatrix<short>::SetValue(CPUSparseMatrix<short> const&);
//template void GPUMatrix<short>::SetValue(GPUSparseMatrix<short> const&);
template void GPUMatrix<short>::CopySection(size_t numRows, size_t numCols, short* dst, size_t colStride) const;
template void GPUMatrix<short>::Reshape(const size_t, const size_t);
template GPUMatrix<short>& GPUMatrix<short>::operator*=(short);
template DEVICEID_TYPE GPUMatrix<short>::PrepareDevice(DEVICEID_TYPE deviceId) const;

template GPUMatrix<int>::GPUMatrix(const size_t, const size_t, int, int*, const size_t);
template GPUMatrix<int>::~GPUMatrix();

template int* TracingGPUMemoryAllocator::Allocate<int>(int, size_t);
template size_t* TracingGPUMemoryAllocator::Allocate<size_t>(int, size_t);
template long* TracingGPUMemoryAllocator::Allocate<long>(int, size_t);
template short* TracingGPUMemoryAllocator::Allocate<short>(int, size_t);
template char* TracingGPUMemoryAllocator::Allocate<char>(int, size_t);
template float* TracingGPUMemoryAllocator::Allocate<float>(int, size_t);
template double* TracingGPUMemoryAllocator::Allocate<double>(int, size_t);

template void TracingGPUMemoryAllocator::Free<int>(int, int*, bool);
template void TracingGPUMemoryAllocator::Free<size_t>(int, size_t*, bool);
template void TracingGPUMemoryAllocator::Free<short>(int, short*, bool);
template void TracingGPUMemoryAllocator::Free<char>(int, char*, bool);
template void TracingGPUMemoryAllocator::Free<float>(int, float*, bool);
template void TracingGPUMemoryAllocator::Free<double>(int, double*, bool);
@ -61,6 +61,27 @@ cudaStream_t MATH_API GetStream();
|
|||
|
||||
namespace Microsoft { namespace MSR { namespace CNTK {
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// SyncGuard -- synchronize around CUDA calls
|
||||
// -----------------------------------------------------------------------
|
||||
|
||||
class SyncGuard
|
||||
{
|
||||
private:
|
||||
static bool s_isSyncEnabled;
|
||||
|
||||
bool m_forceSync;
|
||||
#ifndef CPUONLY
|
||||
cudaEvent_t m_done;
|
||||
#endif
|
||||
|
||||
public:
|
||||
static MATH_API void EnableSync();
|
||||
|
||||
SyncGuard(bool forceSync = false);
|
||||
~SyncGuard();
|
||||
};
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// DeviceBoundNumber -- This class represents a number which resides on a particular device. Use it to avoid unnecessary transfers between CPU and GPU
|
||||
// -----------------------------------------------------------------------
|
||||
|
@@ -207,18 +228,14 @@ public:
    // multiple views, RequireSize will first check to see if Resize is required. If it is not, then it short-circuits and is a no-op. Otherwise, RequireSize
    // will call Resize, which may fail if the matrix has multiple views.
    void RequireSize(const size_t numRows, const size_t numCols, bool growOnly = true); // by default we only reallocate if need to grow
    void RequireSize(const GPUMatrix<ElemType>& like, bool growOnly = true) { RequireSize(like.GetNumRows(), like.GetNumCols(), growOnly); }

    // Resize first checks to ensure that the caller has the authority to call Resize (i.e., it checks to ensure the underlying data is owned by only this matrix), and then
    // actually resizes the underlying matrix, doing any allocation as required.
    void Resize(const size_t numRows, const size_t numCols, bool growOnly = true); // by default we only reallocate if need to grow

-   ElemType& operator()(const size_t /*row*/, const size_t /*col*/)
-   {
-       LogicError("GPUMatrix doesn't support this");
-   }
-   const ElemType& operator()(const size_t /*row*/, const size_t /*col*/) const
-   {
-       LogicError("GPUMatrix doesn't support this");
-   }
+   ElemType& operator()(const size_t /*row*/, const size_t /*col*/) { LogicError("GPUMatrix doesn't support operator(,) on the CPU."); }
+   const ElemType& operator()(const size_t /*row*/, const size_t /*col*/) const { LogicError("GPUMatrix doesn't support operator(,) on the CPU."); }
    ElemType Get00Element() const;

    void SetValue(const ElemType v);
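Aside: a minimal illustration (not part of this commit) of the RequireSize/Resize contract described in the comments above; the matrix and the sizes are hypothetical:

// Hypothetical usage of the RequireSize/Resize pair described above.
GPUMatrix<float> m(/*deviceId=*/0);
m.RequireSize(1024, 64); // first call: falls through to Resize and allocates
m.RequireSize(512, 64);  // no-op: with growOnly=true the existing buffer already suffices
m.Resize(2048, 64);      // explicit Resize: reallocates, and fails if other views share the buffer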
@@ -453,7 +470,8 @@ public:
    void BatchNormalizationForward(const GPUMatrix<ElemType>& scale, const GPUMatrix<ElemType>& bias, double expAvgFactor, double blendFactor,
                                   GPUMatrix<ElemType>& runMean, GPUMatrix<ElemType>& runInvStdDev, GPUMatrix<ElemType>& out, double epsilon,
                                   GPUMatrix<ElemType>& saveMean, GPUMatrix<ElemType>& saveInvStdDev) const;
-   void BatchNormalizationBackward(const GPUMatrix<ElemType>& in, GPUMatrix<ElemType>& grad, const GPUMatrix<ElemType>& scale, const GPUMatrix<ElemType>& saveMean, const GPUMatrix<ElemType>& saveInvStdDev,
+   void BatchNormalizationBackward(const GPUMatrix<ElemType>& in, GPUMatrix<ElemType>& grad, const GPUMatrix<ElemType>& scale, double blendFactor,
+                                   const GPUMatrix<ElemType>& saveMean, const GPUMatrix<ElemType>& saveInvStdDev,
                                    GPUMatrix<ElemType>& scaleGrad, GPUMatrix<ElemType>& biasGrad) const;

public:
@@ -623,51 +641,4 @@ static void CudaCall(ERRTYPE retCode, const char* exprString, const char* libNam
#define CURAND_CALL(expr) (CudaCall((expr), #expr, "CURAND", CURAND_STATUS_SUCCESS))
#define CUDNN_CALL(expr) (CudaCall((expr), #expr, "cuDNN", CUDNN_STATUS_SUCCESS))

-// -----------------------------------------------------------------------
-// SyncGuard -- synchronize around CUDA calls
-// -----------------------------------------------------------------------
-
-class SyncGuard
-{
-    static bool DoSync()
-    {
-#ifdef NO_SYNC // this strange way of writing it allows modifying this variable at runtime in the debugger
-        static bool do_sync = false;
-#else
-        static bool do_sync = true;
-#endif
-        return do_sync;
-    }
-    cudaEvent_t m_done;
-
-public:
-    SyncGuard()
-    {
-        m_done = nullptr;
-        if (DoSync())
-        {
-            CUDA_CALL(cudaGetLastError());
-            CUDA_CALL(cudaEventCreate(&m_done));
-        }
-    }
-    ~SyncGuard()
-    {
-        if (DoSync())
-        {
-            // The regular use of this destructor is to synchronize the GPU, but also
-            // to check for errors. So this destructor is where CUDA errors would be thrown.
-            // If this destructor runs during stack unwinding, then a different error has
-            // already happened that should be reported; so we only clean up the resource.
-            if (std::uncaught_exception())
-                cudaEventDestroy(m_done);
-            else
-            {
-                // failures in a prior launch might be reported here
-                CUDA_CALL(cudaEventRecord(m_done));
-                CUDA_CALL(cudaEventSynchronize(m_done));
-                CUDA_CALL(cudaEventDestroy(m_done));
-            }
-        }
-    }
-};
-
#endif // CPUONLY
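For orientation, a sketch (not from this commit) of how SyncGuard is used: construct it before launching work; the destructor records and waits on a CUDA event, so launch errors surface at scope exit. The kernel and its arguments are placeholders:

// Hypothetical call site wrapped in the RAII guard declared above.
{
    SyncGuard syncGuard;                                   // no-op unless syncing is enabled (or forceSync = true)
    someKernel<<<blocks, threadsPerBlock, 0, t_stream>>>(/*args*/);
}   // ~SyncGuard(): cudaEventRecord + cudaEventSynchronize; CUDA errors from the launch are thrown here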
@@ -95,8 +95,8 @@ static INT CeilDiv(INT a, INT2 b) // ceil(a/b)

struct GridDim
{
-   static const CUDA_LONG maxThreadsPerBlock = 512;  // use this many threads per block
-   static const CUDA_LONG maxWarpsPerBlock = 16;     // use this many warps per block. This means 512 threads for warpSize=32
+   static const CUDA_LONG maxThreadsPerBlock = 1024; // use this many threads per block
+   static const CUDA_LONG maxWarpsPerBlock = 32;     // use this many warps per block. This means 1024 threads for warpSize=32

    // use these for launching
    //   GridDim grid(NN);

@@ -127,7 +127,7 @@ struct GridDim
    }

    // put it back together
-   m_threadsPerBlock = warpsPerProc * warpSize; // = a multiple of 32 that is as close to 512 as makes sense given NN
+   m_threadsPerBlock = warpsPerProc * warpSize; // = a multiple of 32 that is as close to 1024 as makes sense given NN
    m_blocksPerGrid = CeilDiv(N, m_threadsPerBlock);
    if (m_blocksPerGrid == 1)
        m_threadsPerBlock = N; // don't launch more than necessary --TODO: Does this make a difference at all?
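The comments above hint at the intended launch pattern; a sketch (the kernel name is hypothetical) using the new 1024-thread blocks:

// Launch an elementwise kernel over NN linear elements.
CUDA_LONG NN = (CUDA_LONG) numElements;
GridDim grid(NN); // picks m_threadsPerBlock as a multiple of warpSize up to 1024, m_blocksPerGrid = CeilDiv(NN, threads)
_someElementwiseOp<<<grid.m_blocksPerGrid, grid.m_threadsPerBlock, 0, t_stream>>>(/*..., */ NN);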
@@ -847,7 +847,7 @@ __global__ void _logSoftMaxColWise(

// each block processes one column. There must be 512 threads in a block
template <class ElemType>
-__global__ void _assignColumnwiseLogSoftmaxOf(
+__global__ void _assignColumnwiseLogSoftmaxOf512Threads(
    const ElemType* a,
    ElemType* us,
    const CUDA_LONG m_numCols,

@@ -1015,7 +1015,7 @@ __global__ void _logSoftMaxRowWise(

// each block processes one column. There must be 512 threads in a block
template <class ElemType>
-__global__ void _assignColumnwiseHardmaxOf(
+__global__ void _assignColumnwiseHardmaxOf512Threads(
    const ElemType* a,
    ElemType* us,
    const CUDA_LONG m_numCols,

@@ -2198,7 +2198,7 @@ __global__ void _addSignOf(

// This function processes 1 column per block. This function needs 512 threads.
template <class ElemType, bool IsMax>
-__global__ void _vectorMaxMinReduce(
+__global__ void _vectorMaxMinReduce512Threads(
    const ElemType* us,
    ElemType* Indexes,
    ElemType* Values,

@@ -2585,7 +2585,7 @@ __global__ void _addElementToElement(
}

template <class ElemType>
-__global__ void _assignNumOfDiff(
+__global__ void _assignNumOfDiff1024Threads(
    const ElemType* a,
    const ElemType* b,
    ElemType* c,

@@ -2664,7 +2664,7 @@ __global__ void _assignNumOfDiff(
}

/*template<class ElemType>
-__global__ void _assignNumOfDiff(
+__global__ void _assignNumOfDiff1024Threads(
    ElemType *a,
    ElemType *b,
    ElemType *c,
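The renames bake the launch contract into the kernel name, so a mismatched launch is visible at the call site. A hedged sketch of such a call (the trailing arguments are elided; the parameter list continues beyond m_numCols):

// Hypothetical launch honoring the contract in the name: one block per column, exactly 512 threads each.
_assignColumnwiseLogSoftmaxOf512Threads<float><<<numCols, 512, 0, t_stream>>>(a, us, numCols /*, ... */);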
@@ -3343,8 +3343,9 @@ __global__ void _computeGradientOfInput(
}
#endif

+#if 0
template <class ElemType>
-__global__ void computeNCEForwardProp(
+__global__ void computeNCEForwardProp512Threads(
    const ElemType* val,
    const int* col,
    int numRows,

@@ -3406,9 +3407,10 @@ __global__ void computeNCEForwardProp(
        res[i] = partials[0];
    }
}
+#endif

template <class ElemType>
-__global__ void _computeNceOutput(
+__global__ void _computeNceOutputMax512Threads(
    const ElemType* col,
    int numRows,
    int sampleCount,

@@ -3477,7 +3479,7 @@ __global__ void _computeNceOutput(
}

template <class ElemType>
-__global__ void _assignSoftmaxSum(
+__global__ void _assignSoftmaxSumMax512Threads(
    const ElemType* softmax,
    int sampleCount,
    const ElemType* a,

@@ -3489,7 +3491,7 @@ __global__ void _assignSoftmaxSum(
    // col is an array that contains the indices of the word samples
    // a is a matrix in column-major format that contains the output from the hidden layer
    // b is the weight matrix for the output layer
-   // tmp is the buffer that stores NCE output calculated from _computeNceOutput
+   // tmp is the buffer that stores NCE output calculated from _computeNceOutputMax512Threads
    // c is the matrix to store objective

    __shared__ ElemType partials[512];

@@ -3529,7 +3531,7 @@ __global__ void _assignSoftmaxSum(
}

template <class ElemType>
-__global__ void _assignNoiseContrastiveEstimation(
+__global__ void _assignNoiseContrastiveEstimationMax512Threads(
    const ElemType* val,
    int numRows,
    int sampleCount,

@@ -3545,7 +3547,7 @@ __global__ void _assignNoiseContrastiveEstimation(
    // col is an array that contains the indices of the word samples
    // a is a matrix in column-major format that contains the output from the hidden layer
    // b is the weight matrix for the output layer
-   // tmp is the buffer that stores NCE output calculated from _computeNceOutput
+   // tmp is the buffer that stores NCE output calculated from _computeNceOutputMax512Threads
    // c is the matrix to store objective

    __shared__ ElemType partials[512];
@@ -3863,7 +3865,7 @@ __global__ void _normalGradForSparseBlock(
// This function should be called with 1024 threads per block and 1 block.
// THIS IS NOT THE MOST EFFICIENT IMPLEMENTATION!!!
template <class ElemType>
-__global__ void _reductionSum(
+__global__ void _reductionSum1024Threads(
    const ElemType* data,
    ElemType* sum,
    CUDA_LONG N)

@@ -3944,7 +3946,7 @@ __global__ void _reductionSum(
// This function should be called with 1024 threads per block and 1 block.
// THIS IS NOT THE MOST EFFICIENT IMPLEMENTATION!!!
template <class ElemType>
-__global__ void _reductionSumAndAssign(
+__global__ void _reductionSumAndAssign1024Threads(
    ElemType* toAssign,
    const ElemType* data,
    CUDA_LONG N, // length of data

@@ -4028,7 +4030,7 @@ __global__ void _reductionSumAndAssign(
// This function should be called with 1024 threads per block and 1 block.
// THIS IS NOT THE MOST EFFICIENT IMPLEMENTATION!!!
template <class ElemType>
-__global__ void _reductionSum2(
+__global__ void _reductionSum21024Threads(
    const ElemType* data,
    ElemType* sum,
    CUDA_LONG N,

@@ -4118,7 +4120,7 @@ __global__ void _reductionSum2(
// This function should be called with 1024 threads per block and 1 block.
// THIS IS NOT THE MOST EFFICIENT IMPLEMENTATION!!!
template <class ElemType>
-__global__ void _reductionMatrixNormInf(
+__global__ void _reductionMatrixNormInf1024Threads(
    const ElemType* data,
    ElemType* maxAbs,
    CUDA_LONG N)

@@ -4206,7 +4208,7 @@ __global__ void _reductionMatrixNormInf(
// This function should be called with 1024 threads per block and 1 block.
// THIS IS NOT THE MOST EFFICIENT IMPLEMENTATION!!!
template <class ElemType>
-__global__ void _reductionMatrixNorm0(
+__global__ void _reductionMatrixNorm01024Threads(
    const ElemType* data,
    ElemType* nz,
    CUDA_LONG N)

@@ -4306,7 +4308,7 @@ __global__ void _getSparseVectorRepresntationForCSCMatrix(
}

template <class ElemType>
-__global__ void _lrHelper(
+__global__ void _lrHelper512Threads(
    const ElemType* data1,
    const ElemType* data2,
    const CUDA_LONG N,

@@ -4408,7 +4410,7 @@ __global__ void _lrHelper(

/*
template<class ElemType>
-__global__ void _lrHelper(
+__global__ void _lrHelper512Threads(
    ElemType* d_tmp)
{
    if (sizeof(ElemType) == sizeof(float))
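These single-block reduction kernels all follow the classic shared-memory tree pattern; a condensed, self-contained sketch of the idea (not the literal kernel bodies, which also carry extra parameters):

// Sketch: a single block of 1024 threads reduces data[0..N) into sum[0].
template <class ElemType>
__global__ void reductionSumSketch(const ElemType* data, ElemType* sum, CUDA_LONG N)
{
    __shared__ ElemType partials[1024];
    ElemType acc = 0;
    for (CUDA_LONG i = threadIdx.x; i < N; i += blockDim.x) // stride over the input
        acc += data[i];
    partials[threadIdx.x] = acc;
    __syncthreads();
    for (int stride = 512; stride > 0; stride >>= 1) // tree reduction in shared memory
    {
        if (threadIdx.x < stride)
            partials[threadIdx.x] += partials[threadIdx.x + stride];
        __syncthreads();
    }
    if (threadIdx.x == 0)
        *sum = partials[0];
}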
@@ -4572,83 +4574,11 @@ __global__ void _minusOneAt(
    c[id] = c[id] - 1.0;
}

-// the kernel function for RCRF backward computation
-// assume a column slice of input and output
-template <class ElemType>
-__global__ void _rcrfBackwardCompute(
-    const size_t iNumPos,
-    const ElemType* galpha, // column slice at current time t
-    ElemType* gbeta,        // column slices with [row, 2] at current time t for [
-    const ElemType* gpair_scores,
-    const size_t iNumLab, const int shift)
-{
-    int id = blockDim.x * blockIdx.x + threadIdx.x;
-
-    extern __shared__ double sh_alpha_and_beta[]; // interesting: has to use [], instead of *
-    // need byte size = (iNumPos * iNumLab * 2 + iNumLab * iNumLab) * sizeof(ElemType)
-
-    ElemType* alpha = (ElemType*) (sh_alpha_and_beta);
-    ElemType* pair_scores = alpha + iNumPos * iNumLab;
-    ElemType* beta = alpha + iNumPos * iNumLab + iNumLab * iNumLab;
-
-    if (id < 0 || id >= iNumLab)
-        return;
-
-    // copy global memory to shared memory to save time
-    for (int t = iNumPos - 1; t >= 0; t--)
-    {
-        alpha[IDX2C(id, t, iNumLab)] = galpha[IDX2C(id, t, iNumLab)];
-    }
-
-    for (int j = 0; j < iNumLab; j++)
-        pair_scores[IDX2C(id, j, iNumLab)] = gpair_scores[IDX2C(id, j, iNumLab)];
-
-    __syncthreads();
-
-    for (int t = iNumPos - 1; t >= 0; t--)
-    {
-        ElemType fSum;
-        ElemType fTmp = LZERO;
-        if (t == iNumPos - 1)
-        {
-            fSum = LZERO;
-            for (int j = 0; j < iNumLab; j++)
-            {
-                fSum = logaddk(fSum, alpha[IDX2C(j, t, iNumLab)]);
-            }
-
-            fTmp = alpha[IDX2C(id, t, iNumLab)] - fSum;
-        }
-        else
-        {
-            for (int j = 0; j < iNumLab; j++)
-            {
-                fSum = LZERO;
-                for (int m = 0; m < iNumLab; m++)
-                {
-                    fSum = logaddk(fSum, alpha[IDX2C(m, t, iNumLab)] + pair_scores[IDX2C(j, m, iNumLab)]);
-                }
-
-                fTmp = logaddk(fTmp, beta[IDX2C(j, t + 1, iNumLab)] + alpha[IDX2C(id, t, iNumLab)] + pair_scores[IDX2C(j, id, iNumLab)] - fSum);
-            }
-        }
-
-        beta[IDX2C(id, t, iNumLab)] = fTmp;
-        __syncthreads();
-    }
-
-    // copy from shared memory to global memory to pass values
-    for (int t = iNumPos - 1; t >= 0; t--)
-    {
-        gbeta[IDX2C(id, t, iNumLab)] = beta[IDX2C(id, t, iNumLab)];
-    }
-    // __syncthreads();
-}
-
-/// the kernel function for CRFLSTMNetwork backward computation
-/// assume a column slice of input and output
+// the kernel function for CRFLSTMNetwork backward computation
+// assume a column slice of input and output
+// This function assumes iNumLab <= 1024 and that shared memory == total (!) number of threads == 3 * iNumLab.
template <class ElemType>
-__global__ void _rcrfBackwardCompute(
+__global__ void _rcrfBackwardComputeMax1024Labels(
    const size_t t, // time position
    const size_t iNumPos,
    const ElemType* galpha, // column slice at current time t

@@ -4659,13 +4589,13 @@ __global__ void _rcrfBackwardCompute(
{
    int id = blockDim.x * blockIdx.x + threadIdx.x;

-   extern __shared__ double sh_alpha_and_beta[]; // interesting: has to use [], instead of *
-   // need byte size = (iNumPos * iNumLab * 2 + iNumLab * iNumLab) * sizeof(ElemType)
+   extern __shared__ double sh_alpha_and_beta[]; // [id] or [id + iNumLab] or [id + 2 * iNumLab]
+   // need byte size = (iNumPos * iNumLab * 2 + iNumLab * iNumLab) * sizeof(ElemType)

    ElemType* alpha = (ElemType*) (sh_alpha_and_beta);
    ElemType* beta_t1 = (ElemType*) (alpha + iNumLab);
    ElemType* zeta = (ElemType*) (beta_t1 + iNumLab);
-   ElemType pair_scores[1024];
+   ElemType pair_scores[1024]; // [j=0..iNumLab-1]

    if (id < 0 || id >= iNumLab)
        return;

@@ -4697,9 +4627,10 @@ __global__ void _rcrfBackwardCompute(
    gbeta[IDX2C(id, t, iNumLab)] = fTmp;
}

-/// $\zeta_t(j) = {\sum_k exp(\delta_{t-1}(k) + a_{kj}(t))}$.
+// $\zeta_t(j) = {\sum_k exp(\delta_{t-1}(k) + a_{kj}(t))}$.
+// This function assumes iNumLab <= 1024 and that shared memory == total (!) number of threads == iNumLab.
template <class ElemType>
-__global__ void _rcrfBackwardComputeZeta(
+__global__ void _rcrfBackwardComputeZetaMax1024Labels(
    const size_t t, // time position
    const size_t iNumPos,
    const ElemType* galpha, // column slice at current time t

@@ -4709,11 +4640,11 @@ __global__ void _rcrfBackwardComputeZeta(
{
    int id = blockDim.x * blockIdx.x + threadIdx.x;

-   extern __shared__ double sh_alpha_and_beta[]; // interesting: has to use [], instead of *
-   // need byte size = (iNumPos * iNumLab * 2 + iNumLab * iNumLab) * sizeof(ElemType)
+   extern __shared__ double sh_alpha_and_beta[]; // [id]
+   // need byte size = (iNumPos * iNumLab * 2 + iNumLab * iNumLab) * sizeof(ElemType)

    ElemType* alpha = (ElemType*) (sh_alpha_and_beta);
-   ElemType pair_scores[1024];
+   ElemType pair_scores[1024]; // [j=0..iNumLab-1]

    if (id < 0 || id >= iNumLab)
        return;

@@ -4739,8 +4670,9 @@ __global__ void _rcrfBackwardComputeZeta(
}

/// $\zeta_t(j) = {\sum_k exp(\delta_{t-1}(k) + a_{kj}(t))}$.
+// This function assumes iNumLab <= 1024 and that shared memory == total (!) number of threads == iNumLab.
template <class ElemType>
-__global__ void _rcrfTransGrdComputeZeta(
+__global__ void _rcrfTransGrdComputeZetaMax1024Labels(
    const int t, // time position
    const size_t iNumPos,
    const ElemType* galpha, // column slice at current time t

@@ -4752,11 +4684,11 @@ __global__ void _rcrfTransGrdComputeZeta(
{
    int id = blockDim.x * blockIdx.x + threadIdx.x;

-   extern __shared__ double sh_alpha_and_beta[]; // interesting: has to use [], instead of *
-   // need byte size = (iNumPos * iNumLab * 2 + iNumLab * iNumLab) * sizeof(ElemType)
+   extern __shared__ double sh_alpha_and_beta[]; // [id]
+   // need byte size = (iNumPos * iNumLab * 2 + iNumLab * iNumLab) * sizeof(ElemType)

    ElemType* alpha = (ElemType*) (sh_alpha_and_beta);
-   ElemType pair_scores[1024];
+   ElemType pair_scores[1024]; // [j=0..iNumLab-1]

    if (id < 0 || id >= iNumLab)
        return;

@@ -4790,8 +4722,9 @@ __global__ void _rcrfTransGrdComputeZeta(
    gzeta[id] = fSum;
}

+// This function assumes iNumLab <= 1024 and that shared memory == total (!) number of threads == iNumLab.
template <class ElemType>
-__global__ void _rcrfTransGrdCompute(
+__global__ void _rcrfTransGrdComputeMax1024Labels(
    int t,
    const size_t start_lbl,
    const ElemType* galpha,

@@ -4806,13 +4739,13 @@ __global__ void _rcrfTransGrdCompute(
{
    int id = blockDim.x * blockIdx.x + threadIdx.x;

-   extern __shared__ double sh_alpha_and_beta[]; // interesting: has to use [], instead of *
-   // need byte size = (iNumPos * iNumLab * 2 + iNumLab * iNumLab) * sizeof(ElemType)
+   extern __shared__ double sh_alpha_and_beta[]; // [id]
+   // need byte size = (iNumPos * iNumLab * 2 + iNumLab * iNumLab) * sizeof(ElemType)

    ElemType* alpha = (ElemType*) (sh_alpha_and_beta);
    ElemType* beta = (ElemType*) (alpha + iNumLab);
    ElemType* zeta = (ElemType*) (beta + iNumLab);
-   ElemType pair_scores[1024];
+   ElemType pair_scores[1024]; // [j=0..iNumLab-1]

    if (id < 0 || id >= iNumLab)
        return;
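The CRF recursions above run in log space; logaddk is CNTK's log-add helper. A hedged sketch (not the project's actual implementation, which also floors at LZERO) of the standard numerically stable form it computes:

// Sketch: log(exp(a) + exp(b)) without overflow, by factoring out the larger argument.
template <typename T>
__device__ T logaddkSketch(T a, T b)
{
    T hi = a > b ? a : b;
    T lo = a > b ? b : a;
    return hi + log1p(exp(lo - hi)); // log(1 + exp(lo - hi)) stays well-conditioned
}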
@@ -2290,7 +2290,7 @@ ElemType GPUSparseMatrix<ElemType>::SumOfElements() const
    ElemType* d_sum = TracingGPUMemoryAllocator::Allocate<ElemType>(GetComputeDeviceId(), 1);
    ElemType h_sum;
    // WARNING: THIS kernel is not the most efficient way!
-   _reductionSum<ElemType><<<1, 1024>>>(NzValues(), d_sum, (LONG64) GetNumNZElements());
+   _reductionSum1024Threads<ElemType><<<1, 1024>>>(NzValues(), d_sum, (LONG64) GetNumNZElements());
    CUDA_CALL(cudaMemcpy(&h_sum, d_sum, sizeof(ElemType), cudaMemcpyDeviceToHost));
    TracingGPUMemoryAllocator::Free<ElemType>(GetComputeDeviceId(), d_sum);

@@ -2307,7 +2307,7 @@ ElemType GPUSparseMatrix<ElemType>::FrobeniusNorm() const
    ElemType* d_sum = TracingGPUMemoryAllocator::Allocate<ElemType>(GetComputeDeviceId(), 1);
    ElemType h_sum = 0;
    // WARNING: THIS kernel is not the most efficient way!
-   _reductionSum2<ElemType><<<1, 1024>>>(NzValues(), d_sum, (int) GetNumNZElements());
+   _reductionSum21024Threads<ElemType><<<1, 1024>>>(NzValues(), d_sum, (int) GetNumNZElements());
    CUDA_CALL(cudaMemcpy(&h_sum, d_sum, sizeof(ElemType), cudaMemcpyDeviceToHost));
    TracingGPUMemoryAllocator::Free<ElemType>(GetComputeDeviceId(), d_sum);

@@ -2326,7 +2326,7 @@ ElemType GPUSparseMatrix<ElemType>::MatrixNormInf() const
    ElemType* d_maxAbs = TracingGPUMemoryAllocator::Allocate<ElemType>(GetComputeDeviceId(), 1);
    ElemType h_maxAbs = 0;
    // WARNING: THIS kernel is not the most efficient way!
-   _reductionMatrixNormInf<ElemType><<<1, 1024>>>(NzValues(), d_maxAbs, (int) GetNumNZElements());
+   _reductionMatrixNormInf1024Threads<ElemType><<<1, 1024>>>(NzValues(), d_maxAbs, (int) GetNumNZElements());
    CUDA_CALL(cudaMemcpy(&h_maxAbs, d_maxAbs, sizeof(ElemType), cudaMemcpyDeviceToHost));
    TracingGPUMemoryAllocator::Free<ElemType>(GetComputeDeviceId(), d_maxAbs);
@@ -2689,7 +2689,6 @@ template void GPUSparseMatrix<char>::CopyToCPUSparseMatrix(CPUSparseMatrix<char>
template void GPUSparseMatrix<char>::ChangeDeviceTo(int);
template void GPUSparseMatrix<char>::Resize(const size_t, const size_t, const size_t, const bool);
template void GPUSparseMatrix<char>::RequireSizeAndAllocate(const size_t, const size_t, const size_t, const bool, const bool);
-template void GPUSparseMatrix<int>::RequireSizeAndAllocate(const size_t, const size_t, const size_t, const bool, const bool);
template void GPUSparseMatrix<char>::Reset();
template GPUSPARSE_INDEX_TYPE GPUSparseMatrix<char>::SecondaryIndexValueAt(size_t) const;
template GPUSparseMatrix<char>::~GPUSparseMatrix();

@@ -2699,8 +2698,32 @@ template GPUSparseMatrix<char>& GPUSparseMatrix<char>::operator=(GPUSparseMatrix
template void GPUSparseMatrix<char>::Reshape(const size_t, const size_t);
template void GPUSparseMatrix<char>::ScaleAndAdd(char, GPUSparseMatrix<char> const &, GPUMatrix<char> &);

+// Support <short>
+template GPUSparseMatrix<short>::GPUSparseMatrix(DEVICEID_TYPE, const MatrixFormat);
+template GPUSparseMatrix<short>::GPUSparseMatrix(const size_t, const size_t, const size_t, DEVICEID_TYPE, const MatrixFormat);
+template GPUSparseMatrix<short>::GPUSparseMatrix(GPUSparseMatrix<short> const&);
+template GPUSparseMatrix<short>::GPUSparseMatrix(GPUSparseMatrix<short>&&);
+template void GPUSparseMatrix<short>::SetValue(CPUSparseMatrix<short> const&);
+template void GPUSparseMatrix<short>::SetValue(GPUSparseMatrix<short> const&);
+template void GPUSparseMatrix<short>::SetValue(GPUMatrix<short> const&);
+//template void GPUSparseMatrix<short>::SetValue(CPUMatrix<short> const&);
+template void GPUSparseMatrix<short>::CopyToDenseMatrix(GPUMatrix<short>&) const;
+template void GPUSparseMatrix<short>::CopyToCPUSparseMatrix(CPUSparseMatrix<short>&) const;
+template void GPUSparseMatrix<short>::ChangeDeviceTo(int);
+template void GPUSparseMatrix<short>::Resize(const size_t, const size_t, const size_t, const bool);
+template void GPUSparseMatrix<short>::RequireSizeAndAllocate(const size_t, const size_t, const size_t, const bool, const bool);
+template void GPUSparseMatrix<short>::Reset();
+template GPUSPARSE_INDEX_TYPE GPUSparseMatrix<short>::SecondaryIndexValueAt(size_t) const;
+template GPUSparseMatrix<short>::~GPUSparseMatrix();
+template GPUSparseMatrix<short> GPUSparseMatrix<short>::ColumnSlice(size_t, size_t) const;
+template GPUMatrix<short> GPUSparseMatrix<short>::CopyColumnSliceToDense(size_t, size_t) const;
+template GPUSparseMatrix<short>& GPUSparseMatrix<short>::operator=(GPUSparseMatrix<short>&&);
+template void GPUSparseMatrix<short>::Reshape(const size_t, const size_t);
+template void GPUSparseMatrix<short>::ScaleAndAdd(short, GPUSparseMatrix<short> const &, GPUMatrix<short> &);
+
+template GPUSparseMatrix<int>::GPUSparseMatrix(DEVICEID_TYPE, const MatrixFormat);
+template GPUSparseMatrix<int>::~GPUSparseMatrix();
+template void GPUSparseMatrix<int>::RequireSizeAndAllocate(const size_t, const size_t, const size_t, const bool, const bool);

template <class ElemType>
MATH_API File& operator>>(File& stream, GPUSparseMatrix<ElemType>& us)
@@ -19,6 +19,7 @@
#include <cuda_runtime.h>
#include "cublas_v2.h"
#include <assert.h>
+#include <limits.h>

#ifndef let
#define let const auto

@@ -47,9 +48,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
//  - supports general strides
//  - input broadcasting is supported by stride=0
//  - the operation is denoted by an opCode
-//  - reduction is supported, including summation (dual to broadcasting when computing gradients)
-//  - reduction operation is given by an opCode. Only a few specific opCodes may be used for reduction.
-//    Note: reduction opCodes are not implemented yet, only summation is supported.
+//  - reduction is supported, including summation, min, max (dual to broadcasting when computing gradients)
+//  - reduction operation is given by an opCode: opSum, opMin, opMax and opLogSum.
//
// This library makes extensive use of templates and macros.
// Specifically, templates are used recursively to recurse over tensor dimensions.
@@ -261,6 +261,65 @@ struct TensorOps
    }
};

+//----------------------------------------------------------------------------
+// For reductions we need the neutral elements of the corresponding binary ops
+//----------------------------------------------------------------------------
+
+template <typename ElemType> __device__ ElemType NeutralValue(ElementWiseOperator op)
+{
+    return 0; // error: only the explicit specializations below should be used
+};
+
+template <> __device__ float NeutralValue<float>(ElementWiseOperator op)
+{
+    switch (op)
+    {
+    case ElementWiseOperator::opSum:    return 0;
+    case ElementWiseOperator::opLogSum: return -INFINITY;
+    case ElementWiseOperator::opMin:    return FLT_MAX;
+    case ElementWiseOperator::opMax:    return FLT_MIN;
+    default:                            return 0; // error
+    }
+};
+
+template <> __device__ double NeutralValue<double>(ElementWiseOperator op)
+{
+    switch (op)
+    {
+    case ElementWiseOperator::opSum:    return 0;
+    case ElementWiseOperator::opLogSum: return -INFINITY;
+    case ElementWiseOperator::opMin:    return DBL_MAX;
+    case ElementWiseOperator::opMax:    return DBL_MIN;
+    default:                            return 0; // error
+    }
+};
+
+// ----------------------------------------------------------------------------
+// Function to update an aggregate value for the specified reduction operation
+// ----------------------------------------------------------------------------
+
+template <typename ReductionType, class ElemType> __device__ void UpdateAggregate(ReductionType& aggregate, ElemType val, ElementWiseOperator reductionOp)
+{
+    switch (reductionOp)
+    {
+    case ElementWiseOperator::opSum:
+        aggregate += val;
+        break;
+    case ElementWiseOperator::opLogSum:
+        aggregate = OpLogSum(aggregate, val);
+        break;
+    case ElementWiseOperator::opMin:
+        if (val < aggregate)
+            aggregate = val;
+        break;
+    case ElementWiseOperator::opMax:
+        if (val > aggregate)
+            aggregate = val;
+        break;
+    }
+};

// -----------------------------------------------------------------------
// function to compute the value for a given output location (including reduction)
// -----------------------------------------------------------------------
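A host-style sketch (ignoring the __device__ qualifiers) of how the two new helpers pair up, which is exactly the pattern the kernels below follow. One caution for the reader, not a change to the commit: FLT_MIN and DBL_MIN are the smallest positive normalized magnitudes, so as a start value for opMax they are only neutral when the reduced values are non-negative; the exact identity element would be -FLT_MAX/-DBL_MAX (or -INFINITY).

// Fold a buffer with the neutral-element + update pattern defined above.
float aggregate = NeutralValue<float>(ElementWiseOperator::opSum); // 0 for opSum
for (int i = 0; i < n; i++)
    UpdateAggregate<float, float>(aggregate, values[i], ElementWiseOperator::opSum);
// aggregate == values[0] + ... + values[n-1]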
@@ -272,12 +331,12 @@ template <class ElemType, C_size_t N, C_int M, C_int m>
struct TensorOpReduce
{
    // this version for m >= 0
-   static __device__ ElemType Compute(FixedArray<ElemType*, N> pointers, ElementWiseOperator op,
+   static __device__ ElemType Compute(FixedArray<ElemType*, N> pointers, ElementWiseOperator op, ElementWiseOperator reductionOp,
                                       const FixedArray<C_unsigned_int, M>& reducingOpDims, const FixedMatrix<C_int, N, M>& reducingStrides)
    {
        // start with index 0
        // We may use 'double' since we are memory-bound anyway.
-       ReduceElemType aggregate = TensorOpReduce<ElemType, N, M, m - 1>::Compute(pointers, op, reducingOpDims, reducingStrides);
+       ReduceElemType aggregate = TensorOpReduce<ElemType, N, M, m - 1>::Compute(pointers, op, reductionOp, reducingOpDims, reducingStrides);
        // apply this index to the pointers
        C_size_t dim = reducingOpDims[m];
        for (C_size_t k = 1 /*done with k=0 already*/; k < dim; k++)

@@ -285,8 +344,8 @@ struct TensorOpReduce
            // bump the pointers
            for (C_size_t i = 0; i < N - 1; i++) // N-1 because output is not used here
                pointers[i] += reducingStrides(i, (C_size_t) m);
-           ElemType val = TensorOpReduce<ElemType, N, M, m - 1>::Compute(pointers, op, reducingOpDims, reducingStrides);
-           aggregate += val;
+           ElemType val = TensorOpReduce<ElemType, N, M, m - 1>::Compute(pointers, op, reductionOp, reducingOpDims, reducingStrides);
+           UpdateAggregate<ReduceElemType, ElemType>(aggregate, val, reductionOp);
        }
        return (ElemType) aggregate;
    }

@@ -299,7 +358,7 @@ struct TensorOpReduce<ElemType, N, M, /*m=*/-1>
{
    // this version for m = -1
    // the pointers are pointing to the right location(s) to take the operation over
-   static __device__ ElemType Compute(FixedArray<ElemType*, N> pointers, ElementWiseOperator op,
+   static __device__ ElemType Compute(FixedArray<ElemType*, N> pointers, ElementWiseOperator op, ElementWiseOperator reductionOp,
                                       const FixedArray<C_unsigned_int, M>& /*reducingOpDims*/, const FixedMatrix<C_int, N, M>& /*reducingStrides*/)
    {
        return TensorOps<ElemType>::Compute(pointers, op); // finally computing something!

@@ -354,7 +413,7 @@ template <class ElemType, C_size_t N, C_int M, C_int K, bool parallelReduce, C_i
struct TensorOpElement
{
    // template-recursive version loops over indices
-   static __device__ void Compute(CUDA_LONG id, ElemType beta, FixedArray<ElemType*, N>& pointers, ElemType alpha, ElementWiseOperator op,
+   static __device__ void Compute(CUDA_LONG id, ElemType beta, FixedArray<ElemType*, N>& pointers, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
                                   const FixedArray<C_unsigned_int, K>& regularOpStrides, const FixedMatrix<C_int, N, K>& regularStrides,
                                   const FixedArray<C_unsigned_int, M>& reducingOpDims, const FixedMatrix<C_int, N, M>& reducingStrides,
                                   CUDA_LONG reductionBegin, CUDA_LONG reductionChunkSize)

@@ -367,7 +426,7 @@ struct TensorOpElement
        for (C_size_t i = 0; i < N; i++)
            pointers[i] += index * regularStrides(i, (C_size_t) k); // now this dimension is taken care of
        // process the previous index
-       TensorOpElement<ElemType, N, M, K, parallelReduce, k - 1>::Compute(id, beta, pointers, alpha, op, regularOpStrides, regularStrides, reducingOpDims, reducingStrides, reductionBegin, reductionChunkSize);
+       TensorOpElement<ElemType, N, M, K, parallelReduce, k - 1>::Compute(id, beta, pointers, alpha, op, reductionOp, regularOpStrides, regularStrides, reducingOpDims, reducingStrides, reductionBegin, reductionChunkSize);
    }
};

@@ -376,7 +435,7 @@ template <class ElemType, C_size_t N, C_int M, C_int K, bool parallelReduce>
struct TensorOpElement<ElemType, N, M, K, parallelReduce, /*k=*/0>
{
    // template-recursive version loops over indices
-   static __device__ void Compute(CUDA_LONG id, ElemType beta, FixedArray<ElemType*, N>& pointers, ElemType alpha, ElementWiseOperator op,
+   static __device__ void Compute(CUDA_LONG id, ElemType beta, FixedArray<ElemType*, N>& pointers, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
                                   const FixedArray<C_unsigned_int, K>& regularOpStrides, const FixedMatrix<C_int, N, K>& regularStrides,
                                   const FixedArray<C_unsigned_int, M>& reducingOpDims, const FixedMatrix<C_int, N, M>& reducingStrides,
                                   CUDA_LONG reductionBegin, CUDA_LONG reductionChunkSize)

@@ -387,7 +446,7 @@ struct TensorOpElement<ElemType, N, M, K, parallelReduce, /*k=*/0>
        for (C_size_t i = 0; i < N; i++)
            pointers[i] += index * regularStrides(i, 0); // now this dimension is taken care of
        // process the previous index
-       TensorOpElement<ElemType, N, M, K, parallelReduce, -1>::Compute(/*id*/ 0, beta, pointers, alpha, op, regularOpStrides, regularStrides, reducingOpDims, reducingStrides, reductionBegin, reductionChunkSize);
+       TensorOpElement<ElemType, N, M, K, parallelReduce, -1>::Compute(/*id*/ 0, beta, pointers, alpha, op, reductionOp, regularOpStrides, regularStrides, reducingOpDims, reducingStrides, reductionBegin, reductionChunkSize);
    }
};

@@ -397,13 +456,13 @@ struct TensorOpElement<ElemType, N, M, K, /*parallelReduce=*/false, /*k=*/-1>
{
    // template-recursion-terminating version computes the actual value for this output location
    // now the output pointers point to the right element (input pointers may still iterate for reduction)
-   static __device__ void Compute(CUDA_LONG /*id*/, ElemType beta, FixedArray<ElemType*, N>& pointers, ElemType alpha, ElementWiseOperator op,
+   static __device__ void Compute(CUDA_LONG /*id*/, ElemType beta, FixedArray<ElemType*, N>& pointers, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
                                   const FixedArray<C_unsigned_int, K>& /*regularOpStrides*/, const FixedMatrix<C_int, N, K>& /*regularStrides*/,
                                   const FixedArray<C_unsigned_int, M>& reducingOpDims, const FixedMatrix<C_int, N, M>& reducingStrides, CUDA_LONG /*reductionBegin*/, CUDA_LONG /*reductionChunkSize*/)
    {
        // compute the operation for this output coordinate
        // This may still involve a reduction over inverse-broadcasting dimensions.
-       ElemType val = TensorOpReduce<ElemType, N, M, M - 1>::Compute(pointers, op, reducingOpDims, reducingStrides);
+       ElemType val = TensorOpReduce<ElemType, N, M, M - 1>::Compute(pointers, op, reductionOp, reducingOpDims, reducingStrides);
        // scale
        val *= alpha;
        // combine with previous value in target matrix, then write it out

@@ -423,7 +482,7 @@ struct TensorOpElement<ElemType, N, M, K, /*parallelReduce=*/true, /*k=*/-1>
{
    // template-recursion-terminating version computes the actual value for this output location
    // now the output pointers point to the right element (input pointers may still iterate for reduction)
-   static __device__ void Compute(CUDA_LONG /*id*/, ElemType beta, FixedArray<ElemType*, N>& pointers, ElemType alpha, ElementWiseOperator op,
+   static __device__ void Compute(CUDA_LONG /*id*/, ElemType beta, FixedArray<ElemType*, N>& pointers, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
                                   const FixedArray<C_unsigned_int, K>& /*regularOpStrides*/, const FixedMatrix<C_int, N, K>& /*regularStrides*/,
                                   const FixedArray<C_unsigned_int, M>& reducingOpDims, const FixedMatrix<C_int, N, M>& reducingStrides, CUDA_LONG reductionBegin, CUDA_LONG reductionChunkSize)
    {

@@ -442,22 +501,24 @@ struct TensorOpElement<ElemType, N, M, K, /*parallelReduce=*/true, /*k=*/-1>
        CUDA_LONG reductionEnd = min(reductionBegin + reductionChunkSize, reductionDim);

        // compute the operation for this input coordinate
-       ReduceElemType sum = 0;
+       ReduceElemType aggregate = NeutralValue<ReduceElemType>(reductionOp);
        for (CUDA_LONG redId = reductionBegin + tid; redId < reductionEnd; redId += tids)
        {
            auto val = TensorOpParallelReduce<ElemType, N, M, M - 1>::Compute(redId, pointers, op, reducingOpDims, reducingStrides);
-           sum += val;
+           UpdateAggregate<ReduceElemType, ElemType>(aggregate, val, reductionOp);
        }

        // reduce --cf https://docs.nvidia.com/cuda/samples/6_Advanced/reduction/doc/reduction.pdf
        __shared__ ReduceElemType volatile accumulators[GridDim::maxThreadsPerBlock /*tids*/];
-       accumulators[tid] = sum;
+       accumulators[tid] = aggregate;
        __syncthreads();
-       static_assert(GridDim::maxThreadsPerBlock <= 512, "GridDim::maxThreadsPerBlock too large, need to add manually unrolled steps");
-       for (CUDA_LONG i = 256; i; i >>= 1)
+       static_assert(GridDim::maxThreadsPerBlock <= 1024, "GridDim::maxThreadsPerBlock too large, need to add manually unrolled steps");
+       for (CUDA_LONG i = 512; i; i >>= 1)
        {
            if (tid < i && tid + i < tids)
-               accumulators[tid] += accumulators[tid + i];
+               UpdateAggregate<volatile ReduceElemType, volatile ReduceElemType>(accumulators[tid], accumulators[tid + i], reductionOp);

            if (0 + i < tids)
                __syncthreads(); // sync if condition true for at least one thread
        // TODO: use volatile* and then we can skip the __syncthreads() for the last 32 values. See Amit's allreduce() function implementation in MatrixQuantizer_kernel.cu.
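Why the static_assert and the loop bound move from 512/256 to 1024/512: each pass folds the upper half of the live accumulator range into the lower half, so the first stride must be half the maximum block size. In isolation, for the opSum case:

// accumulators[0..tids) hold per-thread partials; after the loop, accumulators[0] is the block total.
for (CUDA_LONG i = 512; i; i >>= 1) // 512 = 1024/2, matching the new GridDim::maxThreadsPerBlock
{
    if (tid < i && tid + i < tids)
        accumulators[tid] += accumulators[tid + i]; // UpdateAggregate generalizes this per reductionOp
    __syncthreads();
}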
@@ -496,13 +557,13 @@ struct TensorOpElement<ElemType, N, M, K, /*parallelReduce=*/true, /*k=*/-1>

// launch tensor op with CUDA
template <class ElemType, C_size_t N, C_int M, C_int K>
-__global__ void _launchTensorOp(ElemType beta, FixedArray<ElemType*, N> pointers, ElemType alpha, ElementWiseOperator op,
+__global__ void _launchTensorOp(ElemType beta, FixedArray<ElemType*, N> pointers, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
                                FixedArray<C_unsigned_int, K> regularOpStrides, FixedMatrix<C_int, N, K> regularStrides, CUDA_LONG numElements,
                                FixedArray<C_unsigned_int, M> reducingOpDims, FixedMatrix<C_int, N, M> reducingStrides)
{
    CUDA_LONG id = GridDim::GetLinearThreadId();
    if (id < numElements) // note: there are no __syncthread() calls inside
-       TensorOpElement<ElemType, N, M, K, false, K - 1>::Compute(id, beta, pointers, alpha, op, regularOpStrides, regularStrides, reducingOpDims, reducingStrides, 0, 0);
+       TensorOpElement<ElemType, N, M, K, false, K - 1>::Compute(id, beta, pointers, alpha, op, reductionOp, regularOpStrides, regularStrides, reducingOpDims, reducingStrides, 0, 0);
}

template <class ElemType, C_size_t N, C_int K>

@@ -527,7 +588,7 @@ static void LaunchTensorOp(ElemType beta, array<ElemType*, N> pointerVector, Ele
    CUDA_LONG NN = (CUDA_LONG) numElements; // linear space identifying each individual input element
    SyncGuard syncGuard;
    GridDim grid(NN);
-   _launchTensorOp<ElemType, N, /*M=*/0, K><<<grid.m_blocksPerGrid, grid.m_threadsPerBlock, 0, t_stream>>>(beta, pointers, alpha, op, regularOpStrides, regularStrides, grid.m_N, reducingOpDims, reducingStrides);
+   _launchTensorOp<ElemType, N, /*M=*/0, K><<<grid.m_blocksPerGrid, grid.m_threadsPerBlock, 0, t_stream>>>(beta, pointers, alpha, op, (ElementWiseOperator)(-1) /* dummy reductionOp */, regularOpStrides, regularStrides, grid.m_N, reducingOpDims, reducingStrides);
}

// -----------------------------------------------------------------------

@@ -535,7 +596,7 @@ static void LaunchTensorOp(ElemType beta, array<ElemType*, N> pointerVector, Ele
// -----------------------------------------------------------------------

template <class ElemType, C_size_t N, C_int M, C_int K>
-__global__ void _launchTensorOpWithReduction(ElemType beta, FixedArray<ElemType*, N> pointers, ElemType alpha, ElementWiseOperator op,
+__global__ void _launchTensorOpWithReduction(ElemType beta, FixedArray<ElemType*, N> pointers, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
                                             FixedArray<C_unsigned_int, K> regularOpStrides, FixedMatrix<C_int, N, K> regularStrides, CUDA_LONG numElements,
                                             FixedArray<C_unsigned_int, M> reducingOpDims, FixedMatrix<C_int, N, M> reducingStrides,
                                             CUDA_LONG reductionBegin, CUDA_LONG reductionChunkSize)

@@ -546,7 +607,7 @@ __global__ void _launchTensorOpWithReduction(ElemType beta, FixedArray<ElemType*
    pointers[pointers.size() - 1] += numElements * reductionBlock; // the output tensor is dense (no gaps); and there is one copy for each reduction block (those get further reduced into one later)
#endif
    if (id < numElements) // note: we have __syncthread() calls but only entire blocks in sync, so this is OK
-       TensorOpElement<ElemType, N, M, K, true, K - 1>::Compute(id, beta, pointers, alpha, op, regularOpStrides, regularStrides, reducingOpDims, reducingStrides, reductionBegin, reductionChunkSize);
+       TensorOpElement<ElemType, N, M, K, true, K - 1>::Compute(id, beta, pointers, alpha, op, reductionOp, regularOpStrides, regularStrides, reducingOpDims, reducingStrides, reductionBegin, reductionChunkSize);
}

// helper function to provide a reduction buffer

@@ -582,7 +643,7 @@ static shared_ptr<ElemType> GetReductionBuffer(size_t N)

// All dimensions (N-ariness, number of input dimensions K and number of reduction dimensions M) are bound to template parameters now.
template <class ElemType, C_size_t N, C_int M, C_int K>
-static void LaunchTensorOpWithReduction(ElemType beta, array<ElemType*, N> pointerVector, ElemType alpha, ElementWiseOperator op,
+static void LaunchTensorOpWithReduction(ElemType beta, array<ElemType*, N> pointerVector, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
                                        const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, N>& regularStrideVectors,
                                        const SmallVector<size_t>& reducingOpDimVector, const array<SmallVector<ptrdiff_t>, N>& reducingStrideVectors)
{

@@ -601,7 +662,7 @@ static void LaunchTensorOpWithReduction(ElemType beta, array<ElemType*, N> point
    FixedMatrix<C_int, N, M> reducingStrides(reducingStrideVectors);

    // launch the kernel
-   CUDA_LONG NN = (CUDA_LONG) numElements; // linear space identifying each individual input element
+   CUDA_LONG NN = (CUDA_LONG) numElements; // linear space identifying each individual output element
    SyncGuard syncGuard;

    // do some optimization for reductions

@@ -631,7 +692,7 @@ static void LaunchTensorOpWithReduction(ElemType beta, array<ElemType*, N> point
    {
        // we got enough elements to generate: do one element per thread, and reduction inside
        _launchTensorOp<ElemType, N, M, K><<<grid.m_blocksPerGrid, grid.m_threadsPerBlock, 0, t_stream>>>(
-           beta, pointers, alpha, op,
+           beta, pointers, alpha, op, reductionOp,
            regularOpStrides, regularStrides, grid.m_N,
            reducingOpDims, reducingStrides);
    }

@@ -684,9 +745,9 @@ static void LaunchTensorOpWithReduction(ElemType beta, array<ElemType*, N> point
    if (numReductionChunks == 1)
    {
        _launchTensorOpWithReduction<ElemType, N, M, K><<<dim3(numBlocksX, numBlocksY, numBlocksZ), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream>>>(
-           beta, pointers, alpha, op,
+           beta, pointers, alpha, op, reductionOp,
            regularOpStrides, regularStrides, NN,
-           reducingOpDims, reducingStrides, 0, reductionChunkSize);
+           reducingOpDims, reducingStrides, /*reductionBegin*/ 0, reductionChunkSize);
    }
    // --- case (b)
    // Reduction across blocks. This is the difficult one.

@@ -721,7 +782,7 @@ static void LaunchTensorOpWithReduction(ElemType beta, array<ElemType*, N> point
        ElemType beta1 = 0;
        ElemType alpha1 = 1;
        _launchTensorOpWithReduction<ElemType, N, M, K><<<dim3(numBlocksX, numBlocksY, numBlocksZ), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream>>>(
-           beta1, pointers1, alpha1, op,
+           beta1, pointers1, alpha1, op, reductionOp,
            regularOpStrides, regularStrides1, NN,
            reducingOpDims, reducingStrides, /*reductionBegin*/ 0, reductionChunkSize);

@@ -738,14 +799,14 @@ static void LaunchTensorOpWithReduction(ElemType beta, array<ElemType*, N> point
        const array<SmallVector<ptrdiff_t>, 2> reducingStrideVectors2{ SmallVector<ptrdiff_t>{ NN }, SmallVector<ptrdiff_t>{ 0 } };
        const SmallVector<size_t> reducingOpDimVector2{ (size_t) numReductionChunks };
        LaunchTensorOpWithReduction<ElemType, /*N=*/2, /*M=*/1, K>(
-           beta, pointerVector2, alpha, ElementWiseOperator::opCopy,
+           beta, pointerVector2, alpha, ElementWiseOperator::opCopy, reductionOp,
            regularOpDims, regularStrideVectors2,
            reducingOpDimVector2, reducingStrideVectors2);
        // (note: ^^this will have a nested syncGuard, which is fine)

#else
        _launchTensorOp<ElemType, N, M, K><<<grid.m_blocksPerGrid, grid.m_threadsPerBlock, 0, t_stream>>>(
-           beta, pointers, alpha, op,
+           beta, pointers, alpha, op, reductionOp,
            regularOpStrides, regularStrides, grid.m_N,
            reducingOpDims, reducingStrides);
        //for (size_t z = 0; z < numBlocksZ; z++)

@@ -768,16 +829,16 @@ static void LaunchTensorOpWithReduction(ElemType beta, array<ElemType*, N> point
    else if (beta == 1)
    {
        // no need to pre-scale; just add (common for gradients)
-       _launchTensorOpWithReduction<ElemType, N, M, K><<<dim3(numBlocksX, numBlocksY, numBlocksZ), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream>>>(beta, pointers, alpha, op, regularOpStrides, regularStrides, NN, reducingOpDims, reducingStrides, 0, reductionChunkSize);
+       _launchTensorOpWithReduction<ElemType, N, M, K><<<dim3(numBlocksX, numBlocksY, numBlocksZ), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream>>>(beta, pointers, alpha, op, reductionOp, regularOpStrides, regularStrides, NN, reducingOpDims, reducingStrides, 0, reductionChunkSize);
        return;
    }
    else
    {
        // We need more than one chunk; we will use atomicAdd().
        // First reset/pre-multiply the input; then do the remaining chunks using atomicAdd().
-       _launchTensorOpWithReduction<ElemType, N, M, K><<<dim3(numBlocksX, numBlocksY, 1), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream>>>(beta, pointers, alpha, op, regularOpStrides, regularStrides, NN, reducingOpDims, reducingStrides, 0, reductionChunkSize);
+       _launchTensorOpWithReduction<ElemType, N, M, K><<<dim3(numBlocksX, numBlocksY, 1), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream>>>(beta, pointers, alpha, op, reductionOp, regularOpStrides, regularStrides, NN, reducingOpDims, reducingStrides, 0, reductionChunkSize);
        // We will leave it like this for a while, but eventually need to revisit using temporary memory.
-       _launchTensorOpWithReduction<ElemType, N, M, K><<<dim3(numBlocksX, numBlocksY, numBlocksZ - 1), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream>>>(/*beta=*/1, pointers, alpha, op, regularOpStrides, regularStrides, NN, reducingOpDims, reducingStrides, reductionChunkSize, reductionChunkSize);
+       _launchTensorOpWithReduction<ElemType, N, M, K><<<dim3(numBlocksX, numBlocksY, numBlocksZ - 1), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream>>>(/*beta=*/1, pointers, alpha, op, reductionOp, regularOpStrides, regularStrides, NN, reducingOpDims, reducingStrides, reductionChunkSize, reductionChunkSize);
    }
#endif
}
@@ -856,7 +917,7 @@ void LaunchUnaryTensorOp(ElemType beta, const ElemType* pa, ElemType* pb, ElemTy

// tensor operation with k+1 dimensions (-1 means scalar)
template <class ElemType, C_size_t N, C_int K>
-static void TensorOpWithRegularLoop(ElemType beta, const array<ElemType*, N>& pointers, ElemType alpha, ElementWiseOperator op,
+static void TensorOpWithRegularLoop(ElemType beta, const array<ElemType*, N>& pointers, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
                                    const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, N>& regularStrides,
                                    const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides)
{

@@ -864,9 +925,9 @@ static void TensorOpWithRegularLoop(ElemType beta, const array<ElemType*, N>& po
    switch (dims)
    {
    case 2:
-       return LaunchTensorOpWithReduction<ElemType, N, 2, K>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
+       return LaunchTensorOpWithReduction<ElemType, N, 2, K>(beta, pointers, alpha, op, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
    case 1:
-       return LaunchTensorOpWithReduction<ElemType, N, 1, K>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
+       return LaunchTensorOpWithReduction<ElemType, N, 1, K>(beta, pointers, alpha, op, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
    case 0:
        return LaunchTensorOp<ElemType, N, K>(beta, pointers, alpha, op, regularOpDims, regularStrides);
    default:

@@ -877,7 +938,7 @@ static void TensorOpWithRegularLoop(ElemType beta, const array<ElemType*, N>& po
// tensor operation, generalized in number of arguments
// This function now expands into different k. It also eliminates the offsets by adding them to the pointers.
template <class ElemType, C_size_t N>
-void TensorOpN(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, ElementWiseOperator op,
+void TensorOpN(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
               const array<size_t, N>& offsets,
               const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, N>& regularStrides,
               const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides)

@@ -888,15 +949,15 @@ void TensorOpN(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, Elem
    switch (dims)
    {
    case 4:
-       return TensorOpWithRegularLoop<ElemType, N, 4>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
+       return TensorOpWithRegularLoop<ElemType, N, 4>(beta, pointers, alpha, op, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
    case 3:
-       return TensorOpWithRegularLoop<ElemType, N, 3>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
+       return TensorOpWithRegularLoop<ElemType, N, 3>(beta, pointers, alpha, op, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
    case 2:
-       return TensorOpWithRegularLoop<ElemType, N, 2>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
+       return TensorOpWithRegularLoop<ElemType, N, 2>(beta, pointers, alpha, op, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
    case 1:
-       return TensorOpWithRegularLoop<ElemType, N, 1>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
+       return TensorOpWithRegularLoop<ElemType, N, 1>(beta, pointers, alpha, op, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
    case 0:
-       return TensorOpWithRegularLoop<ElemType, N, 0>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
+       return TensorOpWithRegularLoop<ElemType, N, 0>(beta, pointers, alpha, op, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
    default:
        LogicError("TensorOp: %d non-flattened input dimensions are not supported.", (C_int) dims);
    }

@@ -906,27 +967,27 @@ void TensorOpN(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, Elem
// explicit instantiations--these are being called from GPUMatrix.cu
//------------------------------------------------------------------------

-template void TensorOpN<float, 2>(float beta, array<float*, 2> pointers, float alpha, ElementWiseOperator op,
+template void TensorOpN<float, 2>(float beta, array<float*, 2> pointers, float alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
                                  const array<size_t, 2>& offsets,
                                  const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
                                  const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides);
-template void TensorOpN<float, 3>(float beta, array<float*, 3> pointers, float alpha, ElementWiseOperator op,
+template void TensorOpN<float, 3>(float beta, array<float*, 3> pointers, float alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
                                  const array<size_t, 3>& offsets,
                                  const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 3>& regularStrides,
                                  const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 3>& reducingStrides);
-template void TensorOpN<float, 4>(float beta, array<float*, 4> pointers, float alpha, ElementWiseOperator op,
+template void TensorOpN<float, 4>(float beta, array<float*, 4> pointers, float alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
                                  const array<size_t, 4>& offsets,
                                  const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 4>& regularStrides,
                                  const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 4>& reducingStrides);
-template void TensorOpN<double, 2>(double beta, array<double*, 2> pointers, double alpha, ElementWiseOperator op,
+template void TensorOpN<double, 2>(double beta, array<double*, 2> pointers, double alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
                                   const array<size_t, 2>& offsets,
                                   const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
                                   const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides);
-template void TensorOpN<double, 3>(double beta, array<double*, 3> pointers, double alpha, ElementWiseOperator op,
+template void TensorOpN<double, 3>(double beta, array<double*, 3> pointers, double alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
                                   const array<size_t, 3>& offsets,
                                   const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 3>& regularStrides,
                                   const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 3>& reducingStrides);
-template void TensorOpN<double, 4>(double beta, array<double*, 4> pointers, double alpha, ElementWiseOperator op,
+template void TensorOpN<double, 4>(double beta, array<double*, 4> pointers, double alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
                                   const array<size_t, 4>& offsets,
                                   const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 4>& regularStrides,
                                   const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 4>& reducingStrides);
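A hedged sketch of a call after this change -- reducing a 1024-element vector into a scalar, with opCopy as the elementwise op and opSum as the new reductionOp. The stride layout mirrors the pointerVector2/reducingStrideVectors2 pattern used internally above; the concrete values are illustrative, not from the commit:

// out[0] = sum_i in[i], expressed as a 2-ary tensor op with a length-1024 reduction axis.
array<float*, 2> pointers{ devIn, devOut };     // inputs first, output last
array<size_t, 2> offsets{ 0, 0 };
SmallVector<size_t> regularOpDims{ 1 };         // one output element
array<SmallVector<ptrdiff_t>, 2> regularStrides{ SmallVector<ptrdiff_t>{ 0 }, SmallVector<ptrdiff_t>{ 1 } };
SmallVector<size_t> reducingOpDims{ 1024 };     // reduction axis length
array<SmallVector<ptrdiff_t>, 2> reducingStrides{ SmallVector<ptrdiff_t>{ 1 }, SmallVector<ptrdiff_t>{ 0 } }; // output stride 0: reduction is the dual of broadcasting
TensorOpN<float, 2>(/*beta=*/0.0f, pointers, /*alpha=*/1.0f,
                    ElementWiseOperator::opCopy, ElementWiseOperator::opSum,
                    offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);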
@@ -18,11 +18,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
#define C_unsigned_int CUDA_LONG

template <class ElemType, C_size_t N>
-void TensorOpN(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, ElementWiseOperator op,
+void TensorOpN(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
               const array<size_t, N>& offsets,
               const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, N>& regularStrides,
               const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides);

template <class ElemType>
void LaunchUnaryTensorOp(ElemType beta, const ElemType* pa, ElemType* pb, ElemType alpha, ElementWiseOperator op, size_t regularOpDim);
-} } }
+}}}
@@ -175,6 +175,7 @@
    <ClInclude Include="RNGHandle.h" />
    <ClInclude Include="TensorOps.h" />
    <ClInclude Include="TensorView.h" />
+   <ClInclude Include="Quantizers.h" />
    <None Include="GPUWatcher.cu" />
    <None Include="GPUWatcher.h">
      <FileType>CppHeader</FileType>

@@ -123,6 +123,7 @@
    <ClInclude Include="BlockMultiplierPlatform.h">
      <Filter>CPU</Filter>
    </ClInclude>
+   <ClInclude Include="Quantizers.h" />
  </ItemGroup>
  <ItemGroup>
    <None Include="GPUMatrix.h">
@@ -1139,7 +1139,12 @@ template <>
/*static*/ char Matrix<char>::MakeNan(size_t)
{
    return 0;
-} // (needed for completeness)
+} // (needed for completeness and to pass unit tests)
+template <>
+/*static*/ short Matrix<short>::MakeNan(size_t)
+{
+    return 0;
+} // (needed for completeness and to pass unit tests)

template <class ElemType>
void Matrix<ElemType>::MaskColumnsValue(const Matrix<char>& columnsMask, ElemType val)
@@ -4289,7 +4294,8 @@ void Matrix<ElemType>::BatchNormalizationForward(const Matrix<ElemType>& scale,
 }
 
 template <class ElemType>
-void Matrix<ElemType>::BatchNormalizationBackward(const Matrix<ElemType>& in, Matrix<ElemType>& grad, const Matrix<ElemType>& scale, const Matrix<ElemType>& saveMean, const Matrix<ElemType>& saveInvStdDev,
+void Matrix<ElemType>::BatchNormalizationBackward(const Matrix<ElemType>& in, Matrix<ElemType>& grad, const Matrix<ElemType>& scale, double blendFactor,
+    const Matrix<ElemType>& saveMean, const Matrix<ElemType>& saveInvStdDev,
     Matrix<ElemType>& scaleGrad, Matrix<ElemType>& biasGrad) const
 {
     DecideAndMoveToRightDevice(*this, grad);
@@ -4297,10 +4303,10 @@ void Matrix<ElemType>::BatchNormalizationBackward(const Matrix<ElemType>& in, Ma
     // REVIEW alexeyk: add sparse version.
     DISPATCH_MATRIX_ON_FLAG(this,
        this,
-       m_CPUMatrix->BatchNormalizationBackward(*(in.m_CPUMatrix), *(grad.m_CPUMatrix), *(scale.m_CPUMatrix),
+       m_CPUMatrix->BatchNormalizationBackward(*(in.m_CPUMatrix), *(grad.m_CPUMatrix), *(scale.m_CPUMatrix), blendFactor,
           *(saveMean.m_CPUMatrix), *(saveInvStdDev.m_CPUMatrix),
           *(scaleGrad.m_CPUMatrix), *(biasGrad.m_CPUMatrix)),
-       m_GPUMatrix->BatchNormalizationBackward(*(in.m_GPUMatrix), *(grad.m_GPUMatrix), *(scale.m_GPUMatrix),
+       m_GPUMatrix->BatchNormalizationBackward(*(in.m_GPUMatrix), *(grad.m_GPUMatrix), *(scale.m_GPUMatrix), blendFactor,
           *(saveMean.m_GPUMatrix), *(saveInvStdDev.m_GPUMatrix),
           *(scaleGrad.m_GPUMatrix), *(biasGrad.m_GPUMatrix)),
        NOT_IMPLEMENTED,
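For context, DISPATCH_MATRIX_ON_FLAG routes the call to the dense CPU or dense GPU implementation, with NOT_IMPLEMENTED for the sparse variants, depending on where the matrix data currently resides. A rough sketch of the idea, assuming nothing about CNTK's actual macro beyond what is visible above (names here are illustrative):

#include <cstdio>
#include <stdexcept>

// Simplified location-based dispatch: run the branch matching the current
// backing store; throw for combinations that are not implemented.
enum class MatrixLocation { DenseCPU, DenseGPU, SparseCPU, SparseGPU };

template <class FCpu, class FGpu>
void DispatchOnLocation(MatrixLocation loc, FCpu denseCpu, FGpu denseGpu)
{
    switch (loc)
    {
    case MatrixLocation::DenseCPU: denseCpu(); break;
    case MatrixLocation::DenseGPU: denseGpu(); break;
    default: throw std::logic_error("NOT_IMPLEMENTED"); // sparse variants
    }
}

int main()
{
    DispatchOnLocation(MatrixLocation::DenseCPU,
                       [] { std::printf("CPU batch-norm backward\n"); },
                       [] { std::printf("GPU batch-norm backward\n"); });
}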
@@ -5401,6 +5407,7 @@ void Matrix<ElemType>::TensorOp(ElemType beta, const Matrix<ElemType>& a, const
        NOT_IMPLEMENTED);
 }
 
+//template class Matrix<short>;
 template class Matrix<float>;
 template class Matrix<double>;
 
@@ -5430,6 +5437,31 @@ template void Matrix<char>::Resize(const size_t numRows, const size_t numCols, c
 template void Matrix<char>::Reshape(const size_t, const size_t);
 template char* Matrix<char>::CopyToArray(void) const;
 
+// Matrix<short> methods
+template Matrix<short>::Matrix(DEVICEID_TYPE);
+template Matrix<short>::Matrix(Matrix<short>&&);
+template Matrix<short>::Matrix(const size_t numRows, const size_t numCols, DEVICEID_TYPE deviceId, const MatrixType matrixType, const MatrixFormat matrixFormat);
+template Matrix<short>::Matrix(const size_t numRows, const size_t numCols, short* pArray, DEVICEID_TYPE deviceId, const size_t matrixFlags, const size_t nnz);
+template Matrix<short>::~Matrix();
+template Matrix<short>& Matrix<short>::operator=(Matrix<short>&& moveFrom);
+template short* Matrix<short>::Data() const;
+template int Matrix<short>::GetDeviceId() const;
+template size_t Matrix<short>::GetNumElements() const;
+template Matrix<short> Matrix<short>::ColumnSlice(size_t startColumn, size_t numCols) const;
+template void Matrix<short>::_transferToDevice(int id_to, bool isBeingMoved, bool emptyTransfer) const;
+template void Matrix<short>::TransferToDeviceIfNotThere(int id_to, bool isBeingMoved, bool emptyTransfer, bool updatePreferredDevice) const;
+template size_t Matrix<short>::GetNumRows() const;
+template size_t Matrix<short>::GetNumCols() const;
+template void Matrix<short>::SetValue(const short);
+template void Matrix<short>::SetValue(size_t numRows, const size_t numCols, int deviceId, short* pArray, size_t matrixFlags);
+//template void Matrix<short>::SetValue(const Matrix<short>&, MatrixFormat);
+template void Matrix<short>::SetValue(const Matrix<short>&);
+template void Matrix<short>::AssignValuesOf(const Matrix<short>&);
+template bool Matrix<short>::IsEmpty() const;
+template void Matrix<short>::Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve, bool growOnly);
+template void Matrix<short>::Reshape(const size_t, const size_t);
+template short* Matrix<short>::CopyToArray(void) const;
+
 template Matrix<int>::Matrix(const size_t, const size_t, int*, DEVICEID_TYPE, const size_t, const size_t);
 
 }}}
@@ -503,7 +503,7 @@ public:
    void BatchNormalizationForward(const Matrix<ElemType>& scale, const Matrix<ElemType>& bias, double expAvgFactor, double blendFactor,
        Matrix<ElemType>& runMean, Matrix<ElemType>& runInvStdDev, Matrix<ElemType>& out, double epsilon,
        Matrix<ElemType>& saveMean, Matrix<ElemType>& saveInvStdDev) const;
-   void BatchNormalizationBackward(const Matrix<ElemType>& in, Matrix<ElemType>& grad, const Matrix<ElemType>& scale, const Matrix<ElemType>& saveMean, const Matrix<ElemType>& saveInvStdDev,
+   void BatchNormalizationBackward(const Matrix<ElemType>& in, Matrix<ElemType>& grad, const Matrix<ElemType>& scale, double blendFactor, const Matrix<ElemType>& saveMean, const Matrix<ElemType>& saveInvStdDev,
        Matrix<ElemType>& scaleGrad, Matrix<ElemType>& biasGrad) const;
 
 public:
@@ -708,6 +708,7 @@ void GPUSparseMatrix<ElemType>::ConvertBuffer(OutType* outBuffer, const InType*
 
 #pragma endregion Helper Functions
 
+template class MATH_API GPUSparseMatrix<short>;
 template class MATH_API GPUSparseMatrix<char>;
 template class MATH_API GPUSparseMatrix<float>;
 template class MATH_API GPUSparseMatrix<double>;
@@ -1832,7 +1833,7 @@ void GPUMatrix<ElemType>::BatchNormalizationForward(const GPUMatrix<ElemType>& s
 }
 
 template <class ElemType>
-void GPUMatrix<ElemType>::BatchNormalizationBackward(const GPUMatrix<ElemType>& in, GPUMatrix<ElemType>& grad, const GPUMatrix<ElemType>& scale,
+void GPUMatrix<ElemType>::BatchNormalizationBackward(const GPUMatrix<ElemType>& in, GPUMatrix<ElemType>& grad, const GPUMatrix<ElemType>& scale, double blendFactor,
     const GPUMatrix<ElemType>& saveMean, const GPUMatrix<ElemType>& saveInvStdDev,
     GPUMatrix<ElemType>& scaleGrad, GPUMatrix<ElemType>& biasGrad) const
 {
@@ -2216,6 +2217,7 @@ GPURNGHandle::GPURNGHandle(int deviceId, unsigned long seed)
 
 #pragma endregion GPURNGHandle functions
 
+template class GPUMatrix<short>;
 template class GPUMatrix<char>;
 template class GPUMatrix<float>;
 template class GPUMatrix<double>;
@@ -2276,6 +2278,9 @@ float CudaTimer::Elapsed()
     return 0;
 }
 
+/*static*/ void SyncGuard::EnableSync()
+{
+}
 } } }
 
 // define a dummy GPUWatcher class too
@@ -0,0 +1,106 @@
+//
+// Copyright (c) Microsoft. All rights reserved.
+// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
+//
+#pragma once
+#include "Basics.h"
+
+namespace Microsoft { namespace MSR { namespace CNTK {
+
+// RawType - input type to the quantizer. Currently CNTK supports float or double as RawType.
+// QuantizedType - output type of the quantizer
+template <class RawType, class QuantizedType>
+class QuantizerBase
+{
+public:
+    QuantizerBase()
+    {
+        rangeMax = std::numeric_limits<QuantizedType>::max();
+    }
+    virtual void Quantize(const ArrayRef<RawType>& input, ArrayRef<QuantizedType>& output) = 0;
+    virtual void Dequantize(const ArrayRef<QuantizedType>& input, ArrayRef<RawType>& output) = 0;
+
+protected:
+    QuantizedType rangeMax;
+};
+
+// Symmetric quantizer.
+// Quantization is achieved by
+// 1. Finding the absolute max of values to be quantized.
+// 2. Adjusting the absolute max with extraBits parameter.
+// 3. Scaling all values in the collection to be within the symmetric range of the QuantizedType
+template <class RawType, class QuantizedType>
+class SymmetricQuantizer : public QuantizerBase<RawType, QuantizedType>
+{
+    RawType m_quantizeFactor;
+    RawType m_inverseQuantizerFactor;
+    RawType m_absMax;
+public:
+    // elements - collection to be quantized
+    // extraBits decreases the quantization normalizer to prevent integer overflow during BLAS routines.
+    // Higher extraBits will decrease precision of quantization, but will make BLAS routines less prone to overflow.
+    // For quantization with shorts, recommended value of extraBits is 1-3.
+    // This constructor accepts the collection of RawType to initialize internal quantizer
+    // and then apply this quantizer to collections with similar range as the one it was initialized with.
+    SymmetricQuantizer(const ArrayRef<RawType>& input, size_t extraBits)
+    {
+        m_absMax = FindAbsMax(input);
+        Initialize(m_absMax, extraBits);
+    }
+
+    // absoluteMax - the range of the quantizer (normally represents maximum absolute value of the values in the collection to be quantized).
+    // extraBits - see comment in another ctor
+    SymmetricQuantizer(RawType absoluteMax, size_t extraBits)
+    {
+        Initialize(absoluteMax, extraBits);
+    }
+
+    // Perform quantization of the input collection, put result into pre-allocated output collection
+    virtual void Quantize(const ArrayRef<RawType>& input, ArrayRef<QuantizedType>& output)
+    {
+        assert(input.size() == output.size());
+
+        for (size_t i = 0; i < input.size(); i++)
+        {
+#ifdef _DEBUG
+            assert(abs(input[i]) <= m_absMax);
+#endif
+            output[i] = (QuantizedType) round((input[i] * m_quantizeFactor));
+        }
+    }
+
+    // Accept quantized collection as input, put de-quantization result into pre-allocated output collection.
+    virtual void Dequantize(const ArrayRef<QuantizedType>& input, ArrayRef<RawType>& output)
+    {
+        assert(input.size() == output.size());
+
+        for (size_t i = 0; i < input.size(); i++)
+        {
+            output[i] = (RawType)(input[i] * m_inverseQuantizerFactor);
+        }
+    }
+
+private:
+    // Find absolute maximum value
+    RawType FindAbsMax(const ArrayRef<RawType>& arrayRef)
+    {
+        RawType maxElem = *std::max_element(arrayRef.begin(), arrayRef.end());
+        RawType minElem = *std::min_element(arrayRef.begin(), arrayRef.end());
+
+        return std::max(maxElem, std::abs(minElem));
+    }
+
+    void Initialize(RawType absoluteMax, size_t extraBits)
+    {
+        RawType shiftedMax = absoluteMax * (1 << extraBits);
+        if (shiftedMax == 0)
+        {
+            LogicError("The absolute max element in the sequence to be quantized is 0.");
+        }
+        m_absMax = absoluteMax;
+        m_quantizeFactor = rangeMax / shiftedMax;
+        m_inverseQuantizerFactor = 1 / m_quantizeFactor;
+    }
+};
+
+}}}
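To make the scaling in Initialize() and Quantize() concrete, here is a small standalone sketch of the same symmetric float-to-short arithmetic on a plain std::vector; it does not use ArrayRef or the class above, and the values are illustrative:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <limits>
#include <vector>

int main()
{
    std::vector<float> input = {-2.5f, 0.0f, 1.25f, 2.5f};
    size_t extraBits = 1; // headroom so short-based BLAS accumulation overflows less easily

    // Step 1: absolute max of the values to be quantized.
    float absMax = 0;
    for (float v : input)
        absMax = std::max(absMax, std::abs(v));

    // Steps 2-3: widen the range by 2^extraBits, then scale into short's range.
    float shiftedMax = absMax * (1 << extraBits); // 5.0 here
    float quantizeFactor = std::numeric_limits<short>::max() / shiftedMax;
    float inverseFactor = 1 / quantizeFactor;

    for (float v : input)
    {
        short q = (short)std::round(v * quantizeFactor); // quantize
        float back = q * inverseFactor;                  // dequantize (approximate)
        std::printf("%+.4f -> %6d -> %+.4f\n", v, q, back);
    }
}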
@@ -261,6 +261,8 @@ DefTernaryOp(Cond, a ? b : c);
 DefTernaryOp(CopyIfEqual, a == b ? c : 0); // CopyIfEqual(a,b)(c) -- if a==b copy c, otherwise 0; used for gradient of clip, min, max, etc.
 DefTernaryOp(Clip, c < a ? a : (c > b ? b : c)); // Clip(min,max)(data) => a=min, b=max, c=data
 DefTernaryOp(ElementwiseProductWithLogSumDerivative, a * Sigmoid(c - b));
+DefTernaryOp(ElementwiseProductWithExpOfDiff, a * exp_(b - c));
+
 
 #pragma pop_macro("DefTernaryOp")
 }}}
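For context, DefTernaryOp stamps out one element-wise functor per line. CNTK's actual macro is not shown in this diff, so the following is only a guess at its shape, enough to see how the Clip expression above behaves:

#include <cstdio>

// Hypothetical reduced version of the DefTernaryOp pattern: each invocation
// defines a functor computing expr over the three operands a, b, c.
#define DefTernaryOp(name, expr)                        \
    template <class T> struct Op##name                  \
    {                                                   \
        static T Apply(T a, T b, T c) { return expr; }  \
    };

DefTernaryOp(Clip, c < a ? a : (c > b ? b : c)); // a=min, b=max, c=data

int main()
{
    // Clamp 2.5 into [0, 1]: c > b, so the result is b = 1.
    std::printf("%g\n", OpClip<double>::Apply(0.0, 1.0, 2.5)); // prints: 1
}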
@@ -14,6 +14,10 @@
 #pragma warning(push)
 #pragma warning(disable : 4251) // needs to have dll-interface to be used by clients of... caused by TensorView::m_shape which is only private. We use the same compiler everywhere.
 
+namespace Microsoft { namespace MSR { namespace CNTK { namespace Test {
+template <class ElemType> struct TensorTest;
+}}}}
+
 // This class is exported from the Math.dll.
 namespace Microsoft { namespace MSR { namespace CNTK {
 
@@ -149,6 +153,7 @@ private:
 
     const Matrix<ElemType>& GetSOB() const { return *m_sob; }
     Matrix<ElemType>& GetSOB() { return *m_sob; }
+    friend Test::TensorTest<ElemType>;
 
     // -------------------------------------------------------------------
     // sob members
@@ -46,7 +46,7 @@ CNTKTextFormatReader::CNTKTextFormatReader(MemoryProviderPtr provider,
     {
         // Verbosity is a general config parameter, not specific to the text format reader.
         int verbosity = config(L"verbosity", 0);
-        m_randomizer = make_shared<BlockRandomizer>(verbosity, window, m_deserializer);
+        m_randomizer = make_shared<BlockRandomizer>(verbosity, window, m_deserializer, true);
     }
     else
     {
@@ -100,7 +100,7 @@ CompositeDataReader::CompositeDataReader(const ConfigParameters& config, MemoryP
         size_t randomizationWindow = config(L"randomizationWindow", requestDataSize);
         // By default using STL random number generator.
         bool useLegacyRandomization = config(L"useLegacyRandomization", false);
-        m_sequenceEnumerator = std::make_shared<BlockRandomizer>(verbosity, randomizationWindow, deserializer, BlockRandomizer::DecimationMode::chunk, useLegacyRandomization, multiThreadedDeserialization);
+        m_sequenceEnumerator = std::make_shared<BlockRandomizer>(verbosity, randomizationWindow, deserializer, true /* should Prefetch */, BlockRandomizer::DecimationMode::chunk, useLegacyRandomization, multiThreadedDeserialization);
     }
     else
     {
@@ -251,7 +251,7 @@ void CompositeDataReader::StartEpoch(const EpochConfiguration& cfg)
 
     if (config.m_totalEpochSizeInSamples <= 0)
    {
-        RuntimeError("Unsupported minibatch size '%d'.", (int)config.m_totalEpochSizeInSamples);
+        RuntimeError("Unsupported epoch size '%d'.", (int)config.m_totalEpochSizeInSamples);
    }
 
    m_sequenceEnumerator->StartEpoch(config);