xiaohuliu 2016-08-08 15:23:32 -07:00
Parent commits: eccd3d0571 595c7abe72
Commit: 9db12ddf3d
286 changed files with 74533 additions and 57215 deletions

.gitattributes (vendored)

@ -6,6 +6,7 @@ Dockerfile-GPU text
*.counts text
*.labels text
*.feats text
*.ctf text
*.post text
*.cpu text
*.gpu text
@ -19,6 +20,7 @@ Dockerfile-GPU text
*.md text
*.txt text
*.TXT text
*.html text
*.lyx text
*.bib text
@ -44,6 +46,9 @@ make_binary_drop_linux text eol=lf
Tests/EndToEndTests/Examples/Speech/TIMIT/WriteBottleneck/expected_output_md5sum.*.txt eol=lf
Tests/EndToEndTests/Examples/Speech/TIMIT/WriteScaledLogLike/expected_output_md5sum.*.txt eol=lf
# Used by reader unit test, needs to keep LF line endings.
Tests/UnitTests/ReaderTests/Data/CNTKTextFormatReader/invalid_inputs.txt eol=lf
Makefile text
*.sln text
*.vcxproj text
@ -106,6 +111,10 @@ TIMIT*.statelist text
TIMIT*.tfsa text
TIMIT*.transitions text
Examples/Text/ATIS/data/ATIS.* text
Examples/SequenceToSequence/CMUDict/Data/cmudict-0.7b* text
# Binary extensions:
*.vsdm binary
*.pdf binary

.gitignore (vendored)

@ -65,6 +65,7 @@ ipch/
*.opensdf
*.sdf
*.cachefile
*.userosscache
# Visual Studio profiler
*.psess


@ -934,7 +934,7 @@ EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Data", "Data", "{D11F76CC-DB6D-4CB4-B3B7-AB139DE2F5FA}"
ProjectSection(SolutionItems) = preProject
Tests\EndToEndTests\Text\SequenceClassification\Data\embeddingmatrix.txt = Tests\EndToEndTests\Text\SequenceClassification\Data\embeddingmatrix.txt
Tests\EndToEndTests\Text\SequenceClassification\Data\Train.txt = Tests\EndToEndTests\Text\SequenceClassification\Data\Train.txt
Tests\EndToEndTests\Text\SequenceClassification\Data\Train.ctf = Tests\EndToEndTests\Text\SequenceClassification\Data\Train.ctf
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "SLU", "SLU", "{181664AC-4C95-4798-A923-09B879215B33}"
@ -1120,6 +1120,7 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTKv2LibraryDll", "Source\
{928ABD1B-4D3B-4017-AEF1-0FA1B4467513} = {928ABD1B-4D3B-4017-AEF1-0FA1B4467513}
{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
{86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A}
{F0A9637C-20DA-42F0-83D4-23B4704DE602} = {F0A9637C-20DA-42F0-83D4-23B4704DE602}
{EAD17188-072C-4726-B840-A769C36DAD1B} = {EAD17188-072C-4726-B840-A769C36DAD1B}
{DE3C54E5-D7D0-47AF-A783-DFDCE59E7937} = {DE3C54E5-D7D0-47AF-A783-DFDCE59E7937}
EndProjectSection
@ -1147,6 +1148,11 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CPPEvalClient", "Examples\E
{482999D1-B7E2-466E-9F8D-2119F93EAFD9} = {482999D1-B7E2-466E-9F8D-2119F93EAFD9}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "BrainScriptTests", "Tests\UnitTests\BrainScriptTests\BrainScriptTests.vcxproj", "{9F999212-AFC5-4EAC-AA78-F7247D46C456}"
ProjectSection(ProjectDependencies) = postProject
{86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A}
EndProjectSection
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug_CpuOnly|x64 = Debug_CpuOnly|x64
@ -1425,6 +1431,14 @@ Global
{CCC07E8E-F33A-4AF7-9F60-93E2AA61C75E}.Release_CpuOnly|x64.Build.0 = Release_CpuOnly|x64
{CCC07E8E-F33A-4AF7-9F60-93E2AA61C75E}.Release|x64.ActiveCfg = Release|x64
{CCC07E8E-F33A-4AF7-9F60-93E2AA61C75E}.Release|x64.Build.0 = Release|x64
{9F999212-AFC5-4EAC-AA78-F7247D46C456}.Debug_CpuOnly|x64.ActiveCfg = Debug_CpuOnly|x64
{9F999212-AFC5-4EAC-AA78-F7247D46C456}.Debug_CpuOnly|x64.Build.0 = Debug_CpuOnly|x64
{9F999212-AFC5-4EAC-AA78-F7247D46C456}.Debug|x64.ActiveCfg = Debug|x64
{9F999212-AFC5-4EAC-AA78-F7247D46C456}.Debug|x64.Build.0 = Debug|x64
{9F999212-AFC5-4EAC-AA78-F7247D46C456}.Release_CpuOnly|x64.ActiveCfg = Release_CpuOnly|x64
{9F999212-AFC5-4EAC-AA78-F7247D46C456}.Release_CpuOnly|x64.Build.0 = Release_CpuOnly|x64
{9F999212-AFC5-4EAC-AA78-F7247D46C456}.Release|x64.ActiveCfg = Release|x64
{9F999212-AFC5-4EAC-AA78-F7247D46C456}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
@ -1583,5 +1597,6 @@ Global
{3385EBEA-5F97-4B2B-9F30-0E6D7F91B9CA} = {47755F2E-D674-4175-9E38-8EA053455072}
{1C6E6C53-1AA7-4B69-913E-B97BB5A872CF} = {3385EBEA-5F97-4B2B-9F30-0E6D7F91B9CA}
{CCC07E8E-F33A-4AF7-9F60-93E2AA61C75E} = {3385EBEA-5F97-4B2B-9F30-0E6D7F91B9CA}
{9F999212-AFC5-4EAC-AA78-F7247D46C456} = {6F19321A-65E7-4829-B00C-3886CD6C6EDE}
EndGlobalSection
EndGlobal


@ -863,38 +863,27 @@ The dimension reduced matrix consisting of the maximum value within each pooling
This function is often associated with Convolution() operations.
### Delay
### PastValue, FutureValue
Delay node used in recurrent networks, allows creation of a loop in the convolutional network that will repeat a specified number of times.
PastValue and FutureValue nodes are used in recurrent networks; they allow creation of a loop in the computational network that will repeat a specified number of times. PastValue retrieves the value of a node from several steps in the past, while FutureValue retrieves the value of a node from the future.
`Delay(rows, [cols], delayNode, delayTime=1, needGradient=true, defaultHiddenActivity=0.1)`
`PastValue(rows, [cols], node, timeStep=1, defaultHiddenActivity=0.1)`
`FutureValue(rows, [cols], node, timeStep=1, defaultHiddenActivity=0.1)`
#### Parameters
`cvweight` – convolution weight matrix, it has the dimensions of \[outputChannels, kernelWidth \* kernelHeight \* inputChannels\]
`rows` – number of rows in the node
`kernelWidth` – width of the kernel
`cols` – number of columns in the node. This value is often omitted, since the length of a sequence varies
`kernelHeight` – height of the kernel
`timeStep` – \[default = 1\] number of time steps toward the past or future
`outputChannels` – number of output channels
`horizontalSubsample` – subsamples in the horizontal direction
`verticalSubsample` – subsamples in the vertical direction
#### Optional Parameters
`delayTime` – \[default = 1\] the amount of delay that will be introduced (number of times the loop will happen)
`needGradient` – \[default = true\] does the gradient need to be computed for this node
`defaultHiddenActivity` – \[default = 0.1\] the numerical amount for the defaultHiddenActivity
`defaultHiddenActivity` – \[default = 0.1\] default value to use when crossing the sequence boundary or when the value is missing.
#### Returns
The results of the completed Delay loop
Either the past or future value of a node
#### Notes
This node is used in recurrent networks, where a delay is introduced to examine values from a previous time, such as the prior value (t-1). This has the affect of creating a loop in the computational network that will repeat delayTime number of iterations.
This node is used in recurrent networks, where a past value is introduced to examine values from a previous time step, such as the prior value (t-1). This has the effect of creating a loop in the computational network.
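As a minimal sketch (not part of this commit), a one-step recurrence built with `PastValue` could look as follows; `W`, `R`, `x`, and `hiddenDim` are illustrative names assumed to be defined elsewhere in the network description:
```
# feed the output of the previous time step back into the current step
prevH = PastValue(hiddenDim, h, timeStep=1, defaultHiddenActivity=0.1);
h = Sigmoid(Plus(Times(W, x), Times(R, prevH)));
```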


@ -37,19 +37,22 @@ int main(int argc, char* argv[])
std::string app = argv[0];
std::string path;
IEvaluateModel<float> *model;
size_t pos;
#ifdef _WIN32
path = app.substr(0, app.rfind("\\"));
pos = app.rfind("\\");
path = (pos == std::string::npos) ? "." : app.substr(0, pos);
// This relative path assumes launching from CNTK's binary folder, e.g. x64\Release
const std::string modelWorkingDirectory = path + "/../../Examples/Image/MNIST/Data/";
#else // on Linux
path = app.substr(0, app.rfind("/"));
pos = app.rfind("/");
path = (pos == std::string::npos) ? "." : app.substr(0, pos);
// This relative path assumes launching from CNTK's binary folder, e.g. build/release/bin/
const std::string modelWorkingDirectory = path + "/../../../Examples/Image/MNIST/Data/";
#endif
GetEvalF(&model);
const std::string modelFilePath = modelWorkingDirectory + "../Output/Models/01_OneHidden";


@ -110,9 +110,15 @@ namespace Microsoft.MSR.CNTK.Extensibility.Managed.CSEvalClient
public static bool Evaluate(string record)
{
var model = Models.Take();
var outcome = model.EvaluateRecord(record);
Models.Add(model);
return outcome;
try
{
var outcome = model.EvaluateRecord(record);
return outcome;
}
finally
{
Models.Add(model);
}
}
/// <summary>
@ -123,9 +129,15 @@ namespace Microsoft.MSR.CNTK.Extensibility.Managed.CSEvalClient
public static List<float> Evaluate(List<float> inputs)
{
var model = Models.Take();
var outcome = model.EvaluateInput(inputs);
Models.Add(model);
return outcome;
try
{
var outcome = model.EvaluateInput(inputs);
return outcome;
}
finally
{
Models.Add(model);
}
}
/// <summary>


@ -1,69 +1,69 @@
'
</s>
<s/>
<s>
A
B
C
D
E
F
G
H
I
J
K
L
M
N
O
P
Q
R
S
T
U
V
W
X
Y
Z
~AA
~AE
~AH
~AO
~AW
~AY
~B
~CH
~D
~DH
~EH
~ER
~EY
~F
~G
~HH
~IH
~IY
~JH
~K
~L
~M
~N
~NG
~OW
~OY
~P
~R
~S
~SH
~T
~TH
~UH
~UW
~V
~W
~Y
~Z
~ZH
'
</s>
<s/>
<s>
A
B
C
D
E
F
G
H
I
J
K
L
M
N
O
P
Q
R
S
T
U
V
W
X
Y
Z
~AA
~AE
~AH
~AO
~AW
~AY
~B
~CH
~D
~DH
~EH
~ER
~EY
~F
~G
~HH
~IH
~IY
~JH
~K
~L
~M
~N
~NG
~OW
~OY
~P
~R
~S
~SH
~T
~TH
~UH
~UW
~V
~W
~Y
~Z
~ZH


@ -18,9 +18,9 @@ ndlMacroDefine = [
]
LSTMPComponent(inputDim, outputDim, cellDim, inputx, cellDimX2, cellDimX3, cellDimX4) = [
wx = Parameter(cellDimX4, inputDim, init="uniform", initValueScale=1);
b = Parameter(cellDimX4, 1, init="fixedValue", value=0.0);
Wh = Parameter(cellDimX4, outputDim, init="uniform", initValueScale=1);
wx = Parameter(cellDimX4, 0, init="uniform", initValueScale=1);
b = Parameter(cellDimX4, 1, init="fixedValue", value=0.0);
Wh = Parameter(cellDimX4, 0, init="uniform", initValueScale=1);
Wci = Parameter(cellDim, init="uniform", initValueScale=1);
Wcf = Parameter(cellDim, init="uniform", initValueScale=1);
@ -63,9 +63,9 @@ ndlMacroDefine = [
]
LSTMPComponentBetter(inputDim, outputDim, cellDim, inputx, cellDimX2, cellDimX3, cellDimX4) = [
wx = Parameter(cellDimX4, inputDim, init="uniform", initValueScale=1);
b = Parameter(cellDimX4, 1, init="fixedValue", value=0.0);
Wh = Parameter(cellDimX4, outputDim, init="uniform", initValueScale=1);
wx = Parameter(cellDimX4, 0, init="uniform", initValueScale=1);
b = Parameter(cellDimX4, 1, init="fixedValue", value=0.0);
Wh = Parameter(cellDimX4, 0, init="uniform", initValueScale=1);
Wci = Parameter(cellDim, init="uniform", initValueScale=1);
Wcf = Parameter(cellDim, init="uniform", initValueScale=1);
@ -112,26 +112,26 @@ ndlMacroDefine = [
]
LSTMPComponentNaive(inputDim, outputDim, cellDim, inputx) = [
Wxo = Parameter(cellDim, inputDim, init="uniform", initValueScale=1);
Wxi = Parameter(cellDim, inputDim, init="uniform", initValueScale=1);
Wxf = Parameter(cellDim, inputDim, init="uniform", initValueScale=1);
Wxc = Parameter(cellDim, inputDim, init="uniform", initValueScale=1);
Wxo = Parameter(cellDim, 0, init="uniform", initValueScale=1);
Wxi = Parameter(cellDim, 0, init="uniform", initValueScale=1);
Wxf = Parameter(cellDim, 0, init="uniform", initValueScale=1);
Wxc = Parameter(cellDim, 0, init="uniform", initValueScale=1);
bo = Parameter(cellDim, init="fixedValue", value=0.0);
bc = Parameter(cellDim, init="fixedValue", value=0.0);
bi = Parameter(cellDim, init="fixedValue", value=0.0);
bf = Parameter(cellDim, init="fixedValue", value=0.0);
Whi = Parameter(cellDim, outputDim, init="uniform", initValueScale=1);
Whi = Parameter(cellDim, 0, init="uniform", initValueScale=1);
Wci = Parameter(cellDim, init="uniform", initValueScale=1);
Whf = Parameter(cellDim, outputDim, init="uniform", initValueScale=1);
Whf = Parameter(cellDim, 0, init="uniform", initValueScale=1);
Wcf = Parameter(cellDim, init="uniform", initValueScale=1);
Who = Parameter(cellDim, outputDim, init="uniform", initValueScale=1);
Who = Parameter(cellDim, 0, init="uniform", initValueScale=1);
Wco = Parameter(cellDim, init="uniform", initValueScale=1);
Whc = Parameter(cellDim, outputDim, init="uniform", initValueScale=1);
Whc = Parameter(cellDim, 0, init="uniform", initValueScale=1);
dh = PastValue(outputDim, output, timeStep=1);
dc = PastValue(cellDim, ct, timeStep=1);
@ -194,8 +194,8 @@ ndlCreateNetwork_LSTMP_c1024_p256_x3 = [
# layer 3
LSTMoutput3 = LSTMPComponent(hiddenDim, hiddenDim, cellDim, LSTMoutput2, cellDimX2, cellDimX3, cellDimX4);
W = Parameter(labelDim, hiddenDim, init="uniform", initValueScale=1);
b = Parameter(labelDim, 1, init="fixedValue", value=0);
W = Parameter(labelDim, 0, init="uniform", initValueScale=1);
b = Parameter(labelDim, 1, init="fixedValue", value=0);
LSTMoutputW = Plus(Times(W, LSTMoutput3), b);
ce = CrossEntropyWithSoftmax(labels, LSTMoutputW);

Makefile

@ -17,8 +17,10 @@
# version for the CNTK custom MKL installation
# MKL_THREADING=parallel|sequential
# only needed if MATHLIB=mkl
# GDK_PATH= path to cuda gdk installation, so $(GDK_PATH)/include/nvidia/gdk/nvml.h exists
# defaults to /usr
# GDK_INCLUDE_PATH= path to CUDA GDK include path, so $(GDK_INCLUDE_PATH)/nvml.h exists
# defaults to /usr/include/nvidia/gdk
# GDK_NVML_LIB_PATH= path to CUDA GDK (stub) library path, so $(GDK_NVML_LIB_PATH)/libnvidia-ml.so exists
# defaults to /usr/src/gdk/nvml/lib
# MATHLIB= One of acml or mkl
# defaults to acml
# CUDA_PATH= Path to CUDA
@ -29,10 +31,12 @@
# If not specified, CNTK will be built without cuDNN.
# KALDI_PATH= Path to Kaldi
# If not specified, Kaldi plugins will not be built
# OPENCV_PATH= path to OpenCV 3.0.0 installation, so $(OPENCV_PATH) exists
# defaults to /usr/local/opencv-3.0.0
# OPENCV_PATH= path to OpenCV 3.1.0 installation, so $(OPENCV_PATH) exists
# defaults to /usr/local/opencv-3.1.0
# LIBZIP_PATH= path to libzip installation, so $(LIBZIP_PATH) exists
# defaults to /usr/local/
# BOOST_PATH= path to Boost installation, so $(BOOST_PATH)/include/boost/test/unit_test.hpp exists
# defaults to /usr/local/boost-1.60.0
# These can be overridden on the command line, e.g. make BUILDTYPE=debug
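# For example, a build overriding the GDK and Boost locations could be invoked as
# follows (a sketch, not part of this commit; the paths shown are simply the
# defaults listed above and should be adjusted to the local installation):
#   make BUILDTYPE=release GDK_INCLUDE_PATH=/usr/include/nvidia/gdk \
#        GDK_NVML_LIB_PATH=/usr/src/gdk/nvml/lib BOOST_PATH=/usr/local/boost-1.60.0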
# TODO: Build static libraries for common dependencies that are shared by multiple
@ -71,7 +75,7 @@ INCLUDEPATH:= $(addprefix $(SOURCEDIR)/, Common/Include CNTKv2LibraryDll CNTKv2L
# COMMON_FLAGS include settings that are passed both to NVCC and C++ compilers.
COMMON_FLAGS:= -D_POSIX_SOURCE -D_XOPEN_SOURCE=600 -D__USE_XOPEN2K -std=c++11
CPPFLAGS:=
CXXFLAGS:= -msse3 -mssse3 -std=c++0x -fopenmp -fpermissive -fPIC -Werror -fcheck-new
CXXFLAGS:= -msse4.1 -mssse3 -std=c++0x -fopenmp -fpermissive -fPIC -Werror -fcheck-new
LIBPATH:=
LIBS:=
LDFLAGS:=
@ -93,9 +97,14 @@ all : buildall
CUFLAGS = -m 64
ifdef CUDA_PATH
ifndef GDK_PATH
$(info defaulting GDK_PATH to /usr)
GDK_PATH=/usr
ifndef GDK_INCLUDE_PATH
GDK_INCLUDE_PATH=/usr/include/nvidia/gdk
$(info defaulting GDK_INCLUDE_PATH to $(GDK_INCLUDE_PATH))
endif
ifndef GDK_NVML_LIB_PATH
GDK_NVML_LIB_PATH=/usr/src/gdk/nvml/lib
$(info defaulting GDK_NVML_LIB_PATH to $(GDK_NVML_LIB_PATH))
endif
ifndef CUB_PATH
@ -107,10 +116,8 @@ ifdef CUDA_PATH
NVCC = $(CUDA_PATH)/bin/nvcc
# This is a suggested/default location for NVML
INCLUDEPATH+=$(GDK_PATH)/include/nvidia/gdk
INCLUDEPATH+=$(GDK_INCLUDE_PATH)
INCLUDEPATH+=$(CUB_PATH)
NVMLLIBPATH=$(GDK_PATH)/src/gdk/nvml/lib
# Set up CUDA includes and libraries
INCLUDEPATH += $(CUDA_PATH)/include
@ -328,7 +335,7 @@ $(CNTKMATH_LIB): $(MATH_OBJ)
@echo $(SEPARATOR)
@echo creating $@ for $(ARCH) with build type $(BUILDTYPE)
@mkdir -p $(dir $@)
$(CXX) $(LDFLAGS) -shared $(patsubst %,-L%, $(LIBPATH) $(NVMLLIBPATH)) $(patsubst %,$(RPATH)%, $(ORIGINDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -fopenmp
$(CXX) $(LDFLAGS) -shared $(patsubst %,-L%, $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -fopenmp
########################################
# CNTKLibrary
@ -368,13 +375,17 @@ SEQUENCE_TRAINING_LIB_SRC +=\
endif
CNTKLIBRARY_SRC =\
$(SOURCEDIR)/CNTKv2LibraryDll/BackCompat.cpp \
$(SOURCEDIR)/CNTKv2LibraryDll/Common.cpp \
$(SOURCEDIR)/CNTKv2LibraryDll/Function.cpp \
$(SOURCEDIR)/CNTKv2LibraryDll/MinibatchSource.cpp \
$(SOURCEDIR)/CNTKv2LibraryDll/NDArrayView.cpp \
$(SOURCEDIR)/CNTKv2LibraryDll/NDMask.cpp \
$(SOURCEDIR)/CNTKv2LibraryDll/Trainer.cpp \
$(SOURCEDIR)/CNTKv2LibraryDll/Utils.cpp \
$(SOURCEDIR)/CNTKv2LibraryDll/Value.cpp \
$(SOURCEDIR)/CNTKv2LibraryDll/Variable.cpp \
$(SOURCEDIR)/CNTKv2LibraryDll/Learner.cpp \
CNTKLIBRARY_SRC+=$(CNTK_COMMON_SRC)
CNTKLIBRARY_SRC+=$(COMPUTATION_NETWORK_LIB_SRC)
@ -393,7 +404,7 @@ $(CNTKLIBRARY_LIB): $(CNTKLIBRARY_OBJ) | $(CNTKMATH_LIB)
@echo $(SEPARATOR)
@mkdir -p $(dir $@)
@echo building output for $(ARCH) with build type $(BUILDTYPE)
$(CXX) $(LDFLAGS) -shared $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(NVMLLIBPATH)) $(patsubst %,$(RPATH)%, $(ORIGINDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKMATH)
$(CXX) $(LDFLAGS) -shared $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKMATH)
########################################
# CNTKLibrary tests
@ -405,6 +416,8 @@ CNTKLIBRARY_TESTS_SRC =\
Tests/UnitTests/V2LibraryTests/NDArrayViewTests.cpp \
Tests/UnitTests/V2LibraryTests/RecurrentFunctionTests.cpp \
Tests/UnitTests/V2LibraryTests/TensorTests.cpp \
Tests/UnitTests/V2LibraryTests/TrainerTests.cpp \
Tests/UnitTests/V2LibraryTests/CifarResNet.cpp \
CNTKLIBRARY_TESTS:=$(BINDIR)/v2librarytests
CNTKLIBRARY_TESTS_OBJ := $(patsubst %.cu, $(OBJDIR)/%.o, $(patsubst %.cpp, $(OBJDIR)/%.o, $(CNTKLIBRARY_TESTS_SRC)))
@ -416,7 +429,7 @@ $(CNTKLIBRARY_TESTS): $(CNTKLIBRARY_TESTS_OBJ) | $(CNTKLIBRARY_LIB)
@echo $(SEPARATOR)
@mkdir -p $(dir $@)
@echo building output for $(ARCH) with build type $(BUILDTYPE)
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(NVMLLIBPATH)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKLIBRARY) -l$(CNTKMATH)
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKLIBRARY) -l$(CNTKMATH)
########################################
# LibEval
@ -437,7 +450,7 @@ EVAL_SRC=\
$(SOURCEDIR)/ActionsLib/NetworkFactory.cpp \
$(SOURCEDIR)/ActionsLib/NetworkDescriptionLanguage.cpp \
$(SOURCEDIR)/ActionsLib/SimpleNetworkBuilder.cpp \
$(SOURCEDIR)/ActionsLib/NDLNetworkBuilder.cpp
$(SOURCEDIR)/ActionsLib/NDLNetworkBuilder.cpp \
EVAL_SRC+=$(SGDLIB_SRC)
EVAL_SRC+=$(COMPUTATION_NETWORK_LIB_SRC)
@ -450,11 +463,11 @@ EVAL_LIB:=$(LIBDIR)/lib$(EVAL).so
ALL+=$(EVAL_LIB)
SRC+=$(EVAL_SRC)
$(EVAL_LIB): $(EVAL_OBJ)
$(EVAL_LIB): $(EVAL_OBJ) | $(CNTKMATH_LIB)
@echo $(SEPARATOR)
@mkdir -p $(dir $@)
@echo Building $(EVAL_LIB) for $(ARCH) with build type $(BUILDTYPE)
$(CXX) $(LDFLAGS) -shared $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(NVMLLIBPATH)) $(patsubst %,$(RPATH)%, $(ORIGINDIR) $(LIBPATH)) -o $@ $^ $(LIBS)
$(CXX) $(LDFLAGS) -shared $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKMATH)
########################################
# Eval Sample client
@ -469,11 +482,11 @@ EVAL_SAMPLE_CLIENT_OBJ:=$(patsubst %.cpp, $(OBJDIR)/%.o, $(EVAL_SAMPLE_CLIENT_SR
ALL+=$(EVAL_SAMPLE_CLIENT)
SRC+=$(EVAL_SAMPLE_CLIENT_SRC)
$(EVAL_SAMPLE_CLIENT): $(EVAL_SAMPLE_CLIENT_OBJ) | $(EVAL_LIB) $(CNTKMATH_LIB)
$(EVAL_SAMPLE_CLIENT): $(EVAL_SAMPLE_CLIENT_OBJ) | $(EVAL_LIB)
@echo $(SEPARATOR)
@mkdir -p $(dir $@)
@echo building $(EVAL_SAMPLE_CLIENT) for $(ARCH) with build type $(BUILDTYPE)
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ -l$(EVAL) -l$(CNTKMATH)
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(EVAL) -l$(CNTKMATH)
########################################
# BinaryReader plugin
@ -770,7 +783,6 @@ CNTK_SRC =\
$(SOURCEDIR)/ActionsLib/NDLNetworkBuilder.cpp \
$(SOURCEDIR)/CNTK/BrainScript/BrainScriptEvaluator.cpp \
$(SOURCEDIR)/CNTK/BrainScript/BrainScriptParser.cpp \
$(SOURCEDIR)/CNTK/BrainScript/BrainScriptTest.cpp \
CNTK_SRC+=$(SGDLIB_SRC)
CNTK_SRC+=$(CNTK_COMMON_SRC)
@ -787,7 +799,7 @@ $(CNTK): $(CNTK_OBJ) | $(CNTKMATH_LIB)
@echo $(SEPARATOR)
@mkdir -p $(dir $@)
@echo building output for $(ARCH) with build type $(BUILDTYPE)
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(NVMLLIBPATH)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKMATH) -fopenmp
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKMATH) -fopenmp
# deployable resources: standard library of BS
CNTK_CORE_BS:=$(BINDIR)/cntk.core.bs
@ -797,6 +809,151 @@ $(CNTK_CORE_BS): $(SOURCEDIR)/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs
@echo bin-placing deployable resource files
cp -f $^ $@
########################################
# Unit Tests
########################################
# only build unit tests when Boost is available
ifdef BOOST_PATH
INCLUDEPATH += $(BOOST_PATH)/include
BOOSTLIB_PATH = $(BOOST_PATH)/lib
BOOSTLIBS := -lboost_unit_test_framework -lboost_filesystem -lboost_system
UNITTEST_EVAL_SRC = \
$(SOURCEDIR)/../Tests/UnitTests/EvalTests/EvalExtendedTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/EvalTests/stdafx.cpp
UNITTEST_EVAL_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(UNITTEST_EVAL_SRC))
UNITTEST_EVAL := $(BINDIR)/evaltests
ALL += $(UNITTEST_EVAL)
SRC += $(UNITTEST_EVAL_SRC)
$(UNITTEST_EVAL) : $(UNITTEST_EVAL_OBJ) | $(EVAL_LIB) $(CNTKMATH_LIB)
@echo $(SEPARATOR)
@mkdir -p $(dir $@)
@echo building $@ for $(ARCH) with build type $(BUILDTYPE)
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH) $(BOOSTLIB_PATH)) $(patsubst %, $(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH) $(BOOSTLIB_PATH)) -o $@ $^ $(BOOSTLIBS) $(LIBS) -l$(EVAL) -l$(CNTKMATH)
#TODO: create project specific makefile or rules to avoid adding project specific path to the global path
INCLUDEPATH += $(SOURCEDIR)/Readers/CNTKTextFormatReader
UNITTEST_READER_SRC = \
$(SOURCEDIR)/../Tests/UnitTests/ReaderTests/CNTKTextFormatReaderTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/ReaderTests/HTKLMFReaderTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/ReaderTests/ImageReaderTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/ReaderTests/ReaderLibTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/ReaderTests/UCIFastReaderTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/ReaderTests/stdafx.cpp \
$(SOURCEDIR)/Readers/CNTKTextFormatReader/Indexer.cpp \
$(SOURCEDIR)/Readers/CNTKTextFormatReader/TextParser.cpp \
UNITTEST_READER_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(UNITTEST_READER_SRC))
UNITTEST_READER := $(BINDIR)/readertests
ALL += $(UNITTEST_READER)
SRC += $(UNITTEST_READER_SRC)
$(UNITTEST_READER): $(UNITTEST_READER_OBJ) | $(HTKMLFREADER) $(HTKDESERIALIZERS) $(UCIFASTREADER) $(COMPOSITEDATAREADER) $(IMAGEREADER) $(CNTKMATH_LIB)
@echo $(SEPARATOR)
@mkdir -p $(dir $@)
@echo building $@ for $(ARCH) with build type $(BUILDTYPE)
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(BOOSTLIB_PATH)) $(patsubst %, $(RPATH)%, $(ORIGINLIBDIR) $(BOOSTLIB_PATH)) -o $@ $^ $(BOOSTLIBS) -l$(CNTKMATH) -ldl
UNITTEST_NETWORK_SRC = \
$(SOURCEDIR)/../Tests/UnitTests/NetworkTests/OperatorEvaluation.cpp \
$(SOURCEDIR)/../Tests/UnitTests/NetworkTests/stdafx.cpp \
$(SOURCEDIR)/CNTK/ModelEditLanguage.cpp \
$(SOURCEDIR)/ActionsLib/TrainActions.cpp \
$(SOURCEDIR)/ActionsLib/EvalActions.cpp \
$(SOURCEDIR)/ActionsLib/OtherActions.cpp \
$(SOURCEDIR)/ActionsLib/SpecialPurposeActions.cpp \
$(SOURCEDIR)/ActionsLib/NetworkFactory.cpp \
$(SOURCEDIR)/ActionsLib/NetworkDescriptionLanguage.cpp \
$(SOURCEDIR)/ActionsLib/SimpleNetworkBuilder.cpp \
$(SOURCEDIR)/ActionsLib/NDLNetworkBuilder.cpp \
$(SOURCEDIR)/CNTK/BrainScript/BrainScriptEvaluator.cpp \
$(SOURCEDIR)/CNTK/BrainScript/BrainScriptParser.cpp \
UNITTEST_NETWORK_SRC += $(COMPUTATION_NETWORK_LIB_SRC)
UNITTEST_NETWORK_SRC += $(CNTK_COMMON_SRC)
UNITTEST_NETWORK_SRC += $(SEQUENCE_TRAINING_LIB_SRC)
UNITTEST_NETWORK_SRC += $(SGDLIB_SRC)
UNITTEST_NETWORK_OBJ := $(patsubst %.cu, $(OBJDIR)/%.o, $(patsubst %.cpp, $(OBJDIR)/%.o, $(UNITTEST_NETWORK_SRC)))
UNITTEST_NETWORK := $(BINDIR)/networktests
ALL += $(UNITTEST_NETWORK)
SRC += $(UNITTEST_NETWORK_SRC)
$(UNITTEST_NETWORK): $(UNITTEST_NETWORK_OBJ) | $(CNTKMATH_LIB) $(CNTKTEXTFORMATREADER)
@echo $(SEPARATOR)
@mkdir -p $(dir $@)
@echo building $@ for $(ARCH) with build type $(BUILDTYPE)
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH) $(BOOSTLIB_PATH)) $(patsubst %, $(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH) $(BOOSTLIB_PATH)) -o $@ $^ $(BOOSTLIBS) $(LIBS) -l$(CNTKMATH) -fopenmp
UNITTEST_MATH_SRC = \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/BatchNormalizationEngineTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/BlockMultiplierTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/constants.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/ConvolutionEngineTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/CPUMatrixTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/CPUSparseMatrixTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/fixtures.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/GPUMatrixCudaBlasTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/GPUMatrixTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/GPUSparseMatrixTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/MatrixBlasTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/MatrixDataSynchronizationTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/MatrixFileWriteReadTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/MatrixQuantizerTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/MatrixSparseDenseInteractionsTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/MatrixTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/stdafx.cpp \
UNITTEST_MATH_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(UNITTEST_MATH_SRC))
UNITTEST_MATH := $(BINDIR)/mathtests
ALL += $(UNITTEST_MATH)
SRC += $(UNITTEST_MATH_SRC)
$(UNITTEST_MATH): $(UNITTEST_MATH_OBJ) | $(CNTKMATH_LIB)
@echo $(SEPARATOR)
@mkdir -p $(dir $@)
@echo building $@ for $(ARCH) with build type $(BUILDTYPE)
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH) $(BOOSTLIB_PATH)) $(patsubst %, $(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH) $(BOOSTLIB_PATH)) -o $@ $^ $(BOOSTLIBS) $(LIBS) -l$(CNTKMATH) -ldl -fopenmp
UNITTEST_BRAINSCRIPT_SRC = \
$(SOURCEDIR)/CNTK/BrainScript/BrainScriptEvaluator.cpp \
$(SOURCEDIR)/CNTK/BrainScript/BrainScriptParser.cpp \
$(SOURCEDIR)/../Tests/UnitTests/BrainScriptTests/ParserTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/BrainScriptTests/stdafx.cpp
UNITTEST_BRAINSCRIPT_SRC+=$(COMMON_SRC)
UNITTEST_BRAINSCRIPT_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(UNITTEST_BRAINSCRIPT_SRC))
UNITTEST_BRAINSCRIPT := $(BINDIR)/brainscripttests
ALL += $(UNITTEST_BRAINSCRIPT)
SRC += $(UNITTEST_BRAINSCRIPT_SRC)
$(UNITTEST_BRAINSCRIPT): $(UNITTEST_BRAINSCRIPT_OBJ)
@echo $(SEPARATOR)
@mkdir -p $(dir $@)
@echo building $@ for $(ARCH) with build type $(BUILDTYPE)
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH) $(BOOSTLIB_PATH)) $(patsubst %, $(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH) $(BOOSTLIB_PATH)) -o $@ $^ $(BOOSTLIBS) $(LIBS) -ldl
unittests: $(UNITTEST_EVAL) $(UNITTEST_READER) $(UNITTEST_NETWORK) $(UNITTEST_MATH) $(UNITTEST_BRAINSCRIPT)
endif
########################################
# General compile and dependency rules
########################################
@ -821,13 +978,13 @@ $(OBJDIR)/%.o : %.cu $(BUILD_CONFIGURATION)
@mkdir -p $(dir $@)
$(NVCC) -c $< -o $@ $(COMMON_FLAGS) $(CUFLAGS) $(INCLUDEPATH:%=-I%) -Xcompiler "-fPIC -Werror"
$(OBJDIR)/%.o : %.cpp $(BUILD_CONFIGURATION)
$(OBJDIR)/%.o : %.cpp $(BUILD_CONFIGURATION)
@echo $(SEPARATOR)
@echo creating $@ for $(ARCH) with build type $(BUILDTYPE)
@mkdir -p $(dir $@)
$(CXX) -c $< -o $@ $(COMMON_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(INCLUDEPATH:%=-I%) -MD -MP -MF ${@:.o=.d}
.PHONY: clean buildall all
.PHONY: clean buildall all unittests
clean:
@echo $(SEPARATOR)


@ -1,6 +1,11 @@
# CNTK
## Latest news
*2016-07-15.* V 1.6 Binary release
CNTK v.1.6 binaries are on the [CNTK Releases page](https://github.com/Microsoft/CNTK/releases)
*2016-07-12.* We have further expanded the licensing options for CNTK 1bit-SGD and related components. See the details on the [Wiki page](https://github.com/microsoft/cntk/wiki/CNTK-1bit-SGD-License). These new options are an extension of the CNTK 1bit-SGD License that we announced on Jun 23, 2016.
*2016-07-05.* CNTK now supports *Deconvolution* and *Unpooling*. See the usage example in Network number 4 in the [MNIST Sample](https://github.com/Microsoft/CNTK/blob/master/Examples/Image/MNIST/README.md).
*2016-06-23.* New License Terms for CNTK 1bit-SGD and related components.
@ -8,12 +13,6 @@ Effective immediately the License Terms for CNTK 1bit-SGD and related components
*2016-06-20.* A [post](http://itpeernetwork.intel.com/accelerating-the-computational-network-tool-kit-with-intel-mkl/) on Intel MKL and CNTK is published in the [Intel IT Peer Network](http://itpeernetwork.intel.com/accelerating-the-computational-network-tool-kit-with-intel-mkl/)
*2016-06-16.* V 1.5 Binary release. NuGet Package with CNTK Model Evaluation Libraries.
NuGet Package is added to CNTK v.1.5 binaries. See [CNTK Releases page](https://github.com/Microsoft/CNTK/releases) and [NuGet Package description](https://github.com/Microsoft/CNTK/wiki/Nuget-Package-for-Evaluation).
*2016-06-15.* CNTK now supports building against a custom Intel® Math Kernel Library (MKL).
See [setup instructions](https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-your-machine) on how to set this up for your platform.
See [all news](https://github.com/Microsoft/CNTK/wiki/News).
## What is CNTK

Scripts/README.md (new file)

@ -0,0 +1,24 @@
This directory contains scripts that help with using different components of CNTK.
### CNTK Text format Converters
Two Python scripts for converting data to the CNTK Text format for use as input to the CNTK Text Format Reader (see https://github.com/microsoft/cnTK/wiki/CNTKTextFormat-Reader).
```
txt2ctf.py
```
Converts a set of dictionary files and a plain text file to CNTK Text format. Run ```python txt2ctf.py -h``` to see usage instructions. See the comments in the beginning of the script file for the specific usage example.
```
uci2ctf.py
```
Converts data stored in a UCI-format text file to the CNTK Text format. Run ```python uci2ctf.py -h``` to see usage instructions, or see the usage example below:
```
python Scripts/uci2ctf.py --input_file Examples/Image/MNIST/Data/Train-28x28.txt --features_start 1 --features_dim 784 --labels_start 0 --labels_dim 1 --num_labels 10 --output_file Examples/Image/MNIST/Data/Train-28x28_cntk_text.txt
```
```input_file``` – original dataset in the (columnar) UCI format
```features_start``` – index of the first feature column (start parameter in the UCIFastReader config, see https://github.com/Microsoft/CNTK/wiki/UCI-Fast-Reader)
```features_dim``` – number of feature columns (dim parameter in the UCIFastReader config)
```labels_start``` - index of the first label column
```labels_dim``` – number of label columns
```num_labels``` – number of possible label values (labelDim parameter in the UCIFastReader config)
```output_file``` – path and filename of the resulting dataset.

@ -1 +1 @@
Subproject commit c9821dd5565d4654841eaba819b655c9db2fe85b
Subproject commit f7afb8c6a08a6652d84de1b62377175788be5284


@ -149,11 +149,11 @@ void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wst
int forcedRandomSeed = node->GetOptionalParameter("randomSeed", "-1" /*disabled*/);
if (EqualCI(initString, L"fixedValue"))
nodePtr->Value().SetValue(value);
m_net->InitLearnableParameters(nodePtr, L"fixedValue", value);
else if (EqualCI(initString, L"uniform"))
m_net->InitLearnableParameters(nodePtr, true, forcedRandomSeed < 0 ? randomSeed++ : (unsigned long) forcedRandomSeed, initValueScale, initOnCPUOnly);
m_net->InitLearnableParameters(nodePtr, L"uniform", initValueScale, forcedRandomSeed < 0 ? randomSeed++ : (unsigned long)forcedRandomSeed, initOnCPUOnly);
else if (EqualCI(initString, L"gaussian"))
m_net->InitLearnableParameters(nodePtr, false, forcedRandomSeed < 0 ? randomSeed++ : (unsigned long) forcedRandomSeed, initValueScale, initOnCPUOnly);
m_net->InitLearnableParameters(nodePtr, L"gaussian", initValueScale, forcedRandomSeed < 0 ? randomSeed++ : (unsigned long)forcedRandomSeed, initOnCPUOnly);
else if (EqualCI(initString, L"fromFile"))
{
std::string initFromFilePath = node->GetOptionalParameter("initFromFilePath", "");
@ -167,7 +167,7 @@ void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wst
dynamic_pointer_cast<LearnableParameter<ElemType>>(nodePtr)->InitFromFile(msra::strfun::utf16(initFromFilePath));
}
else
RuntimeError("'init' must be one of the values of [ uniform | gaussian | fixedValue ]");
RuntimeError("'init' must be one of the values of [ uniform | gaussian | fixedValue | fromFile ]");
}
}
else if (cnNodeType == L"Constant")
@ -186,7 +186,7 @@ void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wst
else if (pass == ndlPassFinal || nodePtr->Value().GetNumElements() != 0)
{
ElemType val = parameter[0]->GetScalar();
nodePtr->Value().SetValue(val);
m_net->InitLearnableParameters(nodePtr, L"fixedValue", val);
}
}
else if (cnNodeType == L"RowSlice") // Note: This now maps onto SliceNode which specifies the end differently.
@ -304,7 +304,7 @@ void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wst
"1. 2D convolution which takes 7 fixed parameters [weightNodeName, inputValueNodeName, kernelWidth, kernelHeight, outputChannels, horizontalSubsample, verticalSubsample] \n"
"and two optional parameters [zeroPadding = [false|yourvalue], maxTempMemSizeInSamples = [0|yourvalue], imageLayout = \"HWC\"|\"cudnn\"]. \n"
"2. ND convolution which takes 3 fixed parameters [weightNodeName, inputValueNodeName, kernelShape] and \n"
"10 optional parameters [mapCount = [1|yourvalue], stride = [1|yourvalue], sharing = [true|yourvalue], autoPadding = [true|yourvalue], lowerPad = [0|yourvalue], upperPad = [0|yourvalue], bool transpose = [false|yourvalue], maxTempMemSizeInSamples = [0|yourvalue], imageLayout = \"cudnn\"|\"HWC\"]. \n"
"10 optional parameters [mapCount = [0|yourvalue], stride = [1|yourvalue], sharing = [true|yourvalue], autoPadding = [true|yourvalue], lowerPad = [0|yourvalue], upperPad = [0|yourvalue], bool transpose = [false|yourvalue], maxTempMemSizeInSamples = [0|yourvalue], imageLayout = \"cudnn\"|\"HWC\"]. \n"
"For ND convolution, parameters kernelShape, mapCount, stride, sharing, autoPadding, lowerPad, upperPad can be arrays, e.g. kernelShape={5, 5, 3}",
cnNodeType.c_str(), cnNodeType.c_str());
}
@ -380,7 +380,7 @@ void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wst
};
auto kernelShape = paramGetter(reqParams.size() - 1);
auto mapCount = paramResolver("mapCount", 1);
auto mapCount = paramResolver("mapCount", 0);
auto stride = paramResolver("stride", 1);
auto sharing = boolParamResolver("sharing", true);
auto autoPad = boolParamResolver("autoPadding", true);


@ -158,12 +158,12 @@ bool CheckFunction(std::string& p_nodeType, bool* allowUndeterminedVariable)
else if (EqualInsensitive(nodeType, OperationNameOf(CRFNode), L"CRF")) ret = true;
#endif
else if (EqualInsensitive(nodeType, OperationNameOf(ClassBasedCrossEntropyWithSoftmaxNode), L"CBCEWithSM")) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(EqualNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(GreaterEqualNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(GreaterNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(LessEqualNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(LessNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(NotEqualNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(EqualNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(GreaterEqualNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(GreaterNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(LessEqualNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(LessNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(NotEqualNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(ClipNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(ConvolutionNode), L"Convolve")) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(PoolingNode))) ret = true;
@ -263,4 +263,5 @@ template class NDLNode<double>;
template class NDLScript<float>;
template class NDLScript<double>;
} } }
}}}


@ -95,8 +95,9 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildFFDNNFromDescription(
if (numHiddenLayers > 0)
{
w = builder.CreateLearnableParameter(L"W0", m_layerSizes[1], m_layerSizes[0]);
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
b = builder.CreateLearnableParameter(L"B0", m_layerSizes[1], 1);
m_net->InitLearnableParameters(b, L"fixedValue", 0);
output = ApplyNonlinearFunction(builder.Plus(builder.Times(w, input, 1, L"W0*features"), b, L"W0*features+B0"), 0, L"H1");
if (m_addDropoutNodes)
@ -114,8 +115,9 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildFFDNNFromDescription(
wstring nameOfH = msra::strfun::wstrprintf(L"H%d", i + 1);
w = builder.CreateLearnableParameter(nameOfW, m_layerSizes[i + 1], m_layerSizes[i]);
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
b = builder.CreateLearnableParameter(nameOfB, m_layerSizes[i + 1], 1);
m_net->InitLearnableParameters(b, L"fixedValue", 0);
output = ApplyNonlinearFunction(builder.Plus(builder.Times(w, input, 1, nameOfTimes), b, nameOfPlus), i, nameOfH);
if (m_addDropoutNodes)
@ -132,8 +134,9 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildFFDNNFromDescription(
wstring nameOfPlus = nameOfTimes + L"+" + nameOfB;
w = builder.CreateLearnableParameter(nameOfW, m_layerSizes[numHiddenLayers + 1], m_layerSizes[numHiddenLayers]);
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
b = builder.CreateLearnableParameter(nameOfB, m_layerSizes[numHiddenLayers + 1], 1);
m_net->InitLearnableParameters(b, L"fixedValue", 0);
output = builder.Plus(builder.Times(w, input, 1, nameOfTimes), b, nameOfPlus);
m_net->RenameNode(output, L"HLast");
@ -198,12 +201,12 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildRNNFromDescription()
{
// TODO: to figure out sparse matrix size
u = builder.CreateLearnableParameter(L"U0", m_layerSizes[1], m_layerSizes[0]);
m_net->InitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
if (m_recurrentLayers.size() > 0 && m_recurrentLayers[recur_idx] == 1)
{
w = builder.CreateLearnableParameter(L"W0", m_layerSizes[1], m_layerSizes[1]);
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[1], 1);
// unless there is a good algorithm to detect loops, use this explicit setup
@ -230,12 +233,12 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildRNNFromDescription()
{
// TODO: to figure out sparse matrix size
u = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"U%d", i), m_layerSizes[i + 1], m_layerSizes[i]);
m_net->InitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
if (m_recurrentLayers.size() > 0 && m_recurrentLayers[recur_idx] == i + 1)
{
w = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"W%d", i), m_layerSizes[i + 1], m_layerSizes[i + 1]);
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, (size_t) m_layerSizes[i + 1], 1);
// unless there is a good algorithm to detect loops, use this explicit setup
@ -259,7 +262,7 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildRNNFromDescription()
}
w = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"W%d", numHiddenLayers), m_layerSizes[numHiddenLayers + 1], m_layerSizes[numHiddenLayers]);
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
/*m_net->MatrixL2Reg(w , L"L1w");*/
label = builder.CreateInputNode(L"labels", m_layerSizes[numHiddenLayers + 1]);
@ -311,12 +314,12 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildClassEntropyRNNFromDe
if (numHiddenLayers > 0)
{
u = builder.CreateLearnableParameter(L"U0", m_layerSizes[1], m_layerSizes[0]);
m_net->InitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
if (m_recurrentLayers.size() > 0 && m_recurrentLayers[recur_idx] == 1)
{
w = builder.CreateLearnableParameter(L"W0", m_layerSizes[1], m_layerSizes[1]);
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[1], 1);
// unless there is a good algorithm to detect loops, use this explicit setup
@ -330,7 +333,7 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildClassEntropyRNNFromDe
else
{
b = builder.CreateLearnableParameter(L"B0", m_layerSizes[1], 1);
m_net->InitLearnableParameters(b, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(b, m_uniformInit, randomSeed++, m_initValueScale);
output = SimpleNetworkBuilder<ElemType>::ApplyNonlinearFunction(builder.Plus(builder.Times(u, input), b), 0);
}
@ -342,11 +345,11 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildClassEntropyRNNFromDe
for (int i = 1; i < numHiddenLayers; i++)
{
u = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"U%d", i), m_layerSizes[i + 1], m_layerSizes[i]);
m_net->InitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
if (m_recurrentLayers.size() > 0 && m_recurrentLayers[recur_idx] == i + 1)
{
w = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"W%d", i), m_layerSizes[i + 1], m_layerSizes[i + 1]);
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, (size_t) m_layerSizes[i + 1], 1);
// unless there is a good algorithm to detect loops, use this explicit setup
@ -373,13 +376,13 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildClassEntropyRNNFromDe
// e.g., [200 x 10000], where 10000 is the vocabulary size
// this is for speed-up issue as per word matrix can be simply obtained using column slice
w = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"W%d", numHiddenLayers), m_layerSizes[numHiddenLayers], m_layerSizes[numHiddenLayers + 1]);
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
// the label is a dense matrix. each element is the word index
label = builder.CreateInputNode(L"labels", 4);
clsweight = builder.CreateLearnableParameter(L"WeightForClassPostProb", m_nbrCls, m_layerSizes[numHiddenLayers]);
m_net->InitLearnableParameters(clsweight, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(clsweight, m_uniformInit, randomSeed++, m_initValueScale);
clslogpostprob = builder.Times(clsweight, input, 1, L"ClassPostProb");
output = AddTrainAndEvalCriterionNodes(input, label, w, L"TrainNodeClassBasedCrossEntropy", L"EvalNodeClassBasedCrossEntrpy",
@ -428,7 +431,7 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildConditionalLSTMNetwor
if (m_lookupTableOrder > 0)
{
e = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"E%d", 0), m_layerSizes[1], m_layerSizes[0] / m_lookupTableOrder);
m_net->InitLearnableParameters(e, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(e, m_uniformInit, randomSeed++, m_initValueScale);
output = builder.LookupTable(e, input, L"LookupTable");
if (m_addDropoutNodes)
@ -464,9 +467,8 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildConditionalLSTMNetwor
// serve as a global bias term
gt = builder.CreateInputNode(L"binaryFeature", m_auxFeatDim);
m_net->AddToNodeGroup(L"feature", gt);
e = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"AuxTrans%d", 0),
m_layerSizes[numHiddenLayers], m_auxFeatDim);
m_net->InitLearnableParameters(e, m_uniformInit, randomSeed++, m_initValueScale);
e = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"AuxTrans%d", 0), m_layerSizes[numHiddenLayers], m_auxFeatDim);
m_net->RandomInitLearnableParameters(e, m_uniformInit, randomSeed++, m_initValueScale);
u = ApplyNonlinearFunction(builder.Times(e, gt), numHiddenLayers, L"TimesToGetGlobalBias");
output = builder.Plus(input, u, L"PlusGlobalBias");
input = output;
@ -475,13 +477,13 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildConditionalLSTMNetwor
// e.g., [200 x 10000], where 10000 is the vocabulary size
// this is for speed-up issue as per word matrix can be simply obtained using column slice
w = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"W%d", numHiddenLayers), m_layerSizes[numHiddenLayers], m_layerSizes[numHiddenLayers + 1]);
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
// the label is a dense matrix. each element is the word index
label = builder.CreateInputNode(L"labels", 4);
clsweight = builder.CreateLearnableParameter(L"WeightForClassPostProb", m_nbrCls, m_layerSizes[numHiddenLayers]);
m_net->InitLearnableParameters(clsweight, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(clsweight, m_uniformInit, randomSeed++, m_initValueScale);
clslogpostprob = builder.Times(clsweight, input, 1, L"ClassPostProb");
output = AddTrainAndEvalCriterionNodes(input, label, w, L"TrainNodeClassBasedCrossEntropy", L"EvalNodeClassBasedCrossEntrpy",
@ -535,7 +537,7 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildLogBilinearNetworkFro
if (m_lookupTableOrder > 0)
{
e = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"E%d", 0), m_layerSizes[1], m_layerSizes[0] / m_lookupTableOrder);
m_net->InitLearnableParameters(e, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(e, m_uniformInit, randomSeed++, m_initValueScale);
output = builder.LookupTable(e, input, L"Lookuptatble");
if (m_addDropoutNodes)
@ -556,7 +558,7 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildLogBilinearNetworkFro
pastValueXI->AttachInputs({ input });
// TODO: to figure out sparse matrix size
Wxi = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"DD%d", ik), m_layerSizes[0], m_layerSizes[0]);
m_net->InitLearnableParameters(Wxi, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(Wxi, m_uniformInit, randomSeed++, m_initValueScale);
it = builder.Plus(output, builder.Times(Wxi, pastValueXI));
output = it;
@ -572,13 +574,13 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildLogBilinearNetworkFro
for (int i = m_lookupTableOrder > 0 ? 1 : 0; i < numHiddenLayers; i++)
{
u = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"U%d", i), m_layerSizes[i + 1], m_layerSizes[i] * (m_lookupTableOrder > 0 ? m_lookupTableOrder : 1));
m_net->InitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
output = builder.Times(u, input);
input = output;
if (m_recurrentLayers.size() > 0 && m_recurrentLayers[recur_idx] == i + 1)
{
w = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"R%d", i + 1), m_layerSizes[i + 1], m_layerSizes[i + 1]);
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[i + 1], 1);
output = builder.Plus(builder.Times(w, pastValue), input);
@ -588,6 +590,7 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildLogBilinearNetworkFro
}
bi = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"bi%d", i), m_layerSizes[i + 1], 1);
m_net->InitLearnableParameters(bi, L"fixedValue", 0);
output = builder.Plus(input, bi);
if (m_addDropoutNodes)
@ -597,7 +600,7 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildLogBilinearNetworkFro
}
w = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"W%d", numHiddenLayers), m_layerSizes[numHiddenLayers + 1], m_layerSizes[numHiddenLayers]);
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
label = builder.CreateInputNode(L"labels", m_layerSizes[numHiddenLayers + 1]);
AddTrainAndEvalCriterionNodes(input, label, w);
@ -650,6 +653,7 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildDNNLMNetworkFromDescr
if (numHiddenLayers > 0)
{
bi = builder.CreateLearnableParameter(L"bi0", m_layerSizes[1], 1);
m_net->InitLearnableParameters(bi, L"fixedValue", 0);
pastValueXI = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[0], 1);
pastValueXII = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[0], 2);
@ -664,19 +668,19 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildDNNLMNetworkFromDescr
{
// TODO: to figure out sparse matrix size
Wxi2 = builder.CreateLearnableParameter(L"WXI2", m_layerSizes[1], m_layerSizes[0]);
m_net->InitLearnableParameters(Wxi2, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(Wxi2, m_uniformInit, randomSeed++, m_initValueScale);
// TODO: to figure out sparse matrix size
Wxi3 = builder.CreateLearnableParameter(L"WXI3", m_layerSizes[1], m_layerSizes[0]);
m_net->InitLearnableParameters(Wxi3, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(Wxi3, m_uniformInit, randomSeed++, m_initValueScale);
// TODO: to figure out sparse matrix size
Wxi4 = builder.CreateLearnableParameter(L"WXI4", m_layerSizes[1], m_layerSizes[0]);
m_net->InitLearnableParameters(Wxi4, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(Wxi4, m_uniformInit, randomSeed++, m_initValueScale);
// TODO: to figure out sparse matrix size
Wxi1 = builder.CreateLearnableParameter(L"WXI1", m_layerSizes[1], m_layerSizes[0]);
m_net->InitLearnableParameters(Wxi1, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(Wxi1, m_uniformInit, randomSeed++, m_initValueScale);
// TODO: to figure out sparse matrix size
Wxi = builder.CreateLearnableParameter(L"WXI", m_layerSizes[1], m_layerSizes[0]);
m_net->InitLearnableParameters(Wxi, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(Wxi, m_uniformInit, randomSeed++, m_initValueScale);
// unless there is a good algorithm to detect loops, use this explicit setup
it = builder.Plus(
@ -711,11 +715,11 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildDNNLMNetworkFromDescr
for (int i = 1; i < numHiddenLayers; i++)
{
u = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"U%d", i), m_layerSizes[i + 1], m_layerSizes[i]);
m_net->InitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
if (m_recurrentLayers.size() > 0 && m_recurrentLayers[recur_idx] == i + 1)
{
w = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"W%d", i), m_layerSizes[i + 1], m_layerSizes[i + 1]);
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
std::list<ComputationNodeBasePtr> recurrent_loop;
pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[i + 1], 1);
output = SimpleNetworkBuilder<ElemType>::ApplyNonlinearFunction(builder.Plus(builder.Times(u, input), builder.Times(w, pastValue)), i);
@ -736,8 +740,9 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildDNNLMNetworkFromDescr
// TODO: to figure out sparse matrix size
w = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"W%d", numHiddenLayers), m_layerSizes[numHiddenLayers + 1], m_layerSizes[numHiddenLayers]);
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
// b = builder.CreateLearnableParameter(msra::strfun::wstrprintf (L"B%d", numHiddenLayers), m_layerSizes[numHiddenLayers+1], 1);
m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
//b = builder.CreateLearnableParameter(msra::strfun::wstrprintf (L"B%d", numHiddenLayers), m_layerSizes[numHiddenLayers+1], 1);
//m_net->InitLearnableParameters(b, L"fixedValue", 0);
label = builder.CreateSparseInputNode(L"labels", m_layerSizes[numHiddenLayers + 1]);
AddTrainAndEvalCriterionNodes(input, label, w);
@ -766,11 +771,11 @@ shared_ptr<ComputationNode<ElemType>> /*ComputationNodePtr*/ SimpleNetworkBuilde
if (m_directConnect[i] == iLayer)
{
ComputationNodePtr directWIO = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"D%d", i), outputDim, inputDim);
m_net->InitLearnableParameters(directWIO, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(directWIO, m_uniformInit, randomSeed++, m_initValueScale);
directOutput = ApplyNonlinearFunction(builder.Times(directWIO, input), i);
ComputationNodePtr scalar = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"SV%d", i), 1, 1);
scalar->Value().SetValue((ElemType) 0.01);
m_net->InitLearnableParameters(scalar, L"fixedValue", (ElemType) 0.01);
ComputationNodePtr scaled = builder.ElementTimes(scalar, directOutput, msra::strfun::wstrprintf(L"S%d", i));
mergedNode = builder.Plus(toNode, scaled);
@ -801,39 +806,38 @@ shared_ptr<ComputationNode<ElemType>> /*ComputationNodePtr*/ SimpleNetworkBuilde
Wxf = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"WXF%d", iLayer), outputDim, inputDim);
Wxc = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"WXC%d", iLayer), outputDim, inputDim);
m_net->InitLearnableParameters(Wxo, m_uniformInit, randomSeed++, m_initValueScale);
m_net->InitLearnableParameters(Wxi, m_uniformInit, randomSeed++, m_initValueScale);
m_net->InitLearnableParameters(Wxf, m_uniformInit, randomSeed++, m_initValueScale);
m_net->InitLearnableParameters(Wxc, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(Wxo, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(Wxi, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(Wxf, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(Wxc, m_uniformInit, randomSeed++, m_initValueScale);
bo = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"bo%d", iLayer), outputDim, 1);
bc = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"bc%d", iLayer), outputDim, 1);
bi = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"bi%d", iLayer), outputDim, 1);
bf = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"bf%d", iLayer), outputDim, 1);
// if (m_forgetGateInitVal > 0)
bf->Value().SetValue(m_forgetGateInitVal);
// if (m_inputGateInitVal > 0)
bi->Value().SetValue(m_inputGateInitVal);
// if (m_outputGateInitVal > 0)
bo->Value().SetValue(m_outputGateInitVal);
m_net->InitLearnableParameters(bi, L"fixedValue", m_inputGateInitVal);
m_net->InitLearnableParameters(bc, L"fixedValue", 0);
m_net->InitLearnableParameters(bo, L"fixedValue", m_outputGateInitVal);
m_net->InitLearnableParameters(bf, L"fixedValue", m_forgetGateInitVal);
Whi = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"WHI%d", iLayer), outputDim, outputDim);
m_net->InitLearnableParameters(Whi, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(Whi, m_uniformInit, randomSeed++, m_initValueScale);
Wci = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"WCI%d", iLayer), outputDim, 1);
m_net->InitLearnableParameters(Wci, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(Wci, m_uniformInit, randomSeed++, m_initValueScale);
Whf = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"WHF%d", iLayer), outputDim, outputDim);
m_net->InitLearnableParameters(Whf, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(Whf, m_uniformInit, randomSeed++, m_initValueScale);
Wcf = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"WCF%d", iLayer), outputDim, 1);
m_net->InitLearnableParameters(Wcf, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(Wcf, m_uniformInit, randomSeed++, m_initValueScale);
Who = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"WHO%d", iLayer), outputDim, outputDim);
m_net->InitLearnableParameters(Who, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(Who, m_uniformInit, randomSeed++, m_initValueScale);
Wco = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"WCO%d", iLayer), outputDim, 1);
m_net->InitLearnableParameters(Wco, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(Wco, m_uniformInit, randomSeed++, m_initValueScale);
Whc = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"WHC%d", iLayer), outputDim, outputDim);
m_net->InitLearnableParameters(Whc, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(Whc, m_uniformInit, randomSeed++, m_initValueScale);
size_t layer1 = outputDim;
@ -848,8 +852,8 @@ shared_ptr<ComputationNode<ElemType>> /*ComputationNodePtr*/ SimpleNetworkBuilde
if (m_constInputGateValue)
{
// it = builder.CreateLearnableParameter(msra::strfun::wstrprintf (L"CONSTIT%d", iLayer), outputDim);
// m_net->InitLearnableParameters(it, L"fixedValue", m_constInputGateValue);
// it->SetLearningRateMultiplier(0);
// it->Value().SetValue(m_constInputGateValue);
it = nullptr;
}
else
@ -988,7 +992,7 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildCRFLSTMNetworkFromDes
if (m_lookupTableOrder > 0)
{
e = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"E%d", 0), m_layerSizes[1], m_layerSizes[0] / m_lookupTableOrder);
m_net->InitLearnableParameters(e, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(e, m_uniformInit, randomSeed++, m_initValueScale);
output = builder.LookupTable(e, input, L"LookupTable");
if (m_addDropoutNodes)
@ -1017,8 +1021,9 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildCRFLSTMNetworkFromDes
else
{
u = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"U%d", i), m_layerSizes[i + 1], m_layerSizes[i] * (offset ? m_lookupTableOrder : 1));
m_net->InitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
b = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"B%d", i), m_layerSizes[i + 1], 1);
m_net->InitLearnableParameters(b, L"fixedValue", 0);
output = ApplyNonlinearFunction(builder.Plus(builder.Times(u, input), b), i);
}
@ -1030,13 +1035,13 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildCRFLSTMNetworkFromDes
}
w = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"TimesBeforeSoftMax%d", numHiddenLayers), m_layerSizes[numHiddenLayers + 1], m_layerSizes[numHiddenLayers]);
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
output = builder.Times(w, input, L"outputsBeforeSoftmax");
trans = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"TransProb%d", numHiddenLayers), m_layerSizes[numHiddenLayers + 1], m_layerSizes[numHiddenLayers + 1]);
trans->Value().SetValue((ElemType) 1.0 / m_layerSizes[numHiddenLayers + 1]);
// m_net->InitLearnableParameters(trans, m_uniformInit, randomSeed++, m_initValueScale);
m_net->InitLearnableParameters(trans, L"fixedValue", (ElemType) 1.0 / m_layerSizes[numHiddenLayers + 1]);
// m_net->RandomInitLearnableParameters(trans, m_uniformInit, randomSeed++, m_initValueScale);
trans->SetLearningRateMultiplier(1.0f);
label = builder.CreateInputNode(L"labels", m_layerSizes[numHiddenLayers + 1]);
AddTrainAndEvalCriterionNodes(output, label, nullptr, L"CRFTrainCriterion", L"CRFEvalCriterion", nullptr, trans);
@ -1085,7 +1090,7 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildClassLSTMNetworkFromD
if (m_lookupTableOrder > 0)
{
e = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"E%d", 0), m_layerSizes[1], m_layerSizes[0] / m_lookupTableOrder);
m_net->InitLearnableParameters(e, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(e, m_uniformInit, randomSeed++, m_initValueScale);
output = builder.LookupTable(e, input, L"LookupTable");
if (m_addDropoutNodes)
@ -1122,13 +1127,13 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildClassLSTMNetworkFromD
// e.g., [200 x 10000], where 10000 is the vocabulary size
// this is for speed-up: the per-word matrix can simply be obtained using a column slice
w = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"W%d", numHiddenLayers), m_layerSizes[numHiddenLayers], m_layerSizes[numHiddenLayers + 1]);
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
// the label is a dense matrix. each element is the word index
label = builder.CreateInputNode(L"labels", 4);
clsweight = builder.CreateLearnableParameter(L"WeightForClassPostProb", m_nbrCls, m_layerSizes[numHiddenLayers]);
m_net->InitLearnableParameters(clsweight, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(clsweight, m_uniformInit, randomSeed++, m_initValueScale);
clslogpostprob = builder.Times(clsweight, input, 1, L"ClassPostProb");
output = AddTrainAndEvalCriterionNodes(input, label, w, L"TrainNodeClassBasedCrossEntropy", L"EvalNodeClassBasedCrossEntrpy",
@ -1164,16 +1169,16 @@ shared_ptr<ComputationNode<ElemType>> /*ComputationNodePtr*/ SimpleNetworkBuilde
input = inputObs;
size_t nDim = inputDim + outputDim + 2;
wInputGate = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"WINPUTGATE%d", iLayer), outputDim, nDim);
m_net->InitLearnableParameters(wInputGate, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(wInputGate, m_uniformInit, randomSeed++, m_initValueScale);
wInputGate->Value().ColumnSlice(0, 1).SetValue(m_inputGateInitVal); // init to input gate bias
wForgetGate = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"WFORGETGATE%d", iLayer), outputDim, nDim);
m_net->InitLearnableParameters(wForgetGate, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(wForgetGate, m_uniformInit, randomSeed++, m_initValueScale);
wForgetGate->Value().ColumnSlice(0, 1).SetValue(m_forgetGateInitVal); // init to forget gate bias
wOutputGate = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"WOUTPUTGATE%d", iLayer), outputDim, nDim);
m_net->InitLearnableParameters(wOutputGate, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(wOutputGate, m_uniformInit, randomSeed++, m_initValueScale);
wOutputGate->Value().ColumnSlice(0, 1).SetValue(m_outputGateInitVal); // init to output gate bias
wMemoryCellMatrix = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"WMEMORYCELLWEIGHT%d", iLayer), outputDim, inputDim + outputDim + 1);
m_net->InitLearnableParameters(wMemoryCellMatrix, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(wMemoryCellMatrix, m_uniformInit, randomSeed++, m_initValueScale);
wMemoryCellMatrix->Value().ColumnSlice(0, 1).SetValue(0); // init to memory cell bias
output = builder.LSTM(inputObs, wInputGate, wForgetGate, wOutputGate, wMemoryCellMatrix, msra::strfun::wstrprintf(L"LSTM%d", iLayer));
@ -1234,7 +1239,7 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildLSTMNetworkFromDescri
if (m_lookupTableOrder > 0)
{
e = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"E%d", 0), m_layerSizes[1], m_layerSizes[0] / m_lookupTableOrder);
m_net->InitLearnableParameters(e, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(e, m_uniformInit, randomSeed++, m_initValueScale);
output = builder.LookupTable(e, input, L"LookupTable");
#ifdef DEBUG_DECODER
e->Value().SetValue((ElemType) 0.01);
@ -1275,8 +1280,9 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildLSTMNetworkFromDescri
else
{
u = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"U%d", i), m_layerSizes[i + 1], m_layerSizes[i]);
m_net->InitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
b = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"B%d", i), m_layerSizes[i + 1], 1);
m_net->InitLearnableParameters(b, L"fixedValue", 0);
output = ApplyNonlinearFunction(builder.Plus(builder.Times(u, input), b), i);
}
@ -1290,7 +1296,7 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildLSTMNetworkFromDescri
}
w = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"W%d", numHiddenLayers), m_layerSizes[numHiddenLayers + 1], m_layerSizes[numHiddenLayers]);
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
#ifdef DEBUG_DECODER
w->Value().SetValue((ElemType) 0.01);
#endif
@ -1349,7 +1355,7 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildNCELSTMNetworkFromDes
if (m_lookupTableOrder > 0)
{
e = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"E%d", 0), m_layerSizes[1], m_layerSizes[0] / m_lookupTableOrder);
m_net->InitLearnableParameters(e, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(e, m_uniformInit, randomSeed++, m_initValueScale);
output = builder.LookupTable(e, input, L"LookupTable");
if (m_addDropoutNodes)
@ -1381,8 +1387,9 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildNCELSTMNetworkFromDes
else
{
u = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"U%d", i), m_layerSizes[i + 1], m_layerSizes[i]);
m_net->InitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
b = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"B%d", i), m_layerSizes[i + 1], 1);
m_net->InitLearnableParameters(b, L"fixedValue", 0);
output = ApplyNonlinearFunction(builder.Plus(builder.Times(u, input), b), i);
}
@ -1407,14 +1414,14 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildNCELSTMNetworkFromDes
// e.g., [200 x 10000], where 10000 is the vocabulary size
// this is for speed-up: the per-word matrix can simply be obtained using a column slice
w = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"W%d", numHiddenLayers), m_layerSizes[numHiddenLayers], m_layerSizes[numHiddenLayers + 1]);
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
// the label is a dense matrix. each element is the word index
label = builder.CreateInputNode(L"labels", 2 * (this->nce_noises + 1));
bias = builder.CreateLearnableParameter(L"BiasVector", 1, m_layerSizes[m_layerSizes.size() - 1]);
bias->Value().SetValue((ElemType) -std::log(m_layerSizes[m_layerSizes.size() - 1]));
// m_net->InitLearnableParameters(bias, m_uniformInit, randomSeed++, std::log(m_layerSizes[m_layerSizes.size() - 1])* m_initValueScale);
m_net->InitLearnableParameters(bias, L"fixedValue", (ElemType) -std::log(m_layerSizes[m_layerSizes.size() - 1]));
// m_net->RandomInitLearnableParameters(bias, m_uniformInit, randomSeed++, std::log(m_layerSizes[m_layerSizes.size() - 1])* m_initValueScale);
// clslogpostprob = builder.Times(clsweight, input, 1, L"ClassPostProb");
output = AddTrainAndEvalCriterionNodes(input, label, w, L"TrainNodeNCEBasedCrossEntropy", L"EvalNodeNCEBasedCrossEntrpy", bias);
@ -1525,10 +1532,12 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildNetworkFromDbnFile(co
wstring nameOfH = msra::strfun::wstrprintf(L"H%d", i + 1);
w = builder.CreateLearnableParameter(nameOfW, wts.GetNumRows(), wts.GetNumCols());
w->Value().SetValue(wts);
m_net->InitLearnableParameters(w, L"fixedValue", 0); // follow protocol
w->Value().SetValue(wts); // and overwrite
b = builder.CreateLearnableParameter(nameOfB, bias.GetNumRows(), 1);
b->Value().SetValue(bias);
m_net->InitLearnableParameters(b, L"fixedValue", 0); // follow protocol
b->Value().SetValue(bias); // and overwrite
if (layerType == "perceptron")
{
@ -1588,8 +1597,9 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildNetworkFromDbnFile(co
wstring nameOfH = msra::strfun::wstrprintf(L"H%d", i + 1);
w = builder.CreateLearnableParameter(nameOfW, outputLayerSize, penultimateSize);
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
b = builder.CreateLearnableParameter(nameOfB, outputLayerSize, 1);
m_net->InitLearnableParameters(b, L"fixedValue", 0);
output = builder.Plus(builder.Times(w, input, 1, nameOfTimes), b, nameOfPlus);
m_net->RenameNode(output, L"HLast");
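The pattern throughout this file routes every learnable parameter through an explicit initialization call: RandomInitLearnableParameters for randomly initialized weights, and InitLearnableParameters with the "fixedValue" protocol for biases and other constants (even when the value is overwritten right afterwards). The same two paths are exposed at the BrainScript level by ParameterTensor further down in this commit; a minimal sketch, assuming placeholder dimensions hiddenDim and featDim:

    W = ParameterTensor {(hiddenDim : featDim), init = 'uniform', initValueScale = 1}   # random init
    b = ParameterTensor {(hiddenDim), initValue = 0}                                    # fixed-value init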

View file

@ -53,7 +53,6 @@ public:
__declspec_noreturn static inline void EvaluationError(const wstring &msg, TextLocation where)
{
//Microsoft::MSR::CNTK::DebugUtil::PrintCallStack();
throw EvaluationException(msg, where);
}
@ -536,8 +535,13 @@ static ConfigValuePtr Evaluate(const ExpressionPtr &e, const IConfigRecordPtr &s
}
return ConfigValuePtr(make_shared<ConfigLambda>(move(paramNames), move(namedParams), f), MakeFailFn(e->location), exprPath);
}
else if (e->op == L"(") // === apply a function to its arguments
else if (e->op == L"(" || e->op == L"{") // === apply a function to its arguments
{
// Note: "{" is experimental and currently ignored as a distinction. To do it more completely, we need
// - remember how a function was declared (currently not possible for lambdas)
// - make sure the invocation matches declaration
// - disallow calling Parameter() or any other creating functions as "()"
// - disallow calling "{}"-declared functions from inside a "()"
let &lambdaExpr = e->args[0]; // [0] = function
let &argsExpr = e->args[1]; // [1] = arguments passed to the function ("()" expression of expressions)
let lambda = AsPtr<ConfigLambda>(Evaluate(lambdaExpr, scope, exprPath, L"" /*macros are not visible in expression names*/), lambdaExpr, L"function");
@ -848,8 +852,8 @@ static wstring FormatConfigValue(ConfigValuePtr arg, const wstring &how)
{
let arr = arg.AsPtr<ConfigArray>();
wstring result;
let range = arr->GetIndexRange();
for (int i = range.first; i <= range.second; i++)
let range = arr->GetIndexBeginEnd();
for (int i = range.first; i < range.second; i++)
{
if (i > range.first)
result.append(L"\n");
@ -890,20 +894,20 @@ public:
else // otherwise expect an array
{
let & arr = arg.AsRef<ConfigArray>();
let range = arr.GetIndexRange();
us = (double)(range.second + 1 - range.first);
let range = arr.GetSize(arg.GetFailFn());
us = (double)range;
}
}
}
else if (what == L"Mod" || what == L"IntDiv") //two-arg int functions
else if (what == L"Mod" || what == L"IntDiv") // two-arg int functions
{
let argsArg = config[L"args"];
let& args = argsArg.AsRef<ConfigArray>();
auto range = args.GetIndexRange();
if (range.second != range.first + 1)
auto range = args.GetIndexBeginEnd();
if (range.second != range.first + 2)
argsArg.Fail(L"Mod/IntDiv expects two arguments");
let arg1 = (int)args.At(range.first);
let arg2 = (int)args.At(range.second);
let arg2 = (int)args.At(range.first + 1);
if (what == L"Mod")
us = (int)(arg1 % arg2);
@ -918,6 +922,7 @@ public:
// CompareFunctions
// - IsSameObject()
// - IsArray()
class CompareFunction : public BoxOf<Bool>
{
public:
@ -932,13 +937,17 @@ public:
if (what == L"IsSameObject")
{
let& args = argsArg.AsRef<ConfigArray>();
auto range = args.GetIndexRange();
if (range.second != range.first+1)
auto range = args.GetIndexBeginEnd();
if (range.second != range.first + 2)
argsArg.Fail(L"IsSameObject expects two arguments");
let arg1 = args.At(range.first ).AsPtr<Object>();
let arg2 = args.At(range.second).AsPtr<Object>();
let arg1 = args.At(range.first ).AsPtr<Object>();
let arg2 = args.At(range.first + 1).AsPtr<Object>();
us = arg1.get() == arg2.get();
}
else if (what == L"IsArray")
{
us = argsArg.Is<ConfigArray>();
}
else
whatArg.Fail(L"Unknown 'what' value to CompareFunction: " + what);
}
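The evaluator now reads array bounds with the half-open GetIndexBeginEnd() instead of the inclusive GetIndexRange(), and fetches the second argument of two-argument functions at range.first + 1; IsArray is added alongside IsSameObject. An illustrative BrainScript use of the affected built-ins, assuming the Mod, IntDiv, and IsSameObject definitions from CNTK.core.bs below are in scope:

    arr = (1 : 2 : 3)
    do = Print (Mod (7, 3)) : Print (IntDiv (7, 3)) : Print (IsSameObject (arr, arr))   # should print 1, 2, and true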

View file

@ -22,6 +22,4 @@ ConfigValuePtr Evaluate(ExpressionPtr); // evaluat
void Do(ExpressionPtr e); // evaluate e.do
shared_ptr<Object> EvaluateField(ExpressionPtr e, const wstring& id); // for experimental CNTK integration
// some simple tests
void SomeTests();
} } } // end namespaces
}}} // end namespaces

View file

@ -13,6 +13,7 @@
#include <set>
#include <stdexcept>
#include <algorithm>
#include <iomanip>
#ifndef let
#define let const auto
@ -89,9 +90,18 @@ struct Issue
// Because it is often hard to recognize an issue only from the point where it occurred, we also report the history in compact visual form.
// Since often multiple contexts are on the same source line, we only print each source line once in a consecutive row of contexts.
/*static*/ void TextLocation::PrintIssue(const vector<TextLocation>& locations, const wchar_t* errorKind, const wchar_t* kind, const wchar_t* what)
{
wstring error = CreateIssueMessage(locations, errorKind, kind, what);
fprintf(stderr, "%ls", error.c_str());
fflush(stderr);
}
/*static*/ wstring TextLocation::CreateIssueMessage(const vector<TextLocation>& locations, const wchar_t* errorKind, const wchar_t* kind, const wchar_t* what)
{
vector<Issue> issues; // tracing the error backwards
size_t symbolIndex = 0;
wstring message;
for (size_t n = 0; n < locations.size(); n++)
{
let& location = locations[n];
@ -125,20 +135,23 @@ struct Issue
if (!locations.empty()) // (be resilient to some throwers not having a TextLocation; to be avoided)
{
let& firstLoc = issues.front().location;
fprintf(stderr, "[CALL STACK]\n");
message += wstrprintf(L"[CALL STACK]\n");
for (auto i = issues.rbegin(); i != issues.rend(); i++)
{
let& issue = *i;
auto& where = issue.location;
const auto& lines = where.GetSourceFile().lines;
const auto line = (where.lineNo == lines.size()) ? L"(end)" : lines[where.lineNo].c_str();
fprintf(stderr, " %ls\n %ls\n", line, issue.markup.c_str());
message += wstrprintf(L" %ls\n %ls\n", line, issue.markup.c_str());
}
fprintf(stderr, "%ls while %ls: %ls(%d)", errorKind, kind, firstLoc.GetSourceFile().path.c_str(), (int)firstLoc.lineNo + 1 /*report 1-based*/);
message += wstrprintf(L"%ls while %ls: %ls(%d)", errorKind, kind, firstLoc.GetSourceFile().path.c_str(), (int)firstLoc.lineNo + 1 /*report 1-based*/);
}
else
fprintf(stderr, "%ls while %ls", errorKind, kind);
fprintf(stderr, ": %ls\n", what), fflush(stderr);
{
message += wstrprintf(L"%ls while %ls", errorKind, kind);
}
message += wstrprintf(L": %ls\n", what);
return message;
}
/*static*/ vector<SourceFile> TextLocation::sourceFileMap;
@ -286,7 +299,7 @@ public:
};
punctuations = set<wstring>{
L"=", L";", L",", L"\n",
L"[", L"]", L"(", L")",
L"[", L"]", L"(", L")", L"{", L"}", L"[|", L"|]",
L"+", L"-", L"*", L"/", L"**", L".*", L"%", L"||", L"&&", L"^",
L"!",
L"==", L"!=", L"<", L"<=", L">", L">=",
@ -557,37 +570,43 @@ public:
// ---------------------------------------------------------------------------
// diagnostics helper: print the content
void Expression::Dump(int indent) const
void Expression::DumpToStream(wstringstream & treeStream, int indent)
{
fprintf(stderr, "%*s", indent, "");
treeStream << std::setfill(L' ') << std::setw(indent) << L" ";
treeStream << std::setw(0);
if (op == L"s")
fprintf(stderr, "'%ls' ", s.c_str());
treeStream << "'" << s.c_str() << "'";
else if (op == L"d")
fprintf(stderr, "%.f ", d);
treeStream << std::fixed << std::setprecision(0) << d;
else if (op == L"b")
fprintf(stderr, "%s ", b ? "true" : "false");
treeStream << (b ? "true" : "false");
else if (op == L"id")
fprintf(stderr, "%ls ", id.c_str());
treeStream << id.c_str();
else if (op == L"new" || op == L"array" || op == L".")
fprintf(stderr, "%ls %ls ", op.c_str(), id.c_str());
treeStream << op.c_str() << " " << id.c_str();
else
fprintf(stderr, "%ls ", op.c_str());
treeStream << op.c_str();
if (!args.empty())
{
fprintf(stderr, "\n");
treeStream << std::endl;
for (const auto& arg : args)
arg->Dump(indent + 2);
{
arg->DumpToStream(treeStream, indent + 1);
}
}
if (!namedArgs.empty())
{
fprintf(stderr, "\n");
treeStream << std::endl;
for (const auto& arg : namedArgs)
{
fprintf(stderr, "%*s%ls =\n", indent + 2, "", arg.first.c_str());
arg.second.second->Dump(indent + 4);
treeStream << std::setfill(L' ') << std::setw(indent + 1) << L"";
treeStream << arg.first.c_str() << L" =" << std::endl;
arg.second.second->DumpToStream(treeStream, indent + 2);
}
}
fprintf(stderr, "\n");
treeStream << std::endl;
}
class Parser : public Lexer
@ -647,13 +666,15 @@ class Parser : public Lexer
return id;
}
map<wstring, int> infixPrecedence; // precedence level of infix operators
map<wstring, int> infixPrecedence; // precedence level of infix operators
static const int unaryPrecedence = 90; // for unary "-" and "!". 90 is below x., x[, x(, and x{, but above all others
// TODO: Would be more direct to fold this into the table below as well.
public:
Parser(SourceFile&& sourceFile, vector<wstring>&& includePaths)
: Lexer(move(includePaths))
{
infixPrecedence = map<wstring, int>{
{L".", 99}, {L"[", 99}, {L"(", 99}, // also sort-of infix operands...
{L".", 99}, {L"[", 99}, {L"(", 99}, {L"{", 99}, // (with LHS) these are also sort-of infix operands...
{L"*", 10}, {L"/", 10}, {L".*", 10}, {L"**", 10}, {L"%", 10},
{L"+", 9}, {L"-", 9}, {L"with", 9}, {L"==", 8},
{L"!=", 8}, {L"<", 8}, {L"<=", 8}, {L">", 8}, {L">=", 8},
@ -700,7 +721,7 @@ public:
{
operand = make_shared<Expression>(tok.beginLocation, tok.symbol + L"("); // encoded as +( -( !(
ConsumeToken();
operand->args.push_back(ParseExpression(100, stopAtNewline));
operand->args.push_back(ParseExpression(unaryPrecedence, stopAtNewline));
}
else if (tok.symbol == L"new") // === new class instance
{
@ -723,13 +744,34 @@ public:
operand = ParseExpression(0, false /*go across newlines*/); // just return the content of the parens (they do not become part of the expression tree)
ConsumePunctuation(L")");
}
else if (tok.symbol == L"[") // === dictionary constructor
else if (tok.symbol == L"{" || tok.symbol == L"["/*soon to be deprecated*/) // === record constructor
{
let* closeSymbol = tok.symbol == L"{" ? L"}" : L"]";
operand = make_shared<Expression>(tok.beginLocation, L"[]");
ConsumeToken();
operand->namedArgs = ParseRecordMembers();
ConsumePunctuation(L"]");
ConsumePunctuation(closeSymbol);
}
#if 1 // the F# syntax is a stop-gap and meant for experimentation, and we will not recommend to use it
// Rather, we must find a way to parse both Python-like array literals and BS dictionaries jointly,
// and eventually deprecate [] for records.
else if (tok.symbol == L"[|") // === array literal using F# syntax [| a; b; c |] (same as a:b:c, but also allows for 0- and 1-element arrays)
{
operand = make_shared<Expression>(tok.beginLocation, L":");
ConsumeToken();
if (GotToken().symbol != L"|]") // {} defines an empty array
{
for (;;)
{
operand->args.push_back(ParseExpression(0, false)); // item. Precedence 0 means go until comma or closing parenthesis.
if (GotToken().symbol != L";")
break;
ConsumeToken();
}
}
ConsumePunctuation(L"|]");
}
#endif
else if (tok.symbol == L"array") // === array constructor
{
operand = OperandFromTokenSymbol(tok);
@ -780,18 +822,18 @@ public:
if (left->op != L"id") // currently only allow for a single argument
Expected(L"identifier");
ConsumeToken();
let macroArgs = make_shared<Expression>(left->location, L"()", left); // wrap identifier in a '()' macro-args expression
let macroArgs = make_shared<Expression>(left->location, L"()", left); // wrap identifier in a "()" macro-args expression
// TODO: test parsing of i => j => i*j
let body = ParseExpression(opPrecedence, stopAtNewline); // pass same precedence; this makes '=>' right-associative, e.g. i => j => i*j
operation->args[0] = macroArgs; // [0]: parameter list
operation->args.push_back(body); // [1]: right operand
}
else if (op == L"(") // === macro application
else if (op == L"(" || op == L"{") // === macro application
{
// op = "(" means 'apply'
// op = "(" and "{" mean 'apply', where {} refers to experimental constructor syntax
// args[0] = lambda expression (lambda: op="=>", args[0] = param list, args[1] = expression with unbound vars)
// args[1] = arguments (arguments: op="(), args=vector of expressions, one per arg; and namedArgs)
operation->args.push_back(ParseMacroArgs(false)); // [1]: all arguments
// args[1] = arguments (arguments: op="()", args=vector of expressions, one per arg; and namedArgs)
operation->args.push_back(ParseMacroArgs(false, op)); // [1]: all arguments
}
else if (op == L"[") // === array index
{
@ -829,11 +871,12 @@ public:
// In case of macro definition, all arguments must be of type "id". Pass 'defining' to check for that.
// namedArgs = dictionary of optional args
// In case of macro definition, dictionary values are default values that are used if the argument is not given
ExpressionPtr ParseMacroArgs(bool defining)
ExpressionPtr ParseMacroArgs(bool defining, wstring openSymbol)
{
ConsumePunctuation(L"(");
ConsumePunctuation(openSymbol.c_str());
auto macroArgs = make_shared<Expression>(GotToken().beginLocation, L"()");
if (GotToken().symbol != L")") // x() defines an empty argument list
let* closeSymbol = openSymbol == L"(" ? L")" : L"}";
if (GotToken().symbol != closeSymbol) // x() defines an empty argument list
{
for (;;)
{
@ -856,7 +899,7 @@ public:
ConsumeToken();
}
}
ConsumePunctuation(L")");
ConsumePunctuation(closeSymbol);
return macroArgs;
}
map<wstring, pair<TextLocation, ExpressionPtr>> ParseRecordMembers()
@ -865,7 +908,7 @@ public:
// member identifier -> expression
// Macro declarations are translated into lambdas, e.g.
// F(A,B) = expr(A,B)
// gets represented in the dictionary as
// (and likewise F{A,B}) gets represented in the dictionary as
// F = (A,B) => expr(A,B)
// where a lambda expression has this structure:
// op="=>"
@ -897,7 +940,8 @@ public:
ConsumePunctuation(L"]");
}
// optional macro args
let parameters = (GotToken().symbol == L"(") ? ParseMacroArgs(true /*defining*/) : ExpressionPtr(); // optionally, macro arguments
let& openParen = GotToken().symbol;
let parameters = (openParen == L"(" || openParen == L"{") ? ParseMacroArgs(true /*defining*/, openParen) : ExpressionPtr(); // optionally, macro arguments
ConsumePunctuation(L"=");
auto rhs = ParseExpression(0, true /*can end at newline*/); // and the right-hand side
// if macro then rewrite it as an assignment of a lambda expression
@ -907,7 +951,8 @@ public:
if (arrayIndexExpr)
{
// create a lambda expression over the index variable
let macroArgs = make_shared<Expression>(arrayIndexExpr->location, L"()", arrayIndexExpr); // wrap identifier in a '()' macro-args expression
// BUGBUG: For {} constructor functions--we cannot declare constructor lambdas for now.
let macroArgs = make_shared<Expression>(arrayIndexExpr->location, L"()", arrayIndexExpr); // wrap identifier in a "()" macro-args expression
let initLambdaExpr = make_shared<Expression>(arrayIndexExpr->location, L"=>", macroArgs, rhs); // [0] is id, [1] is body
rhs = make_shared<Expression>(location, L"array");
rhs->args.push_back(fromExpr); // [0] first index
@ -939,12 +984,6 @@ public:
topDict->namedArgs = topMembers;
return topDict;
}
// simple test function for use during development
static void Test()
{
let parserTest = L"a=1\na1_=13;b=2 // cmt\ndo = (print\n:train:eval) ; x = array[1..13] (i=>1+i*print.message==13*42) ; print = new PrintAction [ message = 'Hello World' ]";
ParseConfigDictFromString(parserTest, L"Test", vector<wstring>())->Dump();
}
};
// globally exported functions to execute the parser
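Taken together, the parser changes above accept the following surface forms: '{ }' as a record constructor (alongside the older '[ ]'), '{ }' for declaring and applying constructor-style macros, and the experimental F#-style array literal '[| ... |]'. A hedged BrainScript sketch with made-up identifiers:

    opts = { hiddenDim = 512 ; numLayers = 3 }    # record constructor with braces (same meaning as [ ... ])
    Scale {factor} = { f(x) = factor * x }.f      # constructor-style macro declared with { }
    double = Scale {2}                            # ... and instantiated with { }
    primes = [| 2; 3; 5 |]                        # F#-style array literal, same as 2:3:5 but also allows 0- or 1-element arrays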

View file

@ -37,6 +37,7 @@ struct TextLocation // position in the text. Lightweight value struct that we ca
// helpers for pretty-printing errors: Show source-code line with ...^ under it to mark up the point of error
static void PrintIssue(const vector<TextLocation>& locations, const wchar_t* errorKind, const wchar_t* kind, const wchar_t* what);
static std::wstring CreateIssueMessage(const vector<TextLocation>& locations, const wchar_t* errorKind, const wchar_t* kind, const wchar_t* what);
static void Trace(TextLocation, const wchar_t* traceKind, const wchar_t* op, const wchar_t* exprPath);
// construction
@ -77,8 +78,12 @@ public:
} // where the error happened
virtual const wchar_t* kind() const = 0; // e.g. "warning" or "error"
wstring GetError(const std::wstring& linePrefix) const override
{
return TextLocation::CreateIssueMessage(locations, linePrefix.c_str(), kind(), msra::strfun::utf16(what()).c_str());
}
// pretty-print this as an error message
void /*ScriptingException::*/ PrintError(const std::wstring& linePrefix) const
void /*ScriptingException::*/ PrintError(const std::wstring& linePrefix) const override
{
TextLocation::PrintIssue(locations, linePrefix.c_str(), kind(), msra::strfun::utf16(what()).c_str());
}
@ -129,7 +134,7 @@ struct Expression
args.push_back(arg2);
}
// diagnostics helper: print the content
void Dump(int indent = 0) const;
void DumpToStream(wstringstream & treeStream, int indent = 0);
};
typedef Expression::ExpressionPtr ExpressionPtr; // circumvent some circular definition problem

View file

@ -1,194 +0,0 @@
// BrainScriptTest.cpp -- some tests
#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
#include "Basics.h"
#include "BrainScriptEvaluator.h"
#include "BrainScriptParser.h"
#ifndef let
#define let const auto
#endif
namespace Microsoft { namespace MSR { namespace BS {
using namespace std;
using namespace msra::strfun;
// Note: currently this seems to be the master copy; got to check whether the other one was also changed
//extern wstring standardFunctions, computationNodes, commonMacros;
#if 1 // TODO: these may be newer, merge into Experimentalthingy
static wstring standardFunctions =
L"Print(value, format='') = new PrintAction [ what = value /*; how = format*/ ] \n"
L"Fail(msg) = new FailAction [ what = msg ] \n"
L"RequiredParameter(message) = Fail('RequiredParameter: ' + message) \n"
L"Format(value, format) = new StringFunction [ what = 'Format' ; arg = value ; how = format ] \n"
L"Replace(s, from, to) = new StringFunction [ what = 'Replace' ; arg = s ; replacewhat = from ; withwhat = to ] \n"
L"Substr(s, begin, num) = new StringFunction [ what = 'Substr' ; arg = s ; pos = begin ; chars = num ] \n"
L"Chr(c) = new StringFunction [ what = 'Chr' ; arg = c ] \n"
L"Floor(x) = new NumericFunction [ what = 'Floor' ; arg = x ] \n"
L"Length(x) = new NumericFunction [ what = 'Length' ; arg = x ] \n"
L"Ceil(x) = -Floor(-x) \n"
L"Round(x) = Floor(x+0.5) \n"
L"Abs(x) = if x >= 0 then x else -x \n"
L"Sign(x) = if x > 0 then 1 else if x < 0 then -1 else 0 \n"
L"Min(a,b) = if a < b then a else b \n"
L"Max(a,b) = if a > b then a else b \n"
L"Fac(n) = if n > 1 then Fac(n-1) * n else 1 \n";
static wstring computationNodes = // BUGBUG: optional args not working yet, some scope problem causing a circular reference
L"Mean(z, tag='') = new ComputationNode [ class = 'MeanNode' ; inputs = z /* ; tag = tag */ ]\n"
L"InvStdDev(z, tag='') = new ComputationNode [ class = 'InvStdDevNode' ; inputs = z /* ; tag = tag */ ]\n"
L"PerDimMeanVarNormalization(feat,mean,invStdDev, tag='') = new ComputationNode [ class = 'PerDimMeanVarNormalizationNode' ; inputs = feat:mean:invStdDev /* ; tag = tag */ ]\n"
L"Parameter(outD, inD, tag='parameter') = new ComputationNode [ class = 'LearnableParameterNode' ; outDim = outD ; inDim = inD /*; tag = tag*/ ]\n"
L"Input(dim,tag='features') = Parameter(dim,1,tag=tag) // TODO: for now \n"
L"RowSlice(firstRow, rows, features, tag='') = new ComputationNode [ class = 'RowSliceNode' ; inputs = features ; first = firstRow ; num = rows /* ; tag = tag */ ]\n"
L"Delay(in, delay, tag='') = new RecurrentComputationNode [ class = 'DelayNode' ; inputs = in ; deltaT = -delay /* ; tag = tag */ ]\n"
L"Sigmoid(z, tag='') = new ComputationNode [ class = 'SigmoidNode' ; inputs = z /* ; tag = tag */ ]\n"
L"Log(z, tag='') = new ComputationNode [ class = 'LogNode' ; inputs = z /* ; tag = tag */ ]\n"
L"CrossEntropyWithSoftmax(labels, outZ, tag='') = new ComputationNode [ class = 'CrossEntropyWithSoftmaxNode' ; inputs = labels:outZ /* ; tag = tag */ ]\n"
L"ErrorPrediction(labels, outZ, tag='') = new ComputationNode [ class = 'ErrorPredictionNode' ; inputs = labels:outZ /* ; tag = tag */ ]\n";
static wstring commonMacros = // TODO: rename rows and cols to inDim and outDim or vice versa, whichever it is
L"BFF(in, rows, cols) = [ B = Parameter(rows, 1/*init = fixedvalue, value = 0*/) ; W = Parameter(rows, cols) ; z = W*in+B ] \n"
L"SBFF(in, rows, cols) = [ Eh = Sigmoid(BFF(in, rows, cols).z) ] \n "
L"MeanVarNorm(feat) = PerDimMeanVarNormalization(feat, Mean(feat), InvStdDev(feat)) \n"
L"LogPrior(labels) = Log(Mean(labels)) \n";
#endif
void SomeTests()
{
try
{
// collecting all sorts of test cases here
const wchar_t* parserTests[] =
{
L"do = Parameter(13,42) * Input(42) + Parameter(13,1)",
L"do = Print(array [1..10] (i=>i*i))",
L"do = new PrintAction [ what = 'abc' ]",
L"do = Print(new StringFunction [ x = 13 ; y = 42 ; what = 'Format' ; how = '.2' ; arg = x*y ])",
L"do = Print(\"new StringFunction [ what = 'Format' ; how = '.2' ; arg = '13 > 42' ]\")",
L"do = new PrintAction [ what = if 13 > 42 || 12 > 1 then 'Hello World' + \"!\" else 'Oops?']",
L"i2s(i) = new StringFunction [ what = 'Format' ; arg = i ; how = '.2' ] ; do = Print('result=' + i2s((( [ v = (i => i + delta) ].v(5)))+13)) ; delta = 42 ",
L"do = Print(1+2*3) : Print('hello'+' world')",
L"do = Print(Format( (13:(fortytwo:1):100), '')) ; fortytwo=42 ",
L"do = Print(val) ; val=if !false then 42 else -+-++-13:[a='a';b=42]:+14; arr = array [1..10] (i => 2*i)",
L"do = Print(arg) ; N = 5 ; arr = array [1..N] (i => if i < N then arr[i+1]*i else N) ; arg = arr ",
L"do = Print(val) ; val = [ v = (i => i + offset) ].v(42) ; offset = 13 ",
// #12: DNN with recursion
L"do = Print(val) \n"
L"val = new NDLComputationNetwork [\n"
L" featDim=40*31 ; labelDim=9000 ; hiddenDim=2048 ; numHiddenLayers = 3 \n"
L" myFeatures = Input(featDim) ; myLabels = Input(labelDim) \n"
L" featNorm = MeanVarNorm(myFeatures) \n"
L" HiddenStack(layer) = if layer > 1 then SBFF(HiddenStack(layer - 1).Eh, hiddenDim, hiddenDim) else SBFF(featNorm, hiddenDim, featDim) \n"
L" outLayer = BFF(HiddenStack(numHiddenLayers).Eh, labelDim, hiddenDim) \n"
L" outZ = outLayer.z \n"
L" CE = CrossEntropyWithSoftmax(myLabels, outZ) \n"
L" Err = ErrorPrediction(myLabels, outZ) \n"
L" logPrior = LogPrior(myLabels) \n"
L" ScaledLogLikelihood = outZ - logPrior \n"
L"]\n",
// #13: factorial
L"do = Print(fac(5)) ; fac(i) = if i > 1 then fac(i-1)*i else 1 ",
// #14: Fibonacci sequence with memoization
L"do = Print(fibs(10)) ; fibs(n) = [ vals = array[1..n] (i => if i < 3 then i-1 else vals[i-1]+vals[i-2]) ].vals[n] ",
// #15: DNN with array
L"do = Print(val) \n"
L"val = new NDLComputationNetwork [\n"
L" featDim=40*31 ; labelDim=9000 ; hiddenDim=2048 ; numHiddenLayers = 3 \n"
L" myFeatures = Input(featDim, tag='features') ; myLabels = Input(labelDim, tag='labels') \n"
L" featNorm = MeanVarNorm(myFeatures) \n"
L" layers[layer:1..numHiddenLayers] = if layer > 1 then SBFF(layers[layer-1].Eh, hiddenDim, hiddenDim) else SBFF(featNorm, hiddenDim, featDim) \n"
L" outLayer = BFF(layers[numHiddenLayers].Eh, labelDim, hiddenDim) \n"
L" outZ = outLayer.z + Delay(outZ, 1) \n"
L" CE = CrossEntropyWithSoftmax(myLabels, outZ) \n"
L" Err = ErrorPrediction(myLabels, outZ) \n"
L" logPrior = LogPrior(myLabels) \n"
L" ScaledLogLikelihood = outZ - logPrior \n"
L"]\n",
// #16: windowed RNN
L"do = Print(val) \n"
L"val = new NDLComputationNetwork [ \n"
L" hiddenDim = 512 \n"
L" numHiddenLayers = 2 \n"
L" T = 3 // total context window \n"
L" \n"
L" // data sources \n"
L" featDim = 40 ; labelDim = 9000 \n"
L" myFeatures = Input(featDim) ; myLabels = Input(labelDim) \n"
L" \n"
L" // split the augmented input vector into individual frame vectors \n"
L" subframes[t:0..T - 1] = RowSlice(t * featDim, featDim, myFeatures) \n"
L" \n"
L" // hidden layers \n"
L" layers[layer:1..numHiddenLayers] = [ // each layer stores a dict that stores its hidden fwd and bwd state vectors \n"
L" // model parameters \n"
L" W_fwd = Parameter(hiddenDim, featDim) // Parameter(outdim, indim) \n"
L" W_bwd = if layer > 1 then Parameter(hiddenDim, hiddenDim) else Fail('no W_bwd') // input-to-hidden \n"
L" H_fwd = Parameter(hiddenDim, hiddenDim) // hidden-to-hidden \n"
L" H_bwd = Parameter(hiddenDim, hiddenDim) \n"
L" b = Parameter(hiddenDim, 1) // bias \n"
L" // shared part of activations (input connections and bias) \n"
L" z_shared[t:0..T-1] = (if layer > 1 \n"
L" then W_fwd * layers[layer - 1].h_fwd[t] + W_bwd * layers[layer - 1].h_bwd[t] \n"
L" else W_fwd * subframes[t] \n"
L" ) + b \n"
L" // recurrent part and non-linearity \n"
L" step(H, h, dt, t) = Sigmoid(if (t + dt >= 0 && t + dt < T) \n"
L" then z_shared[t] + H * h[t + dt] \n"
L" else z_shared[t]) \n"
L" h_fwd[t:0..T-1] = step(H_fwd, h_fwd, -1, t) \n"
L" h_bwd[t:0..T-1] = step(H_bwd, h_bwd, 1, t) \n"
L" ] \n"
L" // output layer --linear only at this point; Softmax is applied later \n"
L" outLayer = [ \n"
L" // model parameters \n"
L" W_fwd = Parameter(labelDim, hiddenDim) \n"
L" W_bwd = Parameter(labelDim, hiddenDim) \n"
L" b = Parameter(labelDim, 1) \n"
L" // output \n"
L" topHiddenLayer = layers[numHiddenLayers] \n"
L" centerT = Floor(T/2) \n"
L" z = W_fwd * topHiddenLayer.h_fwd[centerT] + W_bwd * topHiddenLayer.h_bwd[centerT] + b \n"
L" ] \n"
L" outZ = outLayer.z // we only want this one & don't care about the rest of this dictionary \n"
L" \n"
L" // define criterion nodes \n"
L" CE = CrossEntropyWithSoftmax(myLabels, outZ) \n"
L" Err = ErrorPrediction(myLabels, outZ) \n"
L" \n"
L" // define output node for decoding \n"
L" logPrior = LogPrior(myLabels) \n"
L" ScaledLogLikelihood = outZ - logPrior // before: Minus(CE.BFF.FF.P,logPrior,tag=Output) \n"
L"]\n",
L" \n" // this fails because dict is outside val; expression name is not local to it
L"do = Print(val) \n"
L"dict = [ outY = Input(13) ] ; val = new NDLComputationNetwork [ outZ = dict.outY \n"
L"]\n",
L"f(x,option='default') = Print(option); do = f(42,option='value')",
NULL};
let first = 0; // 0 for all
bool oneOnly = first > 0;
for (size_t i = first; parserTests[i]; i++)
{
fprintf(stderr, "\n### Test %d ###\n\n", (int) i), fflush(stderr);
let parserTest = parserTests[i];
let expr = ParseConfigDictFromString(standardFunctions + computationNodes + commonMacros + parserTest, L"Test", vector<wstring>());
//expr->Dump();
Do(expr);
if (oneOnly)
break;
}
}
catch (const ConfigException& err)
{
err.PrintError(L"error");
}
}
}}} // namespaces

View file

@ -6,24 +6,175 @@
//
##############################################################################
# standard functions
# Layer constructors
#
# A layer constructor is a stateful function that creates and returns an instance
# of a 'learnable function'. A learnable function is a function object that has
# learnable parameters baked into it, which get trained by SGD.
# Calling a layer constructor twice creates two instances with independent parameters.
#
# Learnable function instances can be applied to data or composed directly into
# more complex models. For example:
# // immediate usage:
# z = LinearLayer{9000}(h) # LinearLayer{9000} returns a new function object
# // composing multiple layers into a model
# model = Sequential ( DenseLayer{2048, activation=Sigmoid} : LinearLayer {9000} )
# z = model (features)
# // applying the same model to two inputs, with shared, jointly updated parameters
# f = DenseLayer{2048, activation=ReLU}
# z1 = f (feat1) ; z2 = f (feat2)
# The names are intentionally kept similar to other toolkits.
#
# Note that functions without parameters can be used as layers directly, e.g. Sigmoid.
##############################################################################
Print(value, format='') = new PrintAction [ what = value /*; how = format*/ ]
Fail(what) = new FailAction [ /*what*/ ]
Format(value, format) = new StringFunction [ what = 'Format' ; arg = value ; how = format ]
Replace(s, from, to) = new StringFunction [ what = 'Replace' ; arg = s ; replacewhat = from ; withwhat = to ]
Substr(s, begin, num) = new StringFunction [ what = 'Substr' ; arg = s ; pos = begin ; chars = num ]
Chr(c) = new StringFunction [ what = 'Chr' ; arg = c ]
Length(x) = new NumericFunction [ what = 'Length' ; arg = x ]
Sign(x) = if x > 0 then 1 else if x < 0 then -1 else 0
Min(a,b) = if a < b then a else b
Max(a,b) = if a > b then a else b
Fac(n) = if n > 1 then Fac(n-1) * n else 1
IsSameObject(a,b) = new CompareFunction [ what = 'IsSameObject' ; args = (a : b) ]
Mod(x, y) = new NumericFunction [ what = 'Mod' ; args = (x:y) ]
IntDiv(x, y) = new NumericFunction [ what = 'IntDiv' ; args = (x:y) ]
# LinearLayer -- create a fully-connected linear projection layer
# Note: outDim may describe a tensor as well.
LinearLayer {outDim} =
{
W = ParameterTensor {_ConcatArrays (outDim, 0), init='uniform'}
b = ParameterTensor {outDim, initValue=0}
outRank = Length (_AsArray (outDim)) # support outputs with tensor layouts
f(x) = Times (W, x, outputRank = outRank) + b
}.f
# DenseLayer -- create a fully-connected layer with optional non-linearity
DenseLayer{outDim, activation=(x=>x)} = Sequential ( LinearLayer{outDim} : activation )
# EmbeddingLayer -- create a linear embedding layer
EmbeddingLayer {outDim, # dimension of embedding
embeddingPath = '', transpose = false} = # load a fixed embedding from a path instead
{
shape = if transpose then (0 : outDim) else (outDim : 0)
E = if embeddingPath == ''
then ParameterTensor {shape, init='uniform'} # learnable
else ParameterTensor {shape, initFromFilePath = embeddingPath, learningRateMultiplier = 0} # fixed from file
TimesOp = if transpose then TransposeTimes else Times
f(x) = TimesOp (E, x) # x is expected to be sparse one-hot
}.f
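# Example usage (an illustrative sketch; 'glove.txt' is only a placeholder path):
#   embed    = EmbeddingLayer {300}                                # learnable embedding
#   embedFix = EmbeddingLayer {300, embeddingPath = 'glove.txt'}   # fixed embedding loaded from a file
#   e = embed (oneHotWord)                                         # input is expected to be sparse one-hot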
# ConvolutionalLayer -- create a convolution layer with optional non-linearity
#            [ (shifting dims) | (reduction dim) | (output dim) | (sample dims) ]
#   in     : [ (shifting dims) | (reduction dim) |              | (sample dims) ]
#   kernel : [ (filter dims)   | (reduction dim) | (output dim) |               ]
#   out    : [ (shifting dims) |                 | (output dim) | (sample dims) ]
ConvolutionalLayer {numOutputChannels, # e.g. (1) or BS.Constants.None
filterShape, # e.g. (3:3)
init = "uniform",
#reductionRank = 1, # TODO: support this
stride = 1, autoPadding = true,
#lowerPad = 0, upperPad = 0, # TODO: support this
#transpose = false, # TODO: support this
maxTempMemSizeInSamples = 0} =
{
reductionRank = 1 # TODO: shall become an optional parameter
outputChannelsShape = Repeat (1, numOutputChannels) # Repeat(1) turns a scalar into a 1-element array
outputRank = Length (outputChannelsShape)
kernelShape = _ConcatArrays (filterShape, Repeat (reductionRank, 0)) # append reduction dims to filter dims
W = ParameterTensor{_ConcatArrays (kernelShape, outputChannelsShape), init=init}
autoPaddingPadded = _ConcatArrays (_ForceResizeArray (Length (kernelShape), autoPadding), Repeat (reductionRank, false)) # set padding flags for reduction dims to false
sharing = false # TODO: support this
f(x) = Convolution (W, x, kernelShape, mapDims = numOutputChannels, stride = stride, sharing = sharing, autoPadding = autoPaddingPadded, maxTempMemSizeInSamples = maxTempMemSizeInSamples) # lowerPad/upperPad/transpose use the Convolution defaults until supported here
}.f
# MaxPoolingLayer, AveragePoolingLayer -- create a max- or average-pooling layer
_PoolingLayer {poolKind,      # "max" or "average"
filterShape,   # e.g. (3:3)
stride = 1, autoPadding = true,
lowerPad = 0, upperPad = 0} = # TODO: support lowerPad/upperPad
{
f(x) = Pooling (x, poolKind, filterShape, stride = stride, autoPadding = autoPadding, lowerPad = lowerPad, upperPad = upperPad)
}.f
MaxPoolingLayer {filterShape, stride = 1, autoPadding = true /*, lowerPad = 0, upperPad = 0*/} =
_PoolingLayer {"max", filterShape, stride = stride, autoPadding = autoPadding}
AveragePoolingLayer {filterShape, stride = 1, autoPadding = true /*, lowerPad = 0, upperPad = 0*/} =
_PoolingLayer {"average", filterShape, stride = stride, autoPadding = autoPadding}
# RecurrentLSTMLayer -- create an LSTM layer
RecurrentLSTMLayer {outDim,
cellDim = BS.Constants.None, # if set then use a projection
goBackwards = false,
enableSelfStabilization = false} =
{
cellShape = if BS.Constants.IsNone (cellDim) then outDim else cellDim
# BUGBUG: Calling f(x) twice will create a second set of parameters. Needs to refactor LSTM for this.
f(x) = BS.RNNs.RecurrentLSTMP (outDim, cellDim = cellShape,
x, inputDim = 0,
previousHook = if goBackwards then BS.RNNs.NextHC else BS.RNNs.PreviousHC,
enableSelfStabilization = enableSelfStabilization).h
}.f
# DropoutLayer -- create a drop-out layer
DropoutLayer {prob = BS.Constants.None} = if !BS.Constants.IsNone (prob) then Fail ("DropoutLayer: Dropout probability can currently not be specified per-layer.") else
{
f(x) = Dropout (x)
}.f
# BatchNormalizationLayer -- create a batch-normalization layer
BatchNormalizationLayer {spatialRank = 0, # reduce over these dims. E.g. 2 to reduce over (w,h) in a [W x H x C]-shaped input
initialScale = 1,
normalizationTimeConstant = 0, blendTimeConstant = 0,
epsilon = 0.00001, useCntkEngine = true} =
{
normShape = _ConcatArrays (Repeat (spatialRank, 1), 0) # spatial dims get a dimension of 1 (broadcasting, while all others are inferred from input)
scale = ParameterTensor{normShape, initValue = initialScale}
bias = ParameterTensor{normShape, initValue = 0}
runMean = ParameterTensor{normShape, initValue = 0, learningRateMultiplier = 0} # note: disable learning since these are updated differently
runInvStdDev = ParameterTensor{normShape, initValue = 0, learningRateMultiplier = 0}
f(x) = BatchNormalization (x, scale, bias, runMean, runInvStdDev, spatialRank > 0, normalizationTimeConstant = normalizationTimeConstant, blendTimeConstant = blendTimeConstant, epsilon = epsilon, useCntkEngine = useCntkEngine)
}.f
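# Example usage (illustrative): normalize a [W x H x C] convolution output over its two spatial dims:
#   bn = BatchNormalizationLayer {spatialRank = 2}
#   z  = bn (ConvolutionalLayer {64, (3:3)} (x))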
# LayerNormalizationLayer -- create a layer-normalization layer
LayerNormalizationLayer {dim = BS.Constants.None, initScale = 1, initBias = 0} = if BS.Constants.IsNone (dim) then Fail ("LayerNormalizationLayer: 'dim' parameter is currently required.") else
{
gain = ParameterTensor{(1), initValue = initScale}
bias = ParameterTensor{(1), initValue = initBias}
f(x) = {
div = Constant (1.0 / dim)
# normalize w.r.t. actual sample statistics
mean = div .* ReduceSum (x)
x0 = x - mean;
std = Sqrt (div .* ReduceSum (x0 .* x0))
xHat = ElementDivide (x0, std)
# denormalize with learned parameters
val = xHat .* gain + bias
}.val
}.f
# StabilizerLayer -- create a scalar stabilizer [J. Droppo, 2014 -- TODO: get the reference]
StabilizerLayer {} =
{
# BUGBUG: Calling f(x) twice will create a second set of parameters. Needs to refactor Stabilize() for this.
f(x) = Stabilize (x)
}.f
# FeatureMVNLayer -- create a corpus-level feature-normalization layer
# This can only be applied to features. Statistics are not shared across invocations,
# which is semantically OK because the values are the same. However, it is not efficient.
FeatureMVNLayer {} = MeanVarNorm
# Layers that exist in other tools that we will not have:
# FlattenLayer{}: Not needed since DenseLayer() can handle tensors just fine.
# Activation{}: Not needed since functions can be used directly.
##############################################################################
# Composing layers or models into more complex models
##############################################################################
# Sequential -- composite that applies a sequence of functions onto an input
Sequential (arrayOfFunctions) =
{
fs = _AsArray (arrayOfFunctions) # make sure it works with a single function that is not an array
Apply (x, N) = if N == 0 then x else fs[N-1](Apply (x, N-1)) # we do that recursively
f(x) = Apply (x, Length (fs))
}.f
Merge (arrayOfFunctions, combineFunction) =
if Length (arrayOfFunctions) != 2 then Fail ("Merge() is currently limited to binary functions.") else
{
f(x,y) = combineFunction (arrayOfFunctions[0](x), arrayOfFunctions[1](y))
}.f
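# Example usage (an illustrative sketch, mirroring the layer-constructor examples at the top of this file):
#   model = Sequential (DenseLayer{2048, activation=ReLU} : DenseLayer{2048, activation=ReLU} : LinearLayer{9000})
#   z = model (features)
#   combine = Merge ((DenseLayer{512} : DenseLayer{512}), Plus)   # binary merge: adds the two transformed inputs
#   zxy = combine (x, y)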
##############################################################################
# aliases
@ -51,9 +202,13 @@ Log = CNTK2.Log
Minus = CNTK2.Minus
Pass = CNTK2.Identity
Plus = CNTK2.Plus
RectifiedLinear = CNTK2.Relu
RectifiedLinear = CNTK2.ReLU # deprecated
ReLU = CNTK2.ReLU
ReduceSum = CNTK2.ReduceSum
ReduceLogSum = CNTK2.ReduceLogSum
ReduceMin = CNTK2.ReduceMin
ReduceMax = CNTK2.ReduceMax
Round = CNTK2.Round
Sigmoid = CNTK2.Sigmoid
@ -89,7 +244,7 @@ CNTK2 = [
// TODO: The API for Parameter is different in current 2.0 design, getting a constant as input for the initial values.
// This needs to be fixed to follow the way the Constant() is exposed in Python
// Making this an internal node with "_" until we agree on the final interface:
_Parameter(shape, value = 0, learningRateMultiplier = 1.0, init = 'uniform'/*|fixedValue|gaussian|fromFile|fromLiteral*/, initValueScale = 1, initFromFilePath = '', initFromLiteral = '', initOnCPUOnly=true, randomSeed=-1, tag='') = new ComputationNode [ operation = 'LearnableParameter' ; shape = new TensorShape [ /*shape */ ] /*plus the function args*/ ]
_Parameter(shape, value = 0, initValue = '', learningRateMultiplier = 1.0, init = ''/*|uniform|fixedValue|gaussian|fromFile|fromLiteral*/, initValueScale = 1, initFromFilePath = '', initFromLiteral = '', initOnCPUOnly=true, randomSeed=-1, tag='') = new ComputationNode [ operation = 'LearnableParameter' ; shape = new TensorShape [ /*shape */ ] /*plus the function args*/ ]
// 3. Shape operations
// Changes: NewReshape -> Reshape, input -> _, dims -> shape
@ -142,10 +297,12 @@ CNTK2 = [
Tanh(_, tag='') = new ComputationNode [ operation = 'Tanh' ; inputs = _ /*plus the function args*/ ]
// 6. Reductions
# the following is a temporary workaround until we have the C++ version
ReduceLogSum (_, axis=0, tag='') = if axis != 0 then Fail("ReduceLogSum for now only supports axis=0.")
else [ tag1=tag ; axis1=axis ; out = RowSlice (0, 1, _ - LogSoftmax (_), tag=tag1) ].out
ReduceSum (_, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = _ ; reductionOp = "Sum" /*plus the function args*/ ]
ReduceSum (_, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = _ ; reductionOp = "Sum" /*plus the function args*/ ]
ReduceLogSum(_, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = _ ; reductionOp = "LogSum" /*plus the function args*/ ]
ReduceMin (_, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = _ ; reductionOp = "Min" /*plus the function args*/ ]
ReduceMax (_, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = _ ; reductionOp = "Max" /*plus the function args*/ ]
#ReduceMean (_, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = _ ; reductionOp = "Mean" /*plus the function args*/ ]
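# e.g. (illustrative): with ReduceLogSum available, a softmax cross-entropy can be written as
#   ce = Minus (ReduceLogSum (z), TransposeTimes (labels, z))   # cf. CrossEntropyWithSoftmax_new below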
// 7. Control flow (if, composite etc.)
// None so far
@ -158,8 +315,9 @@ CNTK2 = [
PastValue(_, shape, timeStep = 1, defaultHiddenActivation = 0.1, tag='') = new ComputationNode [ operation = 'PastValue' ; inputs = _ ; shape = new TensorShape [ /*shape*/ ] /*plus the function args*/ ]
// 10. NN-specific operations
// Changes: input -> _, RectifiedLinear -> Relu. [Use Relu to arrive at relu() in snake_case]
Relu(_, tag='') = new ComputationNode [ operation = 'RectifiedLinear' ; inputs = _ /*plus the function args*/ ]
// Changes: input -> _, RectifiedLinear -> ReLU
ReLU(_, tag='') = new ComputationNode [ operation = 'RectifiedLinear' ; inputs = _ /*plus the function args*/ ]
Relu = ReLU // [Use Relu to arrive at relu() in snake_case]
Sigmoid(_, tag='') = new ComputationNode [ operation = 'Sigmoid' ; inputs = _ /*plus the function args*/ ]
Softmax(_, tag='') = new ComputationNode [ operation = 'Softmax' ; inputs = _ /*plus the function args*/ ]
Dropout(_, tag='') = new ComputationNode [ operation = 'Dropout' ; inputs = _ /*plus the function args*/ ]
@ -169,6 +327,10 @@ CNTK2 = [
// empirical sequence is compared to. Keeping this for now.
CrossEntropyWithSoftmax(_, outProbVectorSequence, tag='') = new ComputationNode [ operation = 'CrossEntropyWithSoftmax' ; inputs = (_ : outProbVectorSequence) /*plus the function args*/ ]
ErrorPrediction(_, outVectorSequence, topN=1, tag='') = new ComputationNode [ operation = 'ErrorPrediction' ; inputs = if topN == 1 then (_ : outVectorSequence) else (_ : outVectorSequence : Constant (topN)) /*plus the function args*/ ]
# TODO: replace with this (need to deal with topN thing):
# (_new will be removed once the change is made)
CrossEntropyWithSoftmax_new (L, z, tag='') = Minus (ReduceLogSum (z), TransposeTimes (L, z), tag=tag)
ErrorPrediction_new (L, z, tag='') = Minus (BS.Constants.One, TransposeTimes (L, Hardmax (z)), tag=tag)
// 12. Comparison nodes
Less(_, y, tag='') = new ComputationNode [ operation = 'Less' ; inputs = (_ : y) /*plus the function args*/ ]
@ -182,11 +344,21 @@ CNTK2 = [
Identity(_, tag='') = new ComputationNode [ operation = 'Pass' ; inputs = _ /*plus the function args*/ ]
]
LearnableParameter (outputDim, inputDim, learningRateMultiplier = 1.0, init = 'uniform'/*|fixedValue|gaussian|fromFile|fromLiteral*/, initValueScale = 1, value = 0, initFromFilePath = '', initFromLiteral = '', initOnCPUOnly=true, randomSeed=-1, tag='') = new ComputationNode [ operation = 'LearnableParameter' ; shape = new TensorShape [ dims = (outputDim : inputDim) ] /*plus the function args*/ ]
Parameter = LearnableParameter // deprecated
# Parameter{} can do several forms of initialization. It is no longer required to say 'init="kind"', so we can clean these up a bit.
# - initValue=scalar, value=array --> initialize from this value --array form not implemented yet
# - initFromFilePath="..." --> read from a data file
# - init="uniform|gaussian" (random init scaled by initValueScale). Warning: This has magic scaling factors. TODO: document them here
# - init="zero"
# deprecated:
# - initFromLiteral="..." (deprecated) --> parse a string literal (obsolete with value=array form)
# - init="fixedValue", value from 'value'
# Warning: Current config will behave unexpectedly if the user mistypes 'initValue' as 'value' (which will be ignored, defaulting to "uniform" init)
Parameter {outputDim, inputDim, learningRateMultiplier = 1.0, init = ''/*|uniform|fixedValue|gaussian|fromFile|fromLiteral*/, initValueScale = 1, value = 0/*deprecated*/, initValue = '', initFromFilePath = '', initFromLiteral = ''/*deprecated*/, initOnCPUOnly=true, randomSeed=-1, tag=''} = new ComputationNode [ operation = 'LearnableParameter' ; shape = new TensorShape [ dims = (outputDim : inputDim) ] /*plus the function args*/ ]
LearnableParameter = Parameter // deprecated
# TODO: make Parameter take tensor dims?
ParameterTensor(dims, learningRateMultiplier = 1.0, init = 'uniform'/*|fixedValue|gaussian|fromFile|fromLiteral*/, initValueScale = 1, value = 0, initFromFilePath = '', initFromLiteral = '', initOnCPUOnly=true, randomSeed=-1, tag='') = new ComputationNode [ operation = 'LearnableParameter' ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ]
ConstantFromString(literal, tag='') = ParameterTensor((0)/*dim, will be inferred*/, init = 'fromLiteral', initFromLiteral = literal, learningRateMultiplier = 0.0)
ParameterTensor {dims, learningRateMultiplier = 1.0, init = ''/*|uniform|fixedValue|gaussian|fromFile|fromLiteral*/, initValueScale = 1, value = 0, initValue = '', initFromFilePath = '', initFromLiteral = '', initOnCPUOnly=true, randomSeed=-1, tag=''} = new ComputationNode [ operation = 'LearnableParameter' ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ]
ConstantFromString(literal, tag='') = ParameterTensor((0)/*dim, will be inferred*/, initFromLiteral = literal, learningRateMultiplier = 0.0)
# TODO: Deprecate ConstantFromString() in favor of Constant(array expression)
DynamicAxis(tag='') = new ComputationNode [ operation = 'DynamicAxis' ; /*plus the function args*/ ]
Input(dims, dynamicAxis='', tag='feature') = new ComputationNode [ operation = 'InputValue' ; shape = new TensorShape [ /*dims*/ ] ; isImage = false /*plus the function args*/ ]
# TODO: change from dynamicAxis by name to dynamicAxis being an actual object
@@ -195,8 +367,8 @@ ImageInput(imageWidth, imageHeight, imageChannels, imageLayout='CHW', dynamicAxi
SparseImageInput(imageWidth, imageHeight, imageChannels, imageLayout='CHW', dynamicAxis='', tag='feature') = new ComputationNode [ operation = 'SparseInputValue' ; isImage = true /*plus the function args*/ ]
EnvironmentInput(propertyName, tag='') = new ComputationNode [ operation = 'EnvironmentInput' /*plus the function args*/ ]
# TODO: make 'dims' the first parameter, think ConstantTensor<dims> (val)
ConstantTensor(val, dims, tag='') = ParameterTensor(dims, learningRateMultiplier = 0, init = 'fixedValue', value = val)
Constant(val, rows = 1, cols = 1, tag='') = Parameter(rows, cols, learningRateMultiplier = 0, init = 'fixedValue', value = val)
ConstantTensor(val, dims, tag='') = ParameterTensor(dims, learningRateMultiplier = 0, initValue = val)
Constant(val, rows = 1, cols = 1, tag='') = Parameter(rows, cols, learningRateMultiplier = 0, initValue = val)
PastValue(dims, input, timeStep = 1, defaultHiddenActivation = 0.1, tag='') = new ComputationNode [ operation = 'PastValue' ; inputs = input ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ]
FutureValue(dims, input, timeStep = 1, defaultHiddenActivation = 0.1, tag='') = new ComputationNode [ operation = 'FutureValue' ; inputs = input ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ]
Shift(input, fromOffset, boundaryValue, boundaryMode=-1/*context*/, dim=-1, tag='') = new ComputationNode [ operation = 'Shift' ; inputs = (input : boundaryValue) /*plus the function args*/ ]
@@ -227,7 +399,7 @@ WeightedLogistic(label, probability, instanceWeight, tag='') = new ComputationNo
ReconcileDynamicAxis(dataInput, layoutInput, tag='') = new ComputationNode [ operation = 'ReconcileDynamicAxis' ; inputs = (dataInput : layoutInput) /*plus the function args*/ ]
ReconcileMBLayout = ReconcileDynamicAxis # back compat
CastAs (type, data) = ReconcileDynamicAxis (data, type) # read as CastAs<type>(data) where the cast may consist of rearranging the data w.r.t. MBLayout or broadcasting across sequence items
Convolution(weightNode, inputValueNode, kernelDims, mapDims = 1, stride = 1, sharing = true, autoPadding = true, lowerPad = 0, upperPad = 0, transpose=false, imageLayout='CHW', maxTempMemSizeInSamples = 0, tag='') = new ComputationNode [ operation = 'Convolution' ; inputs = (weightNode : inputValueNode); kernelShape = new TensorShape [ dims = kernelDims ] ; mapCount = new TensorShape [ dims = mapDims ] ; strideShape = new TensorShape [ dims = stride ] ; dimSharing = new BoolVector [ items = sharing ] ; dimPadding = new BoolVector [ items = autoPadding ] ; dimPadLower = new TensorShape [ dims = lowerPad ] ; dimPadUpper = new TensorShape [ dims = upperPad ] /*plus the function args*/ ]
Convolution(weightNode, inputValueNode, kernelDims, mapDims = 0, stride = 1, sharing = true, autoPadding = true, lowerPad = 0, upperPad = 0, transpose=false, imageLayout='CHW', maxTempMemSizeInSamples = 0, tag='') = new ComputationNode [ operation = 'Convolution' ; inputs = (weightNode : inputValueNode); kernelShape = new TensorShape [ dims = kernelDims ] ; mapCount = new TensorShape [ dims = mapDims ] ; strideShape = new TensorShape [ dims = stride ] ; dimSharing = new BoolVector [ items = sharing ] ; dimPadding = new BoolVector [ items = autoPadding ] ; dimPadLower = new TensorShape [ dims = lowerPad ] ; dimPadUpper = new TensorShape [ dims = upperPad ] /*plus the function args*/ ]
# ND pooling/unpooling
Pooling(input, poolKind/*'max'|'average'*/, kernelDims, stride=1, autoPadding = true, lowerPad = 0, upperPad = 0, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'Pooling' ; inputs = (input); pool = poolKind ; kernelShape = new TensorShape [ dims = kernelDims ] ; strideShape = new TensorShape [ dims = stride ] ; dimPadding = new BoolVector [ items = autoPadding ] ; dimPadLower = new TensorShape [ dims = lowerPad ] ; dimPadUpper = new TensorShape [ dims = upperPad ] /*plus the function args*/ ]
MaxUnpooling(unpoolInput, poolInput, kernelDims, stride=1, autoPadding = true, lowerPad = 0, upperPad = 0, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'MaxUnpooling' ; inputs = (unpoolInput : poolInput); kernelShape = new TensorShape [ dims = kernelDims ] ; strideShape = new TensorShape [ dims = stride ] ; dimPadding = new BoolVector [ items = autoPadding ] ; dimPadLower = new TensorShape [ dims = lowerPad ] ; dimPadUpper = new TensorShape [ dims = upperPad ] /*plus the function args*/ ]
@@ -264,13 +436,10 @@ Mean(dataVectorSequence, tag='') = new ComputationNode [ operation = 'Mean' ; in
Negate(input, tag='') = new ComputationNode [ operation = 'Negate' ; inputs = input /*plus the function args*/ ]
PackedIndex(targetObject, indexSequence, tag='') = new ComputationNode [ operation = 'PackedIndex' ; inputs = (targetObject : indexSequence) /*plus the function args*/ ]
PerDimMeanVarDeNormalization(dataVectorSequence, meanVector, invStdDevVector, tag='') = new ComputationNode [ operation = 'PerDimMeanVarDeNormalization' ; inputs = (dataVectorSequence : meanVector : invStdDevVector) /*plus the function args*/ ]
PerDimMeanVarNormalization(dataVectorSequence, meanVector, invStdDevVector, tag='') = new ComputationNode [ operation = 'PerDimMeanVarNormalization' ; inputs = (dataVectorSequence : meanVector : invStdDevVector) /*plus the function args*/ ]
#PerDimMeanVarNormalization(dataVectorSequence, meanVector, invStdDevVector, tag='') = new ComputationNode [ operation = 'PerDimMeanVarNormalization' ; inputs = (dataVectorSequence : meanVector : invStdDevVector) /*plus the function args*/ ]
PerDimMeanVarNormalization (x, mean, invStdDev) = (x - mean) .* invStdDev
Reciprocal(z, tag='') = new ComputationNode [ operation = 'Reciprocal' ; inputs = z /*plus the function args*/ ]
//# the following is a temporary workaround until we have the C++ version
#ReduceLogSum (z, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = z ; reductionOp = "LogSum" /*plus the function args*/ ]
#ReduceMean (z, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = z ; reductionOp = "Mean" /*plus the function args*/ ]
#ReduceMax (z, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = z ; reductionOp = "Max" /*plus the function args*/ ]
#ReduceMin (z, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = z ; reductionOp = "Min" /*plus the function args*/ ]
Scale(scalarScalingFactor, matrix, tag='') = new ComputationNode [ operation = 'Scale' ; inputs = (scalarScalingFactor : matrix) /*plus the function args*/ ]
# TODO: Scale = ElementTimes
ScatterPacked(cond, indexSequence, sourceData, tag='') = new ComputationNode [ operation = 'ScatterPacked' ; inputs = (cond : indexSequence : sourceData) /*plus the function args*/ ]
@@ -300,15 +469,52 @@ TransposeTimes(leftMatrix, rightMatrix, tag='') = new ComputationNode [ operatio
Where(cond, tag='') = new ComputationNode [ operation = 'Where' ; inputs = cond /*plus the function args*/ ]
##############################################################################
# common macros
# non-neural-network functions
##############################################################################
BFF(in, rows, cols) = [ B = Parameter(rows, 1, init = 'fixedValue', value = 0) ; W = Parameter(rows, cols) ; z = W*in+B ]
Print(value, format='') = new PrintAction [ what = value /*; how = format*/ ]
Fail(what) = new FailAction [ /*what*/ ]
Format(value, format) = new StringFunction [ what = 'Format' ; arg = value ; how = format ]
Replace(s, from, to) = new StringFunction [ what = 'Replace' ; arg = s ; replacewhat = from ; withwhat = to ]
Substr(s, begin, num) = new StringFunction [ what = 'Substr' ; arg = s ; pos = begin ; chars = num ]
Chr(c) = new StringFunction [ what = 'Chr' ; arg = c ]
Length(x) = new NumericFunction [ what = 'Length' ; arg = x ]
Repeat (N, what) = if N <= 0 then BS.Constants.None else (Repeat (N-1, what) : what) # can also be used to turn a scalar into a 1-element array
_ForceResizeArray (N, arrayOrScalar) = { # bring an array to a given length, either by chopping or by duplicating its last value
arr = _AsArray (arrayOrScalar)
L = Length (arr)
res = if N < L then array[0..N-1] (i => arr[i]) # chop to length
else if L == 0 then Fail ("_ForceResizeArray(): needs at least one element to expand.")
else _ConcatArrays (arr, Repeat (N-L, arr[L-1])) # append copies of the last value
}.res
_AsArray (x) = if IsArray (x) then x else [| x |] # helper to allow dimensions to describe scalars (42) or tensors (13:42)
_ConcatArrays (aOrScalar, bOrScalar) = {
a = _AsArray (aOrScalar) ; b = _AsArray (bOrScalar)
newLen = Length (a)+Length(b)
res = if newLen == 0 then BS.Constants.None else array[0..newLen-1] (i => if i < Length (a) then a[i] else b[i-Length (a)])
}.res
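# (Illustrative examples of the helpers above: _ForceResizeArray (4, (2:3)) yields (2:3:3:3), _ForceResizeArray (1, (2:3)) yields (2),
#  and _ConcatArrays ((1:2), 3) yields (1:2:3); scalars are first wrapped into 1-element arrays via _AsArray.)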
Sign(x) = if x > 0 then 1 else if x < 0 then -1 else 0
Min(a,b) = if a < b then a else b
Max(a,b) = if a > b then a else b
Fac(n) = if n > 1 then Fac(n-1) * n else 1
IsSameObject(a,b) = new CompareFunction [ what = 'IsSameObject' ; args = (a : b) ]
IsArray(a) = new CompareFunction [ what = 'IsArray' ; args = a ]
Mod(x, y) = new NumericFunction [ what = 'Mod' ; args = (x:y) ]
IntDiv(x, y) = new NumericFunction [ what = 'IntDiv' ; args = (x:y) ]
##############################################################################
# macros from NDL book
##############################################################################
BFF(in, rows, cols) = [ B = Parameter(rows, 1, initValue = 0) ; W = Parameter(rows, cols) ; z = W*in+B ]
SBFF(in, rows, cols) = [ Eh = Sigmoid(BFF(in, rows, cols).z) ]
MeanVarNorm(feat) = PerDimMeanVarNormalization(feat, Mean(feat), InvStdDev(feat))
LogPrior(labels) = Log(Mean(labels))
Embedding (embeddingDim, input, inputDim=input.dim, initFrom='fromFile'/*|gaussian|uniform*/, embeddingPath = '', sparseInput = false, learningRateWeight = 0.0) = [
# specify one of these two for initialization:
# - initFrom = "uniform"|"gaussian"
# - embeddingPath = PATHNAME
Embedding (embeddingDim, input, inputDim=input.dim, initFrom=''/*|fromFile|gaussian|uniform*/, embeddingPath = '', sparseInput = false, learningRateWeight = 0.0) = [
embedding = Transpose (LearnableParameter (inputDim, embeddingDim, learningRateMultiplier = learningRateWeight, init = initFrom, initFromFilePath = embeddingPath))
lookup = if sparseInput then embedding * input
else GatherPacked (input, embedding)
@@ -341,7 +547,7 @@ Constants = [
# is this like Sequences.Repeat?
True = 1
False = 0
None = ConstantTensor (42, (1))
None = [| |] # doubles up as an empty array. Note: only use [| |] syntax inside here, as it may change in the future
IsNone (x) = IsSameObject (x, None)
]
@@ -553,7 +759,7 @@ Parameters =
[
WeightParam (outputDim, inputDim) = Parameter (outputDim, inputDim, init='uniform', initValueScale=1, initOnCPUOnly=true, randomSeed=1)
DiagWeightParam (outputDim) = ParameterTensor ((outputDim), init='uniform', initValueScale=1, initOnCPUOnly=true, randomSeed=1) # meant to be applied elementwise
BiasParam (dim) = ParameterTensor ((dim), init='fixedValue', value=0.0)
BiasParam (dim) = ParameterTensor ((dim), initValue=0.0)
ScalarParam() = BiasParam (1)
# route input through an extra weight, for stabilization
@@ -561,16 +767,16 @@ Parameters =
if enabled
then [
#beta = Exp (BiasParam ((inputDim))) # init value is 0
#beta = ParameterTensor ((inputDim), init='fixedValue', value=1.0) # init value is 1
#beta = ParameterTensor ((inputDim), initValue=1.0) # init value is 1
# or SoftPlus: ln(1+e^beta)
#beta = Log (Constants.One + Exp (ParameterTensor ((inputDim), init='fixedValue', value=0.54132485/*ln (e-1)*/))) # init value is 1
#beta = Log (Constants.One + Exp (ParameterTensor ((inputDim), initValue=0.54132485/*ln (e-1)*/))) # init value is 1
# sharpened Softplus: 1/f ln(1+e^{f*beta})
# this behaves linear for weights around 1, yet guarantees positiveness
f = ConstantTensor (4, (1))
fInv = Reciprocal (f)
beta = fInv .* Log (Constants.One + Exp (f .* ParameterTensor ((inputDim), init='fixedValue', value=0.99537863/* 1/f*ln (e^f-1) */))) # init value is 1
beta = fInv .* Log (Constants.One + Exp (f .* ParameterTensor ((inputDim), initValue=0.99537863/* 1/f*ln (e^f-1) */))) # init value is 1
TraceDense (h, what) = h # delete h and uncomment Trace to trace the beta values. They are a valuable indicator.
//Trace (h, say=what, logFirst=10, logFrequency=100, logGradientToo=false, onlyUpToRow=9, onlyUpToT=25, format=[ type = "real" ; transpose = false ; precisionFormat = ".6" ])
@@ -1033,6 +1239,7 @@ Seq2Seq =
Network = [
Load(pathName) = new ComputationNetworkFromFile [ /*pathName; also needs 'precision' somewhere*/ ]
CloneFunction (inputNodes, outputNodes, parameters="learnable" /*|"constant"|"shared"*/) = new CloneFunctionConfigLambda [ /*args*/ ]
Edit(inputModel, editFunctions, additionalRoots) = new ComputationNetworkWithEdits [ /*inputModel, editFunctions, additionalRoots*/ ]
Editing = [

@@ -8,6 +8,10 @@
#define _CRT_NONSTDC_NO_DEPRECATE // make VS accept POSIX functions without _
#include "stdafx.h"
#ifdef _WIN32
#include <crtdbg.h>
#endif
#include "Basics.h"
#include "Actions.h"
#include "ComputationNetwork.h"
@@ -18,6 +22,7 @@
#include "NDLNetworkBuilder.h"
#include "ModelEditLanguage.h"
#include "CPUMatrix.h" // used for SetNumThreads()
#include "GPUMatrix.h" // used for SyncGuard::EnableSync()
#include "CommonMatrix.h"
#include "SGD.h"
#include "MPIWrapper.h"
@@ -440,11 +445,6 @@ static wstring PathToBSStringLiteral(const wstring& path) // quote a pathname fo
return L'"' + path + L'"';
}
// TODO: decide where these should go. Also, do we need three variables?
//extern wstring standardFunctions;
//extern wstring commonMacros;
//extern wstring computationNodes;
int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapper that catches & reports Win32 exceptions
{
vector<wstring> args(argv, argv + argc);
@@ -488,7 +488,6 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
bs += L"include \'cntk.core.bs'"; // start with including the standard macros
// Note: Using lowercase ^^ here to match the Linux name of the CNTK exe.
//bs += standardFunctions + computationNodes + commonMacros + L"\n";
for (const auto& sourceFile : sourceFiles)
bs += L"include " + PathToBSStringLiteral(sourceFile) + L"\n";
bs += L"\n]\n";
@@ -538,6 +537,10 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
TracingGPUMemoryAllocator::SetTraceLevel(config(L"traceGPUMemoryAllocations", 0));
bool synchronizeCUDAKernelExecutions = config(L"synchronizeCUDAKernelExecutions", false);
if (synchronizeCUDAKernelExecutions)
SyncGuard::EnableSync();
// logging
wstring logpath = config(L"stderr", L"");
if (logpath != L"")
@@ -581,13 +584,11 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
if (actionsVal.Is<ScriptableObjects::ConfigArray>())
{
const ScriptableObjects::ConfigArray& actions = actionsVal;
for (int i = actions.GetIndexRange().first; i <= actions.GetIndexRange().second; i++)
for (int i = actions.GetIndexBeginEnd().first; i < actions.GetIndexBeginEnd().second; i++)
{
// TODO: When running in parallel with MPI, only commands in 'commandstoRunOnAllRanks' should
// be run in parallel across multiple ranks. Others should only run on rank 0
actions.At(i, [](const wstring&)
{
}); // this will evaluate and thus execute the action
actions.At(i, [](const wstring&){}); // this will evaluate and thus execute the action
}
}
// else action has already been executed, see comment above
@@ -823,15 +824,38 @@ static void LogDelayLoadError(PEXCEPTION_POINTERS pExcPointers)
}
}
#if _DEBUG
// in case of asserts in debug mode, print the message into stderr and throw exception
int HandleDebugAssert(int, // reportType - ignoring reportType, printing message and aborting for all reportTypes
char *message, // message - fully assembled debug user message
int * returnValue) // returnValue - retVal value of zero continues execution
{
fprintf(stderr, "C-Runtime: %s\n", message);
if (returnValue) {
*returnValue = 0; // return value of 0 will continue operation and NOT start the debugger
}
return TRUE; // returning TRUE will make sure no message box is displayed
}
#endif
int wmain(int argc, wchar_t* argv[]) // wmain wrapper that reports Win32 exceptions
{
set_terminate(TerminateThis); // insert a termination handler to ensure stderr gets flushed before actually terminating
_set_error_mode(_OUT_TO_STDERR); // make sure there are no CRT prompts when CNTK is executing
// Note: this does not seem to work--processes with this seem to just hang instead of terminating
__try
{
return wmain1(argc, argv);
// in case of asserts in debug mode, print the message into stderr and throw exception
if (_CrtSetReportHook2(_CRT_RPTHOOK_INSTALL, HandleDebugAssert) == -1) {
LOGPRINTF(stderr, "CNTK: _CrtSetReportHook2 failed.\n");
return -1;
}
int mainReturn = wmain1(argc, argv);
_CrtSetReportHook2(_CRT_RPTHOOK_REMOVE, HandleDebugAssert);
return mainReturn;
}
__except (LogDelayLoadError(GetExceptionInformation()), EXCEPTION_EXECUTE_HANDLER)
{

@@ -81,7 +81,7 @@
<StackReserveSize>100000000</StackReserveSize>
</Link>
<PreBuildEvent>
<Command>prebuild.bat "$(Configuration)" "$(CudaPath)"</Command>
<Command>prebuild.bat "$(Configuration)" "$(CNTK_MKL)" "$(CNTK_MKL_SEQUENTIAL)" "$(CNTK_ENABLE_1BitSGD)" "$(CudaPath)" "$(CUDNN_PATH)" "$(CUB_PATH)"</Command>
</PreBuildEvent>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(ReleaseBuild)">
@@ -109,7 +109,7 @@
<StackReserveSize>100000000</StackReserveSize>
</Link>
<PreBuildEvent>
<Command>prebuild.bat "$(Configuration)" "$(CudaPath)"</Command>
<Command>prebuild.bat "$(Configuration)" "$(CNTK_MKL)" "$(CNTK_MKL_SEQUENTIAL)" "$(CNTK_ENABLE_1BitSGD)" "$(CudaPath)" "$(CUDNN_PATH)" "$(CUB_PATH)"</Command>
</PreBuildEvent>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(CpuOnlyBuild)">
@@ -144,6 +144,7 @@
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\Common\CrossProcessMutex.h" />
<ClInclude Include="..\Common\Include\basetypes.h" />
<ClInclude Include="..\Common\Include\Basics.h" />
<ClInclude Include="..\Common\Include\BestGpu.h" />
<ClInclude Include="..\Common\Include\DataReader.h" />
@@ -199,7 +200,6 @@
<ItemGroup>
<ClCompile Include="BrainScript\BrainScriptEvaluator.cpp" />
<ClCompile Include="BrainScript\BrainScriptParser.cpp" />
<ClCompile Include="BrainScript\BrainScriptTest.cpp" />
<ClCompile Include="CNTK.cpp" />
<ClCompile Include="ModelEditLanguage.cpp" />
<ClCompile Include="stdafx.cpp" />
@@ -222,4 +222,4 @@
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets" />
</Project>
</Project>

@@ -1,18 +1,6 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<ClCompile Include="..\Common\DataReader.cpp">
<Filter>Common</Filter>
</ClCompile>
<ClCompile Include="..\Common\DataWriter.cpp">
<Filter>Common</Filter>
</ClCompile>
<ClCompile Include="..\Common\File.cpp">
<Filter>Common</Filter>
</ClCompile>
<ClCompile Include="..\Common\fileutil.cpp">
<Filter>Common</Filter>
</ClCompile>
<ClCompile Include="ModelEditLanguage.cpp">
<Filter>Model Editing</Filter>
</ClCompile>
@@ -22,34 +10,13 @@
<ClCompile Include="tests.cpp">
<Filter>Misc</Filter>
</ClCompile>
<ClCompile Include="..\Common\TimerUtility.cpp">
<Filter>Common</Filter>
</ClCompile>
<ClCompile Include="CNTK.cpp" />
<ClCompile Include="..\Common\MPIWrapper.cpp">
<Filter>MPI Interfacing</Filter>
</ClCompile>
<ClCompile Include="..\Common\Include\ConcStack.h">
<Filter>Common\Include</Filter>
</ClCompile>
<ClCompile Include="..\Common\Config.cpp">
<Filter>Common</Filter>
</ClCompile>
<ClCompile Include="BrainScript\BrainScriptEvaluator.cpp">
<Filter>BrainScript</Filter>
</ClCompile>
<ClCompile Include="BrainScript\BrainScriptParser.cpp">
<Filter>BrainScript</Filter>
</ClCompile>
<ClCompile Include="BrainScript\BrainScriptTest.cpp">
<Filter>BrainScript</Filter>
</ClCompile>
<ClCompile Include="..\Common\ExceptionWithCallStack.cpp">
<Filter>Common</Filter>
</ClCompile>
<ClCompile Include="..\Common\CompositeDataReader.cpp">
<Filter>Common</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\Common\Include\fileutil.h">
@@ -205,9 +172,8 @@
<ClInclude Include="..\Readers\ReaderLib\Transformer.h">
<Filter>from ReaderLib</Filter>
</ClInclude>
<ClInclude Include="..\Common\Include\CompositeDataReader.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="..\Common\Include\basetypes.h" />
<ClInclude Include="..\Readers\CompositeDataReader\CompositeDataReader.h" />
</ItemGroup>
<ItemGroup>
<Text Include="modelEditor.txt">

@@ -591,7 +591,7 @@ void MELScript<ElemType>::CallFunction(const std::string& p_name, const ConfigPa
std::string paramPath = params[1];
NetNdl<ElemType>* netNdl;
vector<ComputationNodeBasePtr> nodes = FindSymbols(params[0], netNdl);
vector<ComputationNodeBasePtr> nodes = FindSymbols(nodeName, netNdl);
for (auto& pNodes : nodes)
{

@@ -180,7 +180,7 @@ public:
auto nodePtr = builder.CreateLearnableParameter(name, 1, 1);
ndlNode->SetEvalValue(nodePtr.get());
ElemType val = ndlNode->GetScalar();
nodePtr->Value().SetValue(val);
cn->InitLearnableParameters(nodePtr, L"fixedValue", val);
}
}
}

@@ -7,10 +7,23 @@ setlocal enableDelayedexpansion
::: for full license information.
::: ==============================================================================
:::
::: This is called as a pre-build step for the CNTK executable.
::: It receives the build's configuration, $(Configuration), as the first parameter.
::: This is called as a pre-build step for the CNTK executable, taking parameters below.
::: It creates buildinfo.h, which makes version information available to the executable itself.
:: Grab the parameters
::
:: Note: don't rely on environment variables, since properties may have been
:: overridden at msbuild invocation. By convention, we let parameters start with p_, locals with l_.
:: A Vim search for [%!]\([lp]_\)\@!\w\+[%!:] should only match
:: well-known (non-CNTK-specific) environment variables.
set p_Configuration=%~1
set p_CNTK_MKL=%~2
set p_CNTK_MKL_SEQUENTIAL=%~3
set p_CNTK_ENABLE_1BitSGD=%~4
set p_CudaPath=%~5
set p_CUDNN_PATH=%~6
set p_CUB_PATH=%~7
echo #ifndef _BUILDINFO_H > buildinfo.h$$
echo #define _BUILDINFO_H >> buildinfo.h$$
@@ -23,19 +36,19 @@ if not errorlevel 1 (
call git --version > NUL 2>&1
if not errorlevel 1 (
echo #define _GIT_EXIST >> buildinfo.h$$
FOR /F %%i IN ('call git rev-parse --abbrev-ref HEAD') DO SET BRANCH=%%i
FOR /F %%i IN ('call git rev-parse HEAD') DO SET COMMIT=%%i
set STATUS=
FOR /F %%i IN ('call git rev-parse --abbrev-ref HEAD') DO SET l_BRANCH=%%i
FOR /F %%i IN ('call git rev-parse HEAD') DO SET l_COMMIT=%%i
set l_STATUS=
call git diff --quiet --cached
if not errorlevel 1 call git diff --quiet
if errorlevel 1 set STATUS= ^(modified^)
echo #define _BUILDBRANCH_ "!BRANCH!" >> buildinfo.h$$
echo #define _BUILDSHA1_ "!COMMIT!!STATUS!">> buildinfo.h$$
if errorlevel 1 set l_STATUS= ^(modified^)
echo #define _BUILDBRANCH_ "!l_BRANCH!" >> buildinfo.h$$
echo #define _BUILDSHA1_ "!l_COMMIT!!l_STATUS!">> buildinfo.h$$
)
)
if "%CNTK_MKL%" == "1" (
if "%CNTK_MKL_SEQUENTIAL%" == "1" (
if "%p_CNTK_MKL%" == "1" (
if "%p_CNTK_MKL_SEQUENTIAL%" == "1" (
echo #define _MATHLIB_ "mkl-sequential">> buildinfo.h$$
) else (
echo #define _MATHLIB_ "mkl">> buildinfo.h$$
@@ -49,42 +62,40 @@ echo #define _BUILDER_ "%USERNAME%" >> buildinfo.h$$
echo #define _BUILDER_ "%USERNAME%" >> buildinfo.h$$
echo #define _BUILDMACHINE_ "%HOST%" >> buildinfo.h$$
set scriptpath=%~dp0
set buildpath="%scriptpath:\=\\%"
echo #define _BUILDPATH_ %buildpath% >> buildinfo.h$$
set l_scriptpath=%~dp0
set l_buildpath="%l_scriptpath:\=\\%"
echo #define _BUILDPATH_ %l_buildpath% >> buildinfo.h$$
set build_type=Unknown
set build_target=Unknown
set l_build_type=Unknown
set l_build_target=Unknown
:: Configuration property provided by CNTK.vcxproj
if /i "%~1" == "Debug" set build_type=Debug&set build_target=GPU
if /i "%~1" == "Debug_CpuOnly" set build_type=Debug&set build_target=CPU-only
if /i "%~1" == "Release" set build_type=Release&set build_target=GPU
if /i "%~1" == "Release_CpuOnly" set build_type=Release&set build_target=CPU-only
if /i "%p_Configuration%" == "Debug" set l_build_type=Debug&set l_build_target=GPU
if /i "%p_Configuration%" == "Debug_CpuOnly" set l_build_type=Debug&set l_build_target=CPU-only
if /i "%p_Configuration%" == "Release" set l_build_type=Release&set l_build_target=GPU
if /i "%p_Configuration%" == "Release_CpuOnly" set l_build_type=Release&set l_build_target=CPU-only
echo #define _BUILDTYPE_ "%build_type%">> buildinfo.h$$
echo #define _BUILDTARGET_ "%build_target%">> buildinfo.h$$
echo #define _BUILDTYPE_ "%l_build_type%">> buildinfo.h$$
echo #define _BUILDTARGET_ "%l_build_target%">> buildinfo.h$$
if "%CNTK_ENABLE_1BitSGD%" == "true" (
if "%p_CNTK_ENABLE_1BitSGD%" == "true" (
echo #define _WITH_1BITSGD_ "yes">>buildinfo.h$$
) else (
echo #define _WITH_1BITSGD_ "no">>buildinfo.h$$
)
if not %build_target% == CPU-only (
:: CudaPath property provided by CNTK.vcxproj
if "%~2%" == "" (
if not %l_build_target% == CPU-only (
if "%p_CudaPath%" == "" (
echo #define _CUDA_PATH_ "NOT_DEFINED" >> buildinfo.h$$
) else (
set cudaPathTemp=%~2
echo #define _CUDA_PATH_ "!cudaPathTemp:\=\\!" >> buildinfo.h$$
echo #define _CUDA_PATH_ "!p_CudaPath:\=\\!" >> buildinfo.h$$
)
if not "%cudnn_path%" == "" (
echo #define _CUDNN_PATH_ "%cudnn_path:\=\\%" >> buildinfo.h$$
if not "%p_CUDNN_PATH%" == "" (
echo #define _CUDNN_PATH_ "%p_CUDNN_PATH:\=\\%" >> buildinfo.h$$
)
if not "%cub_path%" == "" (
echo #define _CUB_PATH_ "%cub_path:\=\\%" >> buildinfo.h$$
if not "%p_CUB_PATH%" == "" (
echo #define _CUB_PATH_ "%p_CUB_PATH:\=\\%" >> buildinfo.h$$
)
)

@@ -153,6 +153,11 @@ namespace CNTK
static const size_t InferredDimension = (size_t)-1;
public:
///
/// Construct an NDShape with 0 axes, which denotes a scalar.
///
NDShape() {}
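/// E.g. (illustrative): NDShape scalarShape; // scalarShape.NumAxes() == 0, i.e. a scalar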
///
/// Construct an NDShape instance with the specified number of axes and the dimensionality of each axis.
///
@@ -285,6 +290,7 @@
class NDArrayView final : public std::enable_shared_from_this<NDArrayView>
{
friend class CompositeFunction;
friend class LearnerBase;
template <typename T, typename ...CtorArgTypes>
friend inline std::shared_ptr<T> MakeSharedObject(CtorArgTypes&& ...ctorArgs);
@@ -429,6 +435,16 @@
///
bool IsReadOnly() const { return m_isReadOnly; }
// TODO: The set methods should be offered in template from
///
/// Fill 'this' NDArrayView with the specified value. The underlying DataType of 'this' view should be DataType::Float.
///
CNTK_API void SetValue(float value);
///
/// Fill 'this' NDArrayView with the specified value. The underlying DataType of 'this' view should be DataType::Double.
///
CNTK_API void SetValue(double value);
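/// E.g. (illustrative): for an NDArrayViewPtr 'view' whose DataType is Float, view->SetValue(0.0f) fills every element with 0.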
///
/// Creates a new NDArrayView with newly allocated storage on the same device as 'this' view and copies 'this' view's contents into the newly allocated view.
///
@@ -467,8 +483,6 @@
private:
CNTK_API NDArrayView(CNTK::DataType dataType, const DeviceDescriptor& device, CNTK::StorageFormat storageType, const NDShape& viewShape, bool readOnly, void* tensorView);
CNTK_API void SetValue(float value);
CNTK_API void SetValue(double value);
template <typename ElementType>
static std::shared_ptr<Microsoft::MSR::CNTK::Matrix<ElementType>> GetMatrixImpl(const Microsoft::MSR::CNTK::TensorView<ElementType>* tensorView, size_t rowColSplitPoint);
@@ -526,6 +540,11 @@
///
CNTK_API void Clear();
///
/// Returns the number of masked/invalid values
///
CNTK_API size_t MaskedCount() const;
///
/// Returns the descriptor of the device that 'this' mask resides on
///
@@ -536,6 +555,11 @@
///
const NDShape& Shape() const { return m_maskShape; }
///
/// Returns a read-only pointer to the data buffer underlying 'this' Mask object
///
CNTK_API const char* DataBuffer() const;
///
/// Creates a new NDMask with newly allocated storage on the same device as 'this' mask and copies 'this' mask's contents into the newly allocated mask.
///
@@ -760,7 +784,21 @@
///
/// Create an 'Input' Variable.
///
Variable(const NDShape& shape, CNTK::DataType dataType, const std::wstring& name = L"")
Variable(const NDShape& shape, CNTK::DataType dataType)
: Variable(shape, dataType, L"")
{}
///
/// Create an 'Input' Variable.
///
Variable(const NDShape& shape, CNTK::DataType dataType, const wchar_t* name)
: Variable(shape, dataType, std::wstring(name))
{}
///
/// Create an 'Input' Variable.
///
Variable(const NDShape& shape, CNTK::DataType dataType, const std::wstring& name)
: Variable(shape, VariableKind::Input, dataType, nullptr, nullptr, false, { Axis::DefaultDynamicAxis() }, false, name)
{}
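/// Usage sketch (illustrative; assumes NDShape can be brace-initialized from its dimensions, as the default arguments elsewhere in this header suggest):
///   Variable a({ 784 }, DataType::Float);                            // unnamed
///   Variable b({ 784 }, DataType::Float, L"features");               // from a wide string literal
///   Variable c({ 784 }, DataType::Float, std::wstring(L"features")); // from a std::wstring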
@@ -919,6 +957,10 @@
return first.m_dataFields == second.m_dataFields;
}
inline bool operator!=(const Variable& first, const Variable& second)
{
return !(first == second);
}
///
/// Denotes Parameter inputs of a Function.
///
@@ -1146,7 +1188,7 @@
/// and the user is responsible for ensuring that the contents of the inputs and outputs are unchanged until after any uses of the BackPropState instance
/// for backpropagating gradients through this function.
///
CNTK_API virtual BackPropStatePtr Forward(const std::unordered_map<Variable, const ValuePtr>& arguments,
CNTK_API virtual BackPropStatePtr Forward(const std::unordered_map<Variable, ValuePtr>& arguments,
std::unordered_map<Variable, ValuePtr>& outputs,
const DeviceDescriptor& computeDevice = DeviceDescriptor::DefaultDevice(),
const std::unordered_set<Variable>& outputsToRetainBackwardStateFor = {}) = 0;
@@ -1161,7 +1203,7 @@
/// computation that this gradient backpropagation corresponds to.
///
CNTK_API virtual void Backward(const BackPropStatePtr& state,
const std::unordered_map<Variable, const ValuePtr>& rootGradientValues,
const std::unordered_map<Variable, ValuePtr>& rootGradientValues,
std::unordered_map<Variable, ValuePtr>& backPropagatedGradientValuesForInputs) = 0;
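/// Usage sketch (illustrative; 'func', 'inputVar', 'inputValue', 'lossVar', 'lossValue', 'inputGradientValue' and 'rootGradient' are placeholders defined elsewhere):
///   std::unordered_map<Variable, ValuePtr> outputs = { { lossVar, lossValue } };
///   BackPropStatePtr state = func->Forward({ { inputVar, inputValue } }, outputs, DeviceDescriptor::DefaultDevice(), { lossVar });
///   std::unordered_map<Variable, ValuePtr> inputGradients = { { inputVar, inputGradientValue } };
///   func->Backward(state, { { lossVar, rootGradient } }, inputGradients);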
public:
@@ -1330,10 +1372,74 @@
};
///
/// Create an instance of the CNTK built-in matrix multiplication operation with the specified input operands.
/// TODO: Specify the constraints on the shapes of the operands.
/// Create an instance of the CNTK built-in elementwise negate operation with the specified input operand.
///
CNTK_API FunctionPtr Times(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name = L"");
CNTK_API FunctionPtr Negate(const Variable& operand, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in elementwise sigmoid operation with the specified input operand.
///
CNTK_API FunctionPtr Sigmoid(const Variable& operand, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in elementwise tanh operation with the specified input operand.
///
CNTK_API FunctionPtr Tanh(const Variable& operand, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in elementwise linear rectifier operation with the specified input operand.
///
CNTK_API FunctionPtr ReLU(const Variable& operand, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in elementwise exp operation with the specified input operand.
///
CNTK_API FunctionPtr Exp(const Variable& operand, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in elementwise log operation with the specified input operand.
///
CNTK_API FunctionPtr Log(const Variable& operand, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in elementwise square operation with the specified input operand.
///
CNTK_API FunctionPtr Square(const Variable& operand, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in elementwise square-root operation with the specified input operand.
///
CNTK_API FunctionPtr Sqrt(const Variable& operand, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in elementwise round operation with the specified input operand.
///
CNTK_API FunctionPtr Round(const Variable& operand, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in elementwise floor operation with the specified input operand.
///
CNTK_API FunctionPtr Floor(const Variable& operand, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in elementwise ceil operation with the specified input operand.
///
CNTK_API FunctionPtr Ceil(const Variable& operand, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in elementwise abs operation with the specified input operand.
///
CNTK_API FunctionPtr Abs(const Variable& operand, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in elementwise reciprocal operation with the specified input operand.
///
CNTK_API FunctionPtr Reciprocal(const Variable& operand, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in softmax operation on specified tensor input operand
///
CNTK_API FunctionPtr Softmax(const Variable& operand, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in elementwise tensor addition operation with the specified input operands.
@@ -1341,30 +1447,71 @@
CNTK_API FunctionPtr Plus(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in elementwise sigmoid operation with the specified input operand.
/// Create an instance of the CNTK built-in elementwise tensor subtraction operation with the specified input operands.
///
CNTK_API FunctionPtr Sigmoid(const Variable& operand, const std::wstring& name = L"");
CNTK_API FunctionPtr Minus(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in elementwise tanh operation with the specified input operand.
/// Create an instance of the CNTK built-in elementwise multiplication operation on specified tensor input operands.
///
CNTK_API FunctionPtr Tanh(const Variable& operand, const std::wstring& name = L"");
CNTK_API FunctionPtr ElementTimes(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in elementwise division operation on specified tensor input operands.
///
CNTK_API FunctionPtr ElementDivide(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in elementwise equality comparison operation on specified tensor input operands.
///
CNTK_API FunctionPtr Equal(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in elementwise not-equal comparison operation on specified tensor input operands.
///
CNTK_API FunctionPtr NotEqual(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in elementwise less than comparison operation on specified tensor input operands.
///
CNTK_API FunctionPtr Less(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in elementwise less than or equal to comparison operation on specified tensor input operands.
///
CNTK_API FunctionPtr LessEqual(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in elementwise greater than comparison operation on specified tensor input operands.
///
CNTK_API FunctionPtr Greater(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in elementwise greater than or equal to comparison operation on specified tensor input operands.
///
CNTK_API FunctionPtr GreaterEqual(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in matrix multiplication operation with the specified input operands.
/// TODO: Specify the constraints on the shapes of the operands.
///
CNTK_API FunctionPtr Times(const Variable& leftOperand, const Variable& rightOperand, size_t numOutputAxes = 1, const std::wstring& name = L"");
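/// Usage sketch (illustrative): for a weight Variable W of shape (outDim x inDim) and an input Variable x of shape (inDim),
///   FunctionPtr z = Times(W, x);   // numOutputAxes defaults to 1, i.e. an ordinary matrix-vector product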
///
/// Create an instance of the CNTK built-in operation to compute squared-error for specified input operands.
///
CNTK_API FunctionPtr SquaredError(const Variable& prediction, const Variable& targets, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in operation to compute cross-entropy with softmax for specified input operands.
///
CNTK_API FunctionPtr CrossEntropyWithSoftmax(const Variable& output, const Variable& labels, const std::wstring& name = L"");
CNTK_API FunctionPtr CrossEntropyWithSoftmax(const Variable& prediction, const Variable& labels, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in operation for computing the classification prediction error for specified operands.
///
CNTK_API FunctionPtr ClassificationError(const Variable& prediction, const Variable& labels, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in elementwise exp operation with the specified input operand.
///
CNTK_API FunctionPtr Exp(const Variable& operand, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in operation for getting the past value along the lone dynamic axis of the specified operand.
/// Throws an exception if the operand has more than one dynamic axis.
@@ -1379,21 +1526,582 @@
///
CNTK_API FunctionPtr FutureValue(const Variable& initialState, const Variable& operand, size_t stepSize, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in elementwise multiplication operation on specified tensor input operands.
///
CNTK_API FunctionPtr ElementTimes(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in sum reduction operation on specified tensor input operand along all the axes
///
CNTK_API FunctionPtr ReduceSum(const Variable& operand, const std::wstring& name = L"");
///
/// Per dimension mean-variance normalization of the specified input operand.
///
CNTK_API FunctionPtr PerDimMeanVarianceNormalize(const Variable& operand, const NDArrayViewPtr& mean, const NDArrayViewPtr& invStdDev, const std::wstring& name = L"");
///
/// TODO:
///
CNTK_API FunctionPtr Convolution(const Variable& convolutionMap,
const Variable& operand,
const NDShape& strides = {1},
const std::vector<bool>& sharing = {true},
const std::vector<bool>& autoPadding = {true},
const NDShape& lowerPad = {0},
const NDShape& upperPad = {0},
bool transpose = false,
size_t maxTempMemSizeInSamples = 0,
const std::wstring& name = L"");
///
/// TODO:
///
enum class PoolingType
{
Max,
Average,
};
///
/// TODO:
///
CNTK_API FunctionPtr Pooling(const Variable& operand,
PoolingType poolingType,
const NDShape& poolingWindowShape,
const NDShape& strides = {1},
const std::vector<bool>& autoPadding = {false},
const NDShape& lowerPad = {0},
const NDShape& upperPad = {0},
const std::wstring& name = L"");
///
/// TODO:
///
CNTK_API FunctionPtr BatchNormalization(const Variable& operand,
const Variable& scale,
const Variable& bias,
const Variable& runningMean,
const Variable& runningInvStd,
bool spacial,
double normalizationTimeConstant = 0,
double blendTimeConstant = 0,
double epsilon = 0.00001,
bool useCuDNNEngine = false,
const std::wstring& name = L"");
///
/// Create a new Function instance which just combines the outputs of the specified list of 'operands' Functions such that the 'Outputs' of the
/// new 'Function' are union of the 'Outputs' of each of the specified 'operands' Functions.
/// E.g. When creating a classification model, typically the CrossEntropy loss Function and the ClassificationError Function comprise the two roots
/// of the computation graph which can be "Combine"d to create a single Function with 2 outputs; viz. CrossEntropy loss and ClassificationError output.
///
CNTK_API FunctionPtr Combine(const std::initializer_list<FunctionPtr>& operands, const std::wstring& name = L"");
CNTK_API FunctionPtr Combine(const std::vector<FunctionPtr>& operands, const std::wstring& name = L"");
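/// E.g. (illustrative; 'trainingLossFunction' and 'classificationErrorFunction' are FunctionPtrs built elsewhere):
///   FunctionPtr criterion = Combine({ trainingLossFunction, classificationErrorFunction });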
///
/// Load a legacy CNTK v1 format model
///
template <typename ElementType>
CNTK_API FunctionPtr LoadLegacyModel(const std::wstring& modelFile, const DeviceDescriptor& computeDevice = DeviceDescriptor::DefaultDevice());
///
/// Save a Composite Function instance to a file in CNTK legacy model format
///
template <typename ElementType>
CNTK_API void SaveAsLegacyModel(const FunctionPtr& rootFunction, const std::wstring& modelFile);
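/// E.g. (illustrative, with hypothetical file names):
///   FunctionPtr model = LoadLegacyModel<float>(L"model.dnn");
///   SaveAsLegacyModel<float>(model, L"model.v1.dnn");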
///
/// A serializable value represents one of:
/// a) Boolean
/// b) Signed long integer
/// c) Single and double precision floating point values
/// d) NDShape
/// e) vector<DictionaryValue>
///
/// TODO: We need to have native support for DictionaryValue<vector> and DictionaryValue<NDArrayView>.
class DictionaryValue final
{
public:
enum class Type : unsigned int
{
None,
Bool,
SizeT,
Float,
Double,
String,
NDShape,
Vector,
Dictionary,
};
static const char* TypeName(Type type)
{
switch (type)
{
case Type::None:
return "None";
case Type::Bool:
return "Bool";
case Type::SizeT:
return "SizeT";
case Type::Float:
return "Float";
case Type::Double:
return "Double";
case Type::String:
return "String";
case Type::NDShape:
return "NDShape";
case Type::Vector:
return "Vector";
case Type::Dictionary:
return "Dictionary";
default:
LogicError("Unknown DictionaryValue::Type");
}
}
public:
DictionaryValue() : m_valueType(Type::None)
{
}
DictionaryValue(bool value) : m_valueType(GetValueType<bool>())
{
m_data.m_boolean = value;
}
DictionaryValue(size_t value) : m_valueType(GetValueType<size_t>())
{
m_data.m_sizeT = value;
}
DictionaryValue(float value) : m_valueType(GetValueType<float>())
{
m_data.m_float = value;
}
DictionaryValue(double value) : m_valueType(GetValueType<double>())
{
m_data.m_double = value;
}
DictionaryValue(const wchar_t* value)
: DictionaryValue(std::wstring(value))
{}
template <typename T>
DictionaryValue(const T& value) : m_valueType(GetValueType<T>())
{
static_assert(std::is_same<T, NDShape>::value ||
std::is_same<T, std::wstring>::value ||
std::is_same<T, std::vector<DictionaryValue>>::value ||
std::is_same<T, Dictionary>::value,
"Unsupported ValueType");
AllocateDataPtr(value);
}
DictionaryValue(const DictionaryValue& other) : m_valueType(Type::Bool)
{
// The m_valueType must have been set to a non-ptr type to prevent an attempt to interpret
// the underlying uninitialized value as a ptr and free it.
*this = other;
}
DictionaryValue& operator=(const DictionaryValue& other)
{
if (this != &other)
{
FreeDataPtr();
m_valueType = other.m_valueType;
m_data = other.m_data;
if (other.m_valueType == Type::String)
AllocateDataPtr(other.GetValue<std::wstring>());
else if (other.m_valueType == Type::NDShape)
AllocateDataPtr(other.GetValue<NDShape>());
else if (other.m_valueType == Type::Vector)
AllocateDataPtr(other.GetValue<std::vector<DictionaryValue>>());
else if (other.m_valueType == Type::Dictionary)
AllocateDataPtr(other.GetValue<Dictionary>());
}
return *this;
}
~DictionaryValue()
{
FreeDataPtr();
}
template <typename T, typename std::enable_if<std::is_same<T, bool>::value>::type* = nullptr>
const T& GetValue() const
{
VerifyType<T>();
return m_data.m_boolean;
}
template <typename T, typename std::enable_if<std::is_same<T, size_t>::value>::type* = nullptr>
const T& GetValue() const
{
VerifyType<T>();
return m_data.m_sizeT;
}
template <typename T, typename std::enable_if<std::is_same<T, float>::value>::type* = nullptr>
const T& GetValue() const
{
VerifyType<T>();
return m_data.m_float;
}
template <typename T, typename std::enable_if<std::is_same<T, double>::value>::type* = nullptr>
const T& GetValue() const
{
VerifyType<T>();
return m_data.m_double;
}
template <typename T, typename std::enable_if<std::is_same<T, NDShape>::value ||
std::is_same<T, std::wstring>::value ||
std::is_same<T, std::vector<DictionaryValue>>::value ||
std::is_same<T, Dictionary>::value>::type* = nullptr>
const T& GetValue() const
{
VerifyType<T>();
return *(reinterpret_cast<T*>(m_data.m_ptr));
}
bool HasValue() const
{
return m_valueType != Type::None;
}
Type ValueType() const
{
return m_valueType;
}
friend CNTK_API Microsoft::MSR::CNTK::File& operator>>(Microsoft::MSR::CNTK::File& stream, DictionaryValue& us);
friend CNTK_API Microsoft::MSR::CNTK::File& operator<<(Microsoft::MSR::CNTK::File& stream, const DictionaryValue& us);
private:
template <typename T>
static Type GetValueType()
{
static_assert(std::is_same<T, bool>::value ||
std::is_same<T, size_t>::value ||
std::is_same<T, float>::value ||
std::is_same<T, double>::value ||
std::is_same<T, std::wstring>::value ||
std::is_same<T, NDShape>::value ||
std::is_same<T, std::vector<DictionaryValue>>::value ||
std::is_same<T, Dictionary>::value,
"Unsupported ValueType");
if (std::is_same<T, bool>::value) return Type::Bool;
if (std::is_same<T, size_t>::value) return Type::SizeT;
if (std::is_same<T, float>::value) return Type::Float;
if (std::is_same<T, double>::value) return Type::Double;
if (std::is_same<T, std::wstring>::value) return Type::String;
if (std::is_same<T, NDShape>::value) return Type::NDShape;
if (std::is_same<T, std::vector<DictionaryValue>>::value) return Type::Vector;
if (std::is_same<T, Dictionary>::value) return Type::Dictionary;
}
template <typename T>
void VerifyType() const
{
if (GetValueType<T>() != m_valueType)
RuntimeError("Reading a DictionaryValue as the wrong type; Reading as type %s when actual type is %s", typeid(T).name(), DictionaryValue::TypeName(m_valueType));
}
template <typename T>
CNTK_API void AllocateDataPtr(const T& value);
template <typename T>
CNTK_API void FreePtrAsType();
CNTK_API void FreeDataPtr()
{
if (m_valueType == Type::String)
FreePtrAsType<std::wstring>();
else if (m_valueType == Type::NDShape)
FreePtrAsType<NDShape>();
else if (m_valueType == Type::Vector)
FreePtrAsType<std::vector<DictionaryValue>>();
else if (m_valueType == Type::Dictionary)
FreePtrAsType<Dictionary>();
}
Type m_valueType;
union ValueData
{
bool m_boolean;
size_t m_sizeT;
float m_float;
double m_double;
void* m_ptr;
} m_data;
const size_t version = 1;
};
///
/// A type denoting a dictionary (keyed by Unicode strings) of serializable values (dynamically typed).
///
class Dictionary final
{
friend inline void AddConfigString(std::wstringstream& s, const DictionaryValue& value, size_t numIndentationSpaces);
friend class CompositeMinibatchSource;
public:
CNTK_API Dictionary();
CNTK_API ~Dictionary();
CNTK_API Dictionary(const Dictionary&);
CNTK_API Dictionary& operator=(const Dictionary&);
CNTK_API Dictionary(Dictionary&& other);
CNTK_API Dictionary& operator=(Dictionary&& other);
CNTK_API DictionaryValue& operator[](const wchar_t* key);
DictionaryValue& operator[](const std::wstring& key)
{
return operator[](key.c_str());
}
CNTK_API DictionaryValue operator[](const wchar_t* key) const;
DictionaryValue operator[](const std::wstring& key) const
{
return operator[](key.c_str());
}
CNTK_API bool Contains(const wchar_t* key) const;
bool Contains(const std::wstring& key) const
{
return Contains(key.c_str());
}
friend CNTK_API Microsoft::MSR::CNTK::File& operator>>(Microsoft::MSR::CNTK::File& stream, Dictionary& us);
friend CNTK_API Microsoft::MSR::CNTK::File& operator<<(Microsoft::MSR::CNTK::File& stream, const Dictionary& us);
private:
std::shared_ptr<std::unordered_map<std::wstring, DictionaryValue>> m_dictionaryData;
const size_t version = 1;
};
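// --- Illustrative sketch (not part of the original header): building a nested configuration with the
// --- Dictionary/DictionaryValue API above. Key names and values are hypothetical.
inline Dictionary MakeReaderConfigSketch()
{
    Dictionary config;
    config[L"epochSize"] = (size_t)50000;        // stored as Type::SizeT
    config[L"filePath"] = L"train.ctf";          // stored as Type::String
    config[L"randomize"] = true;                 // stored as Type::Bool
    std::vector<DictionaryValue> sampleShape = { DictionaryValue((size_t)28), DictionaryValue((size_t)28) };
    config[L"sampleShape"] = sampleShape;        // stored as Type::Vector
    if (config.Contains(L"epochSize"))
    {
        size_t epochSize = config[L"epochSize"].GetValue<size_t>(); // typed read-back
        (void)epochSize;
    }
    return config;
}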
///
/// Abstraction for learning a subset of parameters of a learnable function using first order gradient values
/// E.g., momentum, AdaGrad, RMSProp, etc. are different types of learners, each with its own algorithm for
/// learning parameter values using first order gradients.
///
class Learner : public std::enable_shared_from_this<Learner>
{
public:
//
// Method to update the parameters associated with this learner. By returning false, this method indicates that
// learning has stopped for all of the parameters associated with this learner
//
CNTK_API virtual bool Update(const std::unordered_map<Parameter, NDArrayViewPtr>& gradientValues, size_t trainingSampleCount) = 0;
///
/// Returns the set of parameters associated with this learner.
///
const std::unordered_set<Parameter>& Parameters() const { return m_parameters; }
///
/// Optionally overridable method to checkpoint the learner's state.
///
// TODO: move the following two methods into ISerializable interface, make
// Learner (and all other entities that need checkpointing capability) implement it.
CNTK_API virtual Dictionary GetCheckpointState() const { return Dictionary(); }
///
/// Optionally overridable method to restore the learner's state from a previous checkpoint.
///
CNTK_API virtual void RestoreFromCheckpoint(const Dictionary& /*checkpoint*/) {}
virtual ~Learner() {}
protected:
Learner(const std::unordered_set<Parameter>& parameters)
: m_parameters(parameters)
{}
std::unordered_set<Parameter> m_parameters;
};
///
/// Create an instance of the CNTK built-in SGD learner.
///
/// TODO: add additional SGD parameters here (a collection of learning rate values)
CNTK_API LearnerPtr SGDLearner(const std::unordered_set<Parameter>& parameters, double learningRatePerSample);
///
/// Create an instance of the CNTK built-in Momentum SGD learner.
///
/// TODO: add additional Momentum parameters here (a collection of momentum rate values)
CNTK_API LearnerPtr MomentumSGDLearner(const std::unordered_set<Parameter>& parameters);
///
/// Create an instance of the CNTK built-in Nesterov's accelerated SGD learner.
///
CNTK_API LearnerPtr NesterovLearner(const std::unordered_set<Parameter>& parameters);
///
/// Create an instance of the CNTK built-in AdaGrad learner.
///
CNTK_API LearnerPtr AdaGradLearner(const std::unordered_set<Parameter>& parameters, bool needAveMultiplier = true);
///
/// Create an instance of the CNTK built-in FSAdaGrad (improved AdaGrad) learner.
///
CNTK_API LearnerPtr FSAdaGradLearner(const std::unordered_set<Parameter>& parameters);
///
/// Create an instance of the CNTK built-in RMSProp learner.
///
CNTK_API LearnerPtr RMSPropLearner(const std::unordered_set<Parameter>& parameters,
double gamma,
double inc,
double dec,
double max,
double min,
bool needAveMultiplier = true);
///
/// Trainer is the top-level abstraction responsible for the orchestration of the training of a model
/// using the specified learners and training data, either explicitly supplied as Value objects or obtained from
/// a MinibatchSource object.
///
class Trainer
{
public:
///
/// Construct a Trainer to train the specified 'model' with the specified 'trainingLoss' Variable as the training criterion
/// and using the specified set of 'parameterLearners' for updating the model's parameters using computed gradients.
///
CNTK_API Trainer(const FunctionPtr& model, const Variable& trainingLoss, const std::unordered_set<LearnerPtr>& parameterLearners);
///
/// Optimize model parameters using the specified 'arguments' minibatch of training samples.
/// Returns false if all parameter learners indicate end of learning (through their Update method's return value).
///
CNTK_API bool TrainMinibatch(const std::unordered_map<Variable, ValuePtr>& arguments, const DeviceDescriptor& computeDevice = DeviceDescriptor::DefaultDevice());
///
/// Model being trained by 'this' Trainer.
///
FunctionPtr Model() const { return m_model; }
///
/// Variable of the Trainer's model representing the training loss that is used as the optimization
/// criterion for learning the model's parameters.
///
Variable TrainingLossVariable() const { return m_trainingLossVar; }
///
/// Returns the Value of the training loss variable of the model corresponding to the last minibatch trained with
///
ValuePtr PreviousMinibatchTrainingLossValue() const { return m_prevMinibatchTrainingLossValue; }
///
/// Learners associated with this Trainer for updating the model's parameters using computed gradients.
///
const std::unordered_set<LearnerPtr>& ParameterLearners() const { return m_parameterLearners; }
private:
FunctionPtr m_model;
Variable m_trainingLossVar;
ValuePtr m_prevMinibatchTrainingLossValue;
std::unordered_set<LearnerPtr> m_parameterLearners;
};
///
/// Describes an input stream: its name, element type, storage, etc.
///
struct StreamInfo
{
std::wstring m_name; // Unique name of the stream
size_t m_id; // Unique identifier of the stream
StorageFormat m_storageFormat; // Storage format of the stream
DataType m_elementType; // Element type of the stream
NDShape m_sampleLayout; // Layout of the sample for the stream
};
inline bool operator==(const StreamInfo& left, const StreamInfo& right)
{
return ((left.m_id == right.m_id) &&
(left.m_name == right.m_name) &&
(left.m_storageFormat == right.m_storageFormat) &&
(left.m_elementType == right.m_elementType) &&
(left.m_sampleLayout == right.m_sampleLayout));
}
}
namespace std {
template <> struct hash<CNTK::StreamInfo>
{
size_t operator()(const CNTK::StreamInfo& x) const
{
return std::hash<size_t>()(x.m_id);
}
};
}
namespace CNTK
{
struct MinibatchData
{
size_t m_numSequences;
size_t m_numSamples;
ValuePtr m_data;
};
///
/// Abstraction for generating minibatches of samples for training/evaluation.
///
class MinibatchSource : public std::enable_shared_from_this<MinibatchSource>
{
public:
///
/// Describes the streams 'this' MinibatchSource produces.
///
virtual const std::unordered_set<StreamInfo>& StreamInfos() = 0;
///
/// Reads a minibatch that contains data across all input streams.
/// The perStreamMBSizeLimits argument specifies the desired minibatch size for each stream of the reader, either in terms of #sequences or
/// #samples or both. If the size is specified in terms of both #sequences and #samples, the smaller of the two is taken. The actual
/// returned size of the minibatch is the min across all streams. Also, the requested MB size fields in the maps are updated by the
/// MinibatchSource to contain the actual #sequences and #samples in the returned minibatch for the corresponding stream.
/// The return value indicates whether the MinibatchSource will return any further data in subsequent calls of this function.
///
virtual std::unordered_map<StreamInfo, MinibatchData> GetNextMinibatch(const std::unordered_map<StreamInfo, std::pair<size_t, size_t>>& perStreamMBSizeLimits,
const DeviceDescriptor& device = DeviceDescriptor::DefaultDevice()) = 0;
// TODO: Methods to save and restore from checkpoints
// Disallow copy and move construction and assignment
MinibatchSource(const MinibatchSource&) = delete; MinibatchSource(MinibatchSource&&) = delete; MinibatchSource& operator=(const MinibatchSource&) = delete; MinibatchSource& operator=(MinibatchSource&&) = delete;
protected:
MinibatchSource() {}
};
///
/// Instantiate the CNTK built-in composite minibatch source.
///
CNTK_API MinibatchSourcePtr CreateCompositeMinibatchSource(const Dictionary& configuration);
///
/// Compute the per dimension means and variances for each of the specified streams using data from the specified minibatchSource.
///
CNTK_API void ComputeInputPerDimMeansAndInvStdDevs(const MinibatchSourcePtr& minibatchSource,
std::unordered_map<StreamInfo, std::pair<NDArrayViewPtr, NDArrayViewPtr>>& computedMeanAndVariances,
const DeviceDescriptor& device = DeviceDescriptor::CPUDevice());
}
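A minimal consumption sketch for the minibatch-source API above (not part of the original header); the helper name, the per-stream limit semantics, and the end-of-data convention are assumptions of this sketch.
// Hypothetical sketch: read one minibatch from an already created MinibatchSource.
// Assumes CNTKLibrary.h and <unordered_map> are available; the pair passed per stream is
// interpreted as (#sequences limit, #samples limit), following the GetNextMinibatch comment above.
inline bool ReadOneMinibatchSketch(const CNTK::MinibatchSourcePtr& source,
                                   size_t maxSamplesPerStream,
                                   const CNTK::DeviceDescriptor& device)
{
    // Request at most 'maxSamplesPerStream' sequences/samples for every stream the source exposes.
    std::unordered_map<CNTK::StreamInfo, std::pair<size_t, size_t>> perStreamLimits;
    for (const auto& streamInfo : source->StreamInfos())
        perStreamLimits.insert({ streamInfo, { maxSamplesPerStream, maxSamplesPerStream } });
    auto minibatch = source->GetNextMinibatch(perStreamLimits, device);
    // Each entry carries the actual #sequences and #samples returned for that stream plus the data itself.
    for (const auto& streamDataPair : minibatch)
    {
        const CNTK::MinibatchData& data = streamDataPair.second;
        (void)data.m_numSequences;
        (void)data.m_numSamples;
        (void)data.m_data;
    }
    // Treating an empty result as "no more data" is an assumption of this sketch.
    return !minibatch.empty();
}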

View file

@ -47,6 +47,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template <typename ElementType>
class ComputationNode;
class File;
}}}
// TODO: The following should be reconciled with the equivalent code in the CNTK implementation
@ -100,9 +102,15 @@ namespace CNTK
// RuntimeError - throw a std::runtime_error with a formatted error string
#ifndef _MSC_VER // gcc __attribute__((format(printf())) does not percolate through variadic templates; so must go the macro route
#ifndef RuntimeError
#define RuntimeError ThrowFormatted<std::runtime_error>
#endif
#ifndef LogicError
#define LogicError ThrowFormatted<std::logic_error>
#endif
#ifndef InvalidArgument
#define InvalidArgument ThrowFormatted<std::invalid_argument>
#endif
#else
template <class... _Types>
__declspec_noreturn inline void RuntimeError(const char* format, _Types&&... _Args)
@ -158,4 +166,12 @@ namespace CNTK
class Function;
typedef std::shared_ptr<Function> FunctionPtr;
class Learner;
typedef std::shared_ptr<Learner> LearnerPtr;
class Dictionary;
class MinibatchSource;
typedef std::shared_ptr<MinibatchSource> MinibatchSourcePtr;
}

View file

@ -0,0 +1,274 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#include "stdafx.h"
#include "CNTKLibrary.h"
#include "Function.h"
#include "ComputationNetworkBuilder.h"
#include "Utils.h"
#include "ComputationNode.h"
#include "InputAndParamNodes.h"
#include "NonlinearityNodes.h"
#include "LinearAlgebraNodes.h"
#include "RecurrentNodes.h"
#include "EvaluationNodes.h"
#include "TrainingNodes.h"
using namespace Microsoft::MSR::CNTK;
namespace CNTK
{
template <typename ElementType>
Variable GetVariable(const ComputationNodeBasePtr& node,
std::unordered_map<ComputationNodeBasePtr, Variable>& nodeToVariableMap,
std::unordered_map<Placeholder, Variable>& placeholderReplacements,
std::unordered_set<FunctionPtr>& allPrimitiveFunctions)
{
auto iter = nodeToVariableMap.find(node);
if (iter != nodeToVariableMap.end())
return iter->second;
Variable var;
NDShape varShape = AsNDShape(node->GetSampleLayout());
// The CNTK sample layouts may have trailing axes with dimension size of 1 which are automatically
// added when converting from NDShape to CNTK internal TensorShapes and are not present in the original
// shapes specified by the user. These should be truncated.
if (varShape.NumAxes() <= 2)
{
size_t numTrailingDimsToRemove = 0;
for (int i = varShape.NumAxes() - 1; i >= 0; --i)
{
if (varShape[i] == 1)
numTrailingDimsToRemove++;
else
break;
}
varShape = varShape.SubShape(0, varShape.NumAxes() - numTrailingDimsToRemove);
}
if (node->IsLeaf())
{
if (node->Is<InputValueBase<ElementType>>())
{
auto inputNode = node->As<InputValueBase<ElementType>>();
bool isSparse = node->Is<SparseInputValue<ElementType>>();
if (node->HasMBLayout())
{
// TODO: Currently only default dynamic axis is supported
const std::wstring defaultCNTKDynamicAxisName = L"";
if (inputNode->GetRequestedDynamicAxis() != defaultCNTKDynamicAxisName)
LogicError("Found dynamic axis named '%S' while currently only default dynamic axis named '%S' is supported!", node->GetMBLayout()->GetAxisName(), defaultCNTKDynamicAxisName.c_str());
var = Variable(varShape, isSparse, AsDataType<ElementType>(), node->GetLearningRateMultiplier() != 0, node->GetName());
}
else
{
// TODO: Allow creating inputs without a dynamic axis
LogicError("Found InputNode with no dynamic axis which is currently unsupported");
}
}
else if (node->Is<LearnableParameter<ElementType>>())
{
bool isConstant = (node->GetLearningRateMultiplier() == 0);
auto& matrix = node->As<ComputationNode<ElementType>>()->Value();
auto tensorView = new TensorView<ElementType>(std::make_shared<Matrix<ElementType>>(matrix.AsReference()), node->GetSampleLayout());
NDArrayViewPtr parameterValue = MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), AsDeviceDescriptor(matrix.GetDeviceId()), AsStorageFormat(matrix.GetFormat()), varShape, false, tensorView);
if (isConstant)
var = Constant(parameterValue, node->GetName());
else
var = Parameter(parameterValue, node->GetName());
}
else
LogicError("CNTK::LoadLegacyModel: Unsupported legacy CNTK node named '%S'", node->NodeName().c_str());
}
else
{
// This is a non-leaf node and maps to a primitive Function
auto placeholderVar = Placeholder(varShape);
nodeToVariableMap[node] = placeholderVar;
std::vector<Variable> inputVars(node->GetNumInputs());
for (size_t i = 0; i < inputVars.size(); ++i)
{
inputVars[i] = GetVariable<ElementType>(node->Input(i), nodeToVariableMap, placeholderReplacements, allPrimitiveFunctions);
if (inputVars[i].IsPlaceholder())
placeholderReplacements[Placeholder(inputVars[i])] = Variable();
}
PrimitiveOpType opType;
Dictionary primitiveFunctionConfigParameters;
if (node->OperationName() == OperationNameOf(NegateNode))
opType = PrimitiveOpType::Negate;
else if (node->OperationName() == OperationNameOf(SigmoidNode))
opType = PrimitiveOpType::Sigmoid;
else if (node->OperationName() == OperationNameOf(TanhNode))
opType = PrimitiveOpType::Tanh;
else if (node->OperationName() == OperationNameOf(RectifiedLinearNode))
opType = PrimitiveOpType::ReLU;
else if (node->OperationName() == OperationNameOf(ExpNode))
opType = PrimitiveOpType::Exp;
else if (node->OperationName() == OperationNameOf(LogNode))
opType = PrimitiveOpType::Log;
else if (node->OperationName() == OperationNameOf(SqrtNode))
opType = PrimitiveOpType::Sqrt;
else if (node->OperationName() == OperationNameOf(FloorNode))
opType = PrimitiveOpType::Floor;
else if (node->OperationName() == OperationNameOf(AbsNode))
opType = PrimitiveOpType::Abs;
else if (node->OperationName() == OperationNameOf(ReciprocalNode))
opType = PrimitiveOpType::Reciprocal;
else if (node->OperationName() == OperationNameOf(SoftmaxNode))
opType = PrimitiveOpType::Softmax;
else if (node->OperationName() == OperationNameOf(PlusNode))
opType = PrimitiveOpType::Plus;
else if (node->OperationName() == OperationNameOf(MinusNode))
opType = PrimitiveOpType::Minus;
else if (node->OperationName() == OperationNameOf(ElementTimesNode))
opType = PrimitiveOpType::ElementTimes;
else if (node->OperationName() == OperationNameOf(EqualNode))
opType = PrimitiveOpType::Equal;
else if (node->OperationName() == OperationNameOf(NotEqualNode))
opType = PrimitiveOpType::NotEqual;
else if (node->OperationName() == OperationNameOf(LessNode))
opType = PrimitiveOpType::Less;
else if (node->OperationName() == OperationNameOf(LessEqualNode))
opType = PrimitiveOpType::LessEqual;
else if (node->OperationName() == OperationNameOf(GreaterNode))
opType = PrimitiveOpType::Greater;
else if (node->OperationName() == OperationNameOf(GreaterEqualNode))
opType = PrimitiveOpType::GreaterEqual;
else if (node->OperationName() == OperationNameOf(TimesNode))
{
primitiveFunctionConfigParameters[L"numOutputAxes"] = DictionaryValue((size_t)node->As<TimesNode<ElementType>>()->OutputRank());
opType = PrimitiveOpType::Times;
}
else if (node->OperationName() == OperationNameOf(PastValueNode))
{
if (inputVars.size() == 1)
{
auto initialStateVar = Constant({}, node->As<PastValueNode<ElementType>>()->InitialActivationValue(), AsDeviceDescriptor(node->GetDeviceId()));
inputVars.insert(inputVars.begin(), initialStateVar);
}
primitiveFunctionConfigParameters[L"stepSize"] = DictionaryValue((size_t)node->As<PastValueNode<ElementType>>()->TimeStep());
opType = PrimitiveOpType::PastValue;
}
else if (node->OperationName() == OperationNameOf(FutureValueNode))
{
if (inputVars.size() == 1)
{
auto initialStateVar = Constant({}, node->As<FutureValueNode<ElementType>>()->InitialActivationValue(), AsDeviceDescriptor(node->GetDeviceId()));
inputVars.insert(inputVars.begin(), initialStateVar);
}
primitiveFunctionConfigParameters[L"stepSize"] = DictionaryValue((size_t)node->As<FutureValueNode<ElementType>>()->TimeStep());
opType = PrimitiveOpType::FutureValue;
}
else if (node->OperationName() == OperationNameOf(SquareErrorNode))
opType = PrimitiveOpType::SquaredError;
else if (node->OperationName() == OperationNameOf(CrossEntropyWithSoftmaxNode))
{
std::swap(inputVars[0], inputVars[1]);
opType = PrimitiveOpType::CrossEntropyWithSoftmax;
}
else if (node->OperationName() == OperationNameOf(ErrorPredictionNode))
{
std::swap(inputVars[0], inputVars[1]);
opType = PrimitiveOpType::ClassificationError;
}
else if (node->OperationName() == OperationNameOf(SumElementsNode))
opType = PrimitiveOpType::ReduceSum;
else if (node->OperationName() == OperationNameOf(ConvolutionNode))
{
auto convolutionNode = node->As<ConvolutionNode<ElementType>>();
primitiveFunctionConfigParameters[L"strides"] = AsNDShape(convolutionNode->Strides());
primitiveFunctionConfigParameters[L"sharing"] = AsDictionaryValueVector(convolutionNode->Sharing());
primitiveFunctionConfigParameters[L"autoPadding"] = AsDictionaryValueVector(convolutionNode->AutoPad());
primitiveFunctionConfigParameters[L"lowerPad"] = AsNDShape(convolutionNode->LowerPad());
primitiveFunctionConfigParameters[L"upperPad"] = AsNDShape(convolutionNode->UpperPad());
primitiveFunctionConfigParameters[L"transpose"] = convolutionNode->Transpose();
primitiveFunctionConfigParameters[L"maxTempMemSizeInSamples"] = convolutionNode->MaxTempMemSizeInSamples();
opType = PrimitiveOpType::Convolution;
}
else if (node->OperationName() == OperationNameOf(PoolingNode))
{
auto poolingNode = node->As<PoolingNode<ElementType>>();
primitiveFunctionConfigParameters[L"poolingType"] = (size_t)(AsPoolingType(poolingNode->PoolingKind()));
primitiveFunctionConfigParameters[L"poolingWindowShape"] = AsNDShape(poolingNode->KernelShape());
primitiveFunctionConfigParameters[L"strides"] = AsNDShape(poolingNode->Strides());
primitiveFunctionConfigParameters[L"autoPadding"] = AsDictionaryValueVector(poolingNode->AutoPad());
primitiveFunctionConfigParameters[L"lowerPad"] = AsNDShape(poolingNode->LowerPad());
primitiveFunctionConfigParameters[L"upperPad"] = AsNDShape(poolingNode->UpperPad());
opType = PrimitiveOpType::Pooling;
}
else if (node->OperationName() == OperationNameOf(BatchNormalizationNode))
{
auto batchNormalizationNode = node->As<BatchNormalizationNode<ElementType>>();
primitiveFunctionConfigParameters[L"spacial"] = batchNormalizationNode->Spatial();
primitiveFunctionConfigParameters[L"normalizationTimeConstant"] = batchNormalizationNode->NormalizationTimeConstant();
primitiveFunctionConfigParameters[L"blendTimeConstant"] = batchNormalizationNode->BlendTimeConstant();
primitiveFunctionConfigParameters[L"epsilon"] = batchNormalizationNode->Epsilon();
primitiveFunctionConfigParameters[L"useCuDNNEngine"] = !batchNormalizationNode->UseCNTKEngine();
opType = PrimitiveOpType::BatchNormalization;
}
else
LogicError("Unsupported ComputationNode with OperationName='%S' found when loading legacy CNTK model", node->OperationName().c_str());
FunctionPtr primitiveFunction = MakeSharedObject<PrimitiveFunction>(opType, inputVars, std::move(primitiveFunctionConfigParameters), node->GetName());
allPrimitiveFunctions.insert(primitiveFunction);
var = primitiveFunction->Output();
if (placeholderReplacements.find(placeholderVar) != placeholderReplacements.end())
placeholderReplacements[placeholderVar] = var;
}
nodeToVariableMap[node] = var;
return var;
}
template <typename ElementType>
FunctionPtr LoadLegacyModel(const std::wstring& modelFile, const DeviceDescriptor& computeDevice /*= DeviceDescriptor::DefaultDevice()*/)
{
ComputationNetworkPtr net = make_shared<ComputationNetwork>(AsCNTKImplDeviceId(computeDevice));
net->Load<ElementType>(modelFile);
// Now traverse the model and construct the Function graph
std::unordered_map<ComputationNodeBasePtr, Variable> nodeToVariableMap;
std::unordered_map<Placeholder, Variable> placeholderReplacements;
std::unordered_set<FunctionPtr> allPrimitiveFunctions;
std::vector<FunctionPtr> rootFunctions;
auto& networkRoots = net->RootNodes();
for (auto& rootNode : networkRoots)
{
if (rootNode->IsLeaf())
continue;
rootFunctions.push_back(GetVariable<ElementType>(rootNode, nodeToVariableMap, placeholderReplacements, allPrimitiveFunctions).Owner());
}
auto rootComposite = Combine(rootFunctions);
rootComposite->ReplacePlaceholders(placeholderReplacements);
return rootComposite;
}
template <typename ElementType>
void SaveAsLegacyModel(const FunctionPtr& rootFunction, const std::wstring& modelFile)
{
CompositeFunction* compositeFunction = dynamic_cast<CompositeFunction*>(rootFunction.get());
if (compositeFunction == nullptr)
InvalidArgument("Primitive (aka non-composite) Function instances cannot be saved");
auto computationNetwork = compositeFunction->GetComputationNetwork<ElementType>(DeviceDescriptor::CPUDevice(), {});
computationNetwork->Save(modelFile);
}
// Template instantiations
template CNTK_API FunctionPtr LoadLegacyModel<float>(const std::wstring& modelFile, const DeviceDescriptor& computeDevice /*= DeviceDescriptor::DefaultDevice()*/);
template CNTK_API FunctionPtr LoadLegacyModel<double>(const std::wstring& modelFile, const DeviceDescriptor& computeDevice /*= DeviceDescriptor::DefaultDevice()*/);
template CNTK_API void SaveAsLegacyModel<float>(const FunctionPtr& rootFunction, const std::wstring& modelFile);
template CNTK_API void SaveAsLegacyModel<double>(const FunctionPtr& rootFunction, const std::wstring& modelFile);
}
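A round-trip sketch for the conversion functions defined above (not part of the original source); the file paths are placeholders and error handling is omitted.
// Hypothetical sketch: load a legacy (V1) model into a V2 Function graph and write it back out.
void LegacyModelRoundTripSketch(const std::wstring& inputModelPath, const std::wstring& outputModelPath)
{
    // Rebuilds a composite Function from the legacy ComputationNetwork stored on disk.
    CNTK::FunctionPtr model = CNTK::LoadLegacyModel<float>(inputModelPath, CNTK::DeviceDescriptor::CPUDevice());
    // Serializes the Function graph back through a ComputationNetwork in the legacy format.
    CNTK::SaveAsLegacyModel<float>(model, outputModelPath);
}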

View file

@ -56,7 +56,7 @@
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<AdditionalIncludeDirectories>.\API;$(SolutionDir)Source\SGDLib;$(SolutionDir)Source\ComputationNetworkLib;$(SolutionDir)Source\SequenceTrainingLib;$(SolutionDir)Source\Math;$(SolutionDir)Source\Common\Include;$(SolutionDir)Source\CNTK\BrainScript;$(SolutionDir)Source\ActionsLib;$(MSMPI_INC);$(NvmlInclude)</AdditionalIncludeDirectories>
<AdditionalIncludeDirectories>.\API;$(SolutionDir)Source\SGDLib;$(SolutionDir)Source\Readers\ReaderLib;$(SolutionDir)Source\ComputationNetworkLib;$(SolutionDir)Source\SequenceTrainingLib;$(SolutionDir)Source\Math;$(SolutionDir)Source\Common\Include;$(SolutionDir)Source\CNTK\BrainScript;$(SolutionDir)Source\ActionsLib;$(MSMPI_INC);$(NvmlInclude)</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<AdditionalLibraryDirectories>$(SolutionDir)Source\ComputationNetworkLib;$(SolutionDir)Source\Math;$(MSMPI_LIB64);$(SolutionDir)$(Platform)\$(Configuration);$(NvmlLibPath)</AdditionalLibraryDirectories>
@ -75,7 +75,7 @@
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>ComputationNetworkLib.lib; Math.lib; Common.lib; SequenceTrainingLib.lib; kernel32.lib; user32.lib; shell32.lib; %(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>ComputationNetworkLib.lib; Math.lib; Common.lib; SequenceTrainingLib.lib; ReaderLib.lib; kernel32.lib; user32.lib; shell32.lib; %(AdditionalDependencies)</AdditionalDependencies>
<DelayLoadDLLs>Math.dll; nvml.dll; $(CudaRuntimeDll)</DelayLoadDLLs>
</Link>
</ItemDefinitionGroup>
@ -99,7 +99,7 @@
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<AdditionalDependencies>ComputationNetworkLib.lib; Math.lib; Common.lib; kernel32.lib; user32.lib; shell32.lib; SequenceTrainingLib.lib; %(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>ComputationNetworkLib.lib; Math.lib; Common.lib; ReaderLib.lib; kernel32.lib; user32.lib; shell32.lib; SequenceTrainingLib.lib; %(AdditionalDependencies)</AdditionalDependencies>
<Profile>true</Profile>
<DelayLoadDLLs>Math.dll; nvml.dll; $(CudaRuntimeDll)</DelayLoadDLLs>
</Link>
@ -128,11 +128,14 @@
<ClInclude Include="API\CNTKLibrary.h" />
<ClInclude Include="API\CNTKLibraryInternals.h" />
<ClInclude Include="Function.h" />
<ClInclude Include="Learner.h" />
<ClInclude Include="MinibatchSource.h" />
<ClInclude Include="Utils.h" />
<ClInclude Include="stdafx.h" />
<ClInclude Include="targetver.h" />
</ItemGroup>
<ItemGroup>
<ClCompile Include="BackCompat.cpp" />
<ClCompile Include="Common.cpp" />
<ClCompile Include="dllmain.cpp">
<CompileAsManaged>false</CompileAsManaged>
@ -140,11 +143,14 @@
</PrecompiledHeader>
</ClCompile>
<ClCompile Include="Function.cpp" />
<ClCompile Include="Learner.cpp" />
<ClCompile Include="MinibatchSource.cpp" />
<ClCompile Include="NDArrayView.cpp" />
<ClCompile Include="NDMask.cpp" />
<ClCompile Include="stdafx.cpp">
<PrecompiledHeader>Create</PrecompiledHeader>
</ClCompile>
<ClCompile Include="Trainer.cpp" />
<ClCompile Include="Utils.cpp" />
<ClCompile Include="Value.cpp" />
<ClCompile Include="Variable.cpp" />

View file

@ -10,6 +10,10 @@
<ClCompile Include="Variable.cpp" />
<ClCompile Include="Utils.cpp" />
<ClCompile Include="NDMask.cpp" />
<ClCompile Include="Learner.cpp" />
<ClCompile Include="BackCompat.cpp" />
<ClCompile Include="Trainer.cpp" />
<ClCompile Include="MinibatchSource.cpp" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="stdafx.h" />
@ -22,6 +26,8 @@
<Filter>API</Filter>
</ClInclude>
<ClInclude Include="Function.h" />
<ClInclude Include="Learner.h" />
<ClInclude Include="MinibatchSource.h" />
</ItemGroup>
<ItemGroup>
<Filter Include="API">

View file

@ -117,6 +117,7 @@ namespace CNTK
if (variable.IsParameter() || variable.IsConstant())
{
computationNodePtr = builder.CreateLearnableParameter(variable.Name(), AsTensorShape(variable.Shape()));
network->InitLearnableParameters(computationNodePtr, L"fixedValue", 0); // must call this to follow protocol; can overwrite later
if (!variable.NeedsGradient())
computationNodePtr->SetLearningRateMultiplier(0.0);
@ -126,7 +127,13 @@ namespace CNTK
}
else if (variable.IsInput())
{
// TODO: Specify dynamic axis
// TODO: Support inputs with > 1 dynamic axes
if (variable.DynamicAxes().size() != 1)
LogicError("Currently only Input variables with one dynamic axis are supported");
auto dynamicAxis = variable.DynamicAxes()[0];
if (dynamicAxis != Axis::DefaultDynamicAxis())
LogicError("Currently only Input variables with DefaultDynamicAxis are supported");
if (IsSparseInput(variable))
computationNodePtr = builder.CreateSparseInputNode(variable.Name(), AsTensorShape(variable.Shape()));
else
@ -164,6 +171,7 @@ namespace CNTK
if (dynamic_cast<PrimitiveFunction*>(function))
{
PrimitiveFunction* primitiveFunction = dynamic_cast<PrimitiveFunction*>(function);
auto functionConfig = primitiveFunction->FunctionConfig();
// Create the nodes corresponding to the inputs
auto functionInputs = primitiveFunction->Inputs();
@ -180,12 +188,8 @@ namespace CNTK
PrimitiveOpType op = primitiveFunction->OpType();
switch (op)
{
case PrimitiveOpType::Plus:
computationNodePtr = builder.Plus(input0Node, input1Node, function->Name());
break;
case PrimitiveOpType::Times:
// TODO: The output rank of the times operation is currently hardcoded to 1
computationNodePtr = builder.Times(input0Node, input1Node, 1, function->Name());
case PrimitiveOpType::Negate:
computationNodePtr = builder.Negate(input0Node, function->Name());
break;
case PrimitiveOpType::Sigmoid:
computationNodePtr = builder.Sigmoid(input0Node, function->Name());
@ -193,15 +197,100 @@ namespace CNTK
case PrimitiveOpType::Tanh:
computationNodePtr = builder.Tanh(input0Node, function->Name());
break;
case PrimitiveOpType::ReLU:
computationNodePtr = builder.RectifiedLinear(input0Node, function->Name());
break;
case PrimitiveOpType::Exp:
computationNodePtr = builder.Exp(input0Node, function->Name());
break;
case PrimitiveOpType::Log:
computationNodePtr = builder.Log(input0Node, function->Name());
break;
case PrimitiveOpType::Sqrt:
computationNodePtr = builder.Sqrt(input0Node, function->Name());
break;
case PrimitiveOpType::Floor:
computationNodePtr = builder.Floor(input0Node, function->Name());
break;
case PrimitiveOpType::Abs:
computationNodePtr = builder.Abs(input0Node, function->Name());
break;
case PrimitiveOpType::Reciprocal:
computationNodePtr = builder.Reciprocal(input0Node, function->Name());
break;
case PrimitiveOpType::Softmax:
if (functionInputs[0].Shape().NumAxes() > 1)
InvalidArgument("Softmax operation can only be applied to a 1D input");
computationNodePtr = builder.Softmax(input0Node, function->Name());
break;
case PrimitiveOpType::Pooling:
{
PoolingType poolingType = (PoolingType)(functionConfig[L"poolingType"].GetValue<size_t>());
auto poolingWindowsShape = functionConfig[L"poolingWindowShape"].GetValue<NDShape>();
auto strides = functionConfig[L"strides"].GetValue<NDShape>();
auto lowerPad = functionConfig[L"lowerPad"].GetValue<NDShape>();
auto upperPad = functionConfig[L"upperPad"].GetValue<NDShape>();
auto autoPadding = AsBasicElementTypeVector<bool>(functionConfig[L"autoPadding"].GetValue<std::vector<DictionaryValue>>());
computationNodePtr = builder.Pooling(input0Node, AsCNTKPoolKind(poolingType), AsTensorShape(poolingWindowsShape, true), AsTensorShape(strides, true), autoPadding, AsTensorShape(lowerPad, true), AsTensorShape(upperPad, true), ImageLayoutKind::CHW, function->Name());
break;
}
case PrimitiveOpType::Plus:
computationNodePtr = builder.Plus(input0Node, input1Node, function->Name());
break;
case PrimitiveOpType::Minus:
computationNodePtr = builder.Minus(input0Node, input1Node, function->Name());
break;
case PrimitiveOpType::ElementTimes:
computationNodePtr = builder.ElementTimes(input0Node, input1Node, function->Name());
break;
case PrimitiveOpType::Equal:
computationNodePtr = builder.Equal(input0Node, input1Node, function->Name());
break;
case PrimitiveOpType::NotEqual:
computationNodePtr = builder.NotEqual(input0Node, input1Node, function->Name());
break;
case PrimitiveOpType::Less:
computationNodePtr = builder.Less(input0Node, input1Node, function->Name());
break;
case PrimitiveOpType::LessEqual:
computationNodePtr = builder.LessEqual(input0Node, input1Node, function->Name());
break;
case PrimitiveOpType::Greater:
computationNodePtr = builder.Greater(input0Node, input1Node, function->Name());
break;
case PrimitiveOpType::GreaterEqual:
computationNodePtr = builder.GreaterEqual(input0Node, input1Node, function->Name());
break;
case PrimitiveOpType::Times:
{
size_t numOutputAxes = functionConfig[L"numOutputAxes"].GetValue<size_t>();
computationNodePtr = builder.Times(input0Node, input1Node, numOutputAxes, function->Name());
break;
}
case PrimitiveOpType::Convolution:
{
NDShape outputMapCount, kernelShape;
std::tie(outputMapCount, kernelShape) = GetConvolutionOutputMapCountAndKernelShape(functionInputs[0].Shape(), functionInputs[1].Shape());
auto strides = functionConfig[L"strides"].GetValue<NDShape>();
auto lowerPad = functionConfig[L"lowerPad"].GetValue<NDShape>();
auto upperPad = functionConfig[L"upperPad"].GetValue<NDShape>();
auto sharing = AsBasicElementTypeVector<bool>(functionConfig[L"sharing"].GetValue<std::vector<DictionaryValue>>());
auto autoPadding = AsBasicElementTypeVector<bool>(functionConfig[L"autoPadding"].GetValue<std::vector<DictionaryValue>>());
auto transpose = functionConfig[L"transpose"].GetValue<bool>();
auto maxTempMemSizeInSamples = functionConfig[L"maxTempMemSizeInSamples"].GetValue<size_t>();
computationNodePtr = builder.Convolution(input0Node, input1Node, AsTensorShape(kernelShape, true), AsTensorShape(outputMapCount, true), AsTensorShape(strides, true), sharing, autoPadding, AsTensorShape(lowerPad, true), AsTensorShape(upperPad, true), transpose, ImageLayoutKind::CHW, maxTempMemSizeInSamples, function->Name());
break;
}
case PrimitiveOpType::SquaredError:
computationNodePtr = builder.SquareError(input0Node, input1Node, function->Name());
break;
case PrimitiveOpType::CrossEntropyWithSoftmax:
computationNodePtr = builder.CrossEntropyWithSoftmax(input1Node, input0Node, function->Name());
break;
case PrimitiveOpType::ClassificationError:
computationNodePtr = builder.ErrorPrediction(input1Node, input0Node, function->Name());
break;
case PrimitiveOpType::Exp:
computationNodePtr = builder.Exp(input0Node, function->Name());
break;
case PrimitiveOpType::PastValue:
case PrimitiveOpType::FutureValue:
{
@ -231,9 +320,6 @@ namespace CNTK
break;
}
case PrimitiveOpType::ElementTimes:
computationNodePtr = builder.ElementTimes(input0Node, input1Node, function->Name());
break;
case PrimitiveOpType::ReduceSum:
{
// TODO: Use the new ReduceElements node instead of the legacy SumElements node for reduction. Currently ReduceElements has incorrect MBLayout inference.
@ -241,6 +327,23 @@ namespace CNTK
computationNodePtr = builder.Sum(input0Node, function->Name());
break;
}
case PrimitiveOpType::BatchNormalization:
{
auto spacial = functionConfig[L"spacial"].GetValue<bool>();
auto normalizationTimeConstant = functionConfig[L"normalizationTimeConstant"].GetValue<double>();
auto blendTimeConstant = functionConfig[L"blendTimeConstant"].GetValue<double>();
auto epsilon = functionConfig[L"epsilon"].GetValue<double>();
auto useCuDNNEngine = functionConfig[L"useCuDNNEngine"].GetValue<bool>();
std::vector<std::shared_ptr<ComputationNode<ElementType>>> inputNodes;
for (auto inputVar : functionInputs)
{
auto baseNodePtr = GetNode(inputVar, network, builder, variableToNodeMap, isVariableRootMap);
inputNodes.push_back((baseNodePtr != nullptr) ? baseNodePtr->template As<ComputationNode<ElementType>>()->shared_from_this() : nullptr);
}
computationNodePtr = builder.BatchNormalization(inputNodes[0], inputNodes[1], inputNodes[2], inputNodes[3], inputNodes[4], spacial, normalizationTimeConstant, blendTimeConstant, epsilon, !useCuDNNEngine, ImageLayoutKind::CHW, function->Name());
break;
}
case PrimitiveOpType::Combine:
// This operation is just a no-op and is a means to combine multiple functions to create a single Function
// whose outputs are a union of the outputs of the Functions being combined.
@ -351,7 +454,7 @@ namespace CNTK
auto outputShape = outputVar.Shape();
auto computationNodeSampleLayout = computationNodePtr->GetSampleLayout();
if (((outputShape.NumAxes() == 0) && (computationNodeSampleLayout[0] != 1)) ||
((outputShape.NumAxes() != 0) && (computationNodeSampleLayout != AsTensorShape(outputShape))))
((outputShape.NumAxes() != 0) && (computationNodeSampleLayout != AsTensorShape(outputShape)) && (computationNodeSampleLayout != AsTensorShape(outputShape, true))))
{
LogicError("The output Variable shape %s does not match the SampleLayout shape %s of the corresponding ComputationNode in the network", AsString(outputShape).c_str(), ((std::string)computationNodeSampleLayout).c_str());
}
@ -486,18 +589,9 @@ namespace CNTK
}
template <typename ElementType>
/*static*/ ValuePtr CompositeFunction::GetValueObjectFromCNTKImplMatrixAndMBLayout(Variable var, const Matrix<ElementType>& matrix, const MBLayoutPtr& layout)
/*static*/ ValuePtr CompositeFunction::GetValueObjectFromCNTKImplMatrixAndMBLayout(const NDShape& sampleShape, const Matrix<ElementType>& matrix, const MBLayoutPtr& layout, bool readOnly /*= true*/)
{
if (var.DynamicAxes().size() > 1)
LogicError("More than one dynamic axis for a variable is currently unsupported");
if (AsDataType<ElementType>() != var.GetDataType())
LogicError("The specified ElementType %s does not match the DataType %s", typeid(ElementType).name(), DataTypeName(var.GetDataType()));
if ((layout != nullptr) && (matrix.GetNumRows() != var.Shape().TotalSize()))
LogicError("Unexpected matrix layout: The number of rows in the matrix does not match the sample size of the Variable");
NDShape valueDataShape = var.Shape();
NDShape valueDataShape = sampleShape;
if (layout != nullptr)
valueDataShape = valueDataShape.AppendShape({ layout->GetNumTimeSteps(), layout->GetNumSequences() });
@ -506,7 +600,7 @@ namespace CNTK
{
// Just create a view over the existing matrix itself
auto tensorView = new TensorView<ElementType>(std::make_shared<Matrix<ElementType>>(matrix.AsReference()), AsTensorShape(valueDataShape));
auto data = MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), AsDeviceDescriptor(matrix.GetDeviceId()), AsStorageFormat(matrix.GetFormat()), valueDataShape, true, tensorView);
auto data = MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), AsDeviceDescriptor(matrix.GetDeviceId()), AsStorageFormat(matrix.GetFormat()), valueDataShape, readOnly, tensorView);
return MakeSharedObject<Value>(data);
}
@ -565,10 +659,25 @@ namespace CNTK
}
auto tensorView = new TensorView<ElementType>(shuffledMatrixData, AsTensorShape(valueDataShape));
auto data = MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), AsDeviceDescriptor(matrix.GetDeviceId()), StorageFormat::Dense, valueDataShape, true, tensorView);
auto data = MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), AsDeviceDescriptor(matrix.GetDeviceId()), StorageFormat::Dense, valueDataShape, readOnly, tensorView);
return MakeSharedObject<Value>(data, mask);
}
template <typename ElementType>
/*static*/ ValuePtr CompositeFunction::GetValueObjectFromCNTKImplMatrixAndMBLayout(Variable var, const Matrix<ElementType>& matrix, const MBLayoutPtr& layout, bool readOnly /*= true*/)
{
if (var.DynamicAxes().size() > 1)
LogicError("More than one dynamic axis for a variable is currently unsupported");
if (AsDataType<ElementType>() != var.GetDataType())
LogicError("The specified ElementType %s does not match the DataType %s", typeid(ElementType).name(), DataTypeName(var.GetDataType()));
if ((layout != nullptr) && (matrix.GetNumRows() != var.Shape().TotalSize()))
LogicError("Unexpected matrix layout: The number of rows in the matrix does not match the sample size of the Variable");
return GetValueObjectFromCNTKImplMatrixAndMBLayout(var.Shape(), matrix, layout, readOnly);
}
template <typename ElementType>
/*static*/ void CompositeFunction::PopulateComputationNodeValue(const std::pair<Variable, ValuePtr>& variableValue, ComputationNodeBasePtr& computationNode)
{
@ -583,7 +692,7 @@ namespace CNTK
computationNode->GetMBLayout()->CopyFrom(layout);
}
void CompositeFunction::PopulateNetworkInputs(const std::unordered_map<Variable, const ValuePtr>& arguments)
void CompositeFunction::PopulateNetworkInputs(const std::unordered_map<Variable, ValuePtr>& arguments)
{
auto functionArguments = this->Arguments();
std::vector<ComputationNodeBasePtr> inputNodes;
@ -628,7 +737,7 @@ namespace CNTK
}
// Assign the supplied gradients corresponding to the root(s) of the network to be backpropagated through the graph
void CompositeFunction::PopulateNetworkGradients(const std::unordered_map<Variable, const ValuePtr>& gradients)
void CompositeFunction::PopulateNetworkGradients(const std::unordered_map<Variable, ValuePtr>& gradients)
{
auto functionOutputs = this->Outputs();
for (auto gradientVarValuePair : gradients)
@ -676,45 +785,48 @@ namespace CNTK
return NDShape(outputShapeDims);
}
/*static*/ void CompositeFunction::GetNodeOutputOrGradient(Variable var, ValuePtr& varValue, Microsoft::MSR::CNTK::ComputationNodeBasePtr& computationNode, bool getGradient)
{
auto valueShape = GetValueShape(var, computationNode);
if (varValue != nullptr)
{
// TODO: The shape of the specified output Value object must match the actual output shape
if (varValue->Data()->Shape() != valueShape)
InvalidArgument("The shape %s of the specified Value object for %s does not match the actual shape %s", AsString(varValue->Data()->Shape()).c_str(), getGradient ? "gradient" : "output", AsString(valueShape).c_str());
}
ValuePtr nodeValue;
switch (var.GetDataType())
{
case DataType::Float:
nodeValue = GetValueObjectFromCNTKImplMatrixAndMBLayout<float>(var,
getGradient ? computationNode->As<ComputationNode<float>>()->Gradient() : computationNode->As<ComputationNode<float>>()->Value(),
computationNode->GetMBLayout());
break;
case DataType::Double:
nodeValue = GetValueObjectFromCNTKImplMatrixAndMBLayout<double>(var,
getGradient ? computationNode->As<ComputationNode<double>>()->Gradient() : computationNode->As<ComputationNode<double>>()->Value(),
computationNode->GetMBLayout());
break;
default:
LogicError("Unsupported DataType %s", DataTypeName(var.GetDataType()));
break;
}
if (varValue == nullptr)
{
auto data = MakeSharedObject<NDArrayView>(var.GetDataType(), valueShape, AsDeviceDescriptor(computationNode->ValuePtr()->GetDeviceId()));
auto mask = (nodeValue->Mask() != nullptr) ? MakeSharedObject<NDMask>(nodeValue->Mask()->Shape(), nodeValue->Mask()->Device()) : nullptr;
varValue = MakeSharedObject<Value>(data, mask);
}
varValue->CopyFrom(*nodeValue);
}
void CompositeFunction::GetNetworkOutputs(std::unordered_map<Variable, ValuePtr>& outputs)
{
// Now copy the Forward values of output nodes from the network to outputs' Value objects
for (auto outputVarValuePair : outputs)
{
auto computationNodePtr = m_variableToNodeMap[outputVarValuePair.first];
auto outputValuePtr = outputVarValuePair.second;
auto outputShape = GetValueShape(outputVarValuePair.first, computationNodePtr);
if (outputValuePtr != nullptr)
{
// TODO: The shape of the specified output Value object must match the actual output shape
if (outputValuePtr->Data()->Shape() != outputShape)
InvalidArgument("The shape %s of the specified Value object for output does not match the actual output shape %s", AsString(outputValuePtr->Data()->Shape()).c_str(), AsString(outputShape).c_str());
}
ValuePtr nodeValue;
switch (outputVarValuePair.first.GetDataType())
{
case DataType::Float:
nodeValue = GetValueObjectFromCNTKImplMatrixAndMBLayout<float>(outputVarValuePair.first, computationNodePtr->As<ComputationNode<float>>()->Value(), computationNodePtr->GetMBLayout());
break;
case DataType::Double:
nodeValue = GetValueObjectFromCNTKImplMatrixAndMBLayout<double>(outputVarValuePair.first, computationNodePtr->As<ComputationNode<double>>()->Value(), computationNodePtr->GetMBLayout());
break;
default:
LogicError("Unsupported DataType %s", DataTypeName(outputVarValuePair.first.GetDataType()));
break;
}
if (outputValuePtr == nullptr)
{
auto data = MakeSharedObject<NDArrayView>(outputVarValuePair.first.GetDataType(), outputShape, AsDeviceDescriptor(computationNodePtr->ValuePtr()->GetDeviceId()));
auto mask = (nodeValue->Mask() != nullptr) ? MakeSharedObject<NDMask>(nodeValue->Mask()->Shape(), nodeValue->Mask()->Device()) : nullptr;
outputValuePtr = MakeSharedObject<Value>(data, mask);
}
outputValuePtr->CopyFrom(*nodeValue);
outputs[outputVarValuePair.first] = outputValuePtr;
}
GetNodeOutputOrGradient(outputVarValuePair.first, outputs[outputVarValuePair.first], m_variableToNodeMap[outputVarValuePair.first], false /*getGradient*/);
}
void CompositeFunction::GetNetworkGradients(std::unordered_map<Variable, ValuePtr>& gradients)
@ -732,46 +844,15 @@ namespace CNTK
InvalidArgument("Gradient value incorrectly requested for an Output or Constant Variable, or an Input Variable with NeedsGradient setting of false");
auto computationNodePtr = m_variableToNodeMap[gradientVarValuePair.first];
auto gradientValuePtr = gradientVarValuePair.second;
auto gradientShape = GetValueShape(gradientVarValuePair.first, computationNodePtr);
if (gradientValuePtr != nullptr)
{
// TODO: The shape of the specified output Value object must match the actual output shape
if (gradientValuePtr->Data()->Shape() != gradientShape)
InvalidArgument("The shape %s of the specified Value object for gradient does not match the actual gradient shape %s", AsString(gradientValuePtr->Data()->Shape()).c_str(), AsString(gradientShape).c_str());
}
if (!computationNodePtr->NeedsGradient())
LogicError("Backpropagated gradient value cannot be read from a ComputationNode that has NeedsGradient set to false");
ValuePtr nodeValue;
switch (gradientVarValuePair.first.GetDataType())
{
case DataType::Float:
nodeValue = GetValueObjectFromCNTKImplMatrixAndMBLayout<float>(gradientVarValuePair.first, computationNodePtr->As<ComputationNode<float>>()->Gradient(), computationNodePtr->GetMBLayout());
break;
case DataType::Double:
nodeValue = GetValueObjectFromCNTKImplMatrixAndMBLayout<double>(gradientVarValuePair.first, computationNodePtr->As<ComputationNode<double>>()->Gradient(), computationNodePtr->GetMBLayout());
break;
default:
LogicError("Unsupported DataType %s", DataTypeName(gradientVarValuePair.first.GetDataType()));
break;
}
if (gradientValuePtr == nullptr)
{
auto data = MakeSharedObject<NDArrayView>(gradientVarValuePair.first.GetDataType(), gradientShape, AsDeviceDescriptor(computationNodePtr->ValuePtr()->GetDeviceId()));
auto mask = (nodeValue->Mask() != nullptr) ? MakeSharedObject<NDMask>(nodeValue->Mask()->Shape(), nodeValue->Mask()->Device()) : nullptr;
gradientValuePtr = MakeSharedObject<Value>(data, mask);
}
gradientValuePtr->CopyFrom(*nodeValue);
gradients[gradientVarValuePair.first] = gradientValuePtr;
GetNodeOutputOrGradient(gradientVarValuePair.first, gradients[gradientVarValuePair.first], computationNodePtr, true /*getGradient*/);
}
}
/*virtual*/ BackPropStatePtr CompositeFunction::Forward(const std::unordered_map<Variable, const ValuePtr>& arguments,
/*virtual*/ BackPropStatePtr CompositeFunction::Forward(const std::unordered_map<Variable, ValuePtr>& arguments,
std::unordered_map<Variable, ValuePtr>& outputs,
const DeviceDescriptor& computeDevice,
const std::unordered_set<Variable>& outputsToRetainBackwardStateFor)
@ -809,6 +890,8 @@ namespace CNTK
outputsToEvaluate.push_back(m_variableToNodeMap[rootVarForBackprop]);
}
ScopedNetworkOperationMode modeGuard(m_computationNetwork, outputsToRetainBackwardStateFor.empty() ? NetworkOperationMode::inferring : NetworkOperationMode::training);
m_computationNetwork->ForwardProp(outputsToEvaluate);
GetNetworkOutputs(outputs);
@ -819,7 +902,7 @@ namespace CNTK
}
/*virtual*/ void CompositeFunction::Backward(const BackPropStatePtr& state,
const std::unordered_map<Variable, const ValuePtr>& rootGradientValues,
const std::unordered_map<Variable, ValuePtr>& rootGradientValues,
std::unordered_map<Variable, ValuePtr>& backPropagatedGradientValuesForInputs)
{
auto backpropState = dynamic_cast<const CNTKBackPropState*>(state.get());
@ -844,6 +927,8 @@ namespace CNTK
PopulateNetworkGradients(rootGradientValues);
// Backpropagate through the network
ScopedNetworkOperationMode modeGuard(m_computationNetwork, NetworkOperationMode::training);
auto rootComputationNodePtr = m_variableToNodeMap[rootGradientValues.begin()->first];
m_computationNetwork->GetNestedNetwork(rootComputationNodePtr)->Backprop(FrameRange(nullptr), true, true);
@ -852,27 +937,261 @@ namespace CNTK
// TODO: How to deal with the specified 'computeDevice'
}
FunctionPtr Times(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name/* = L""*/)
FunctionPtr UnaryOp(PrimitiveOpType op, const Variable& operand, Dictionary&& opConfig, const std::wstring& name)
{
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::Times, std::vector<Variable>({ leftOperand, rightOperand }), Dictionary(), name), name);
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(op, std::vector<Variable>({ operand }), std::move(opConfig), name), name);
}
FunctionPtr Plus(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name/* = L""*/)
FunctionPtr Negate(const Variable& operand, const std::wstring& name/* = L""*/)
{
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::Plus, std::vector<Variable>({ leftOperand, rightOperand }), Dictionary(), name), name);
return UnaryOp(PrimitiveOpType::Negate, operand, Dictionary(), name);
}
FunctionPtr Sigmoid(const Variable& operand, const std::wstring& name/* = L""*/)
{
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::Sigmoid, std::vector<Variable>({ operand }), Dictionary(), name), name);
return UnaryOp(PrimitiveOpType::Sigmoid, operand, Dictionary(), name);
}
FunctionPtr Tanh(const Variable& operand, const std::wstring& name/* = L""*/)
{
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::Tanh, std::vector<Variable>({ operand }), Dictionary(), name), name);
return UnaryOp(PrimitiveOpType::Tanh, operand, Dictionary(), name);
}
FunctionPtr Combine(const std::initializer_list<FunctionPtr>& operands, const std::wstring& name/* = L""*/)
FunctionPtr ReLU(const Variable& operand, const std::wstring& name/* = L""*/)
{
return UnaryOp(PrimitiveOpType::ReLU, operand, Dictionary(), name);
}
FunctionPtr Exp(const Variable& operand, const std::wstring& name/* = L""*/)
{
return UnaryOp(PrimitiveOpType::Exp, operand, Dictionary(), name);
}
FunctionPtr Log(const Variable& operand, const std::wstring& name/* = L""*/)
{
return UnaryOp(PrimitiveOpType::Log, operand, Dictionary(), name);
}
FunctionPtr Square(const Variable& operand, const std::wstring& name/* = L""*/)
{
return ElementTimes(operand, operand, name);
}
FunctionPtr Sqrt(const Variable& operand, const std::wstring& name/* = L""*/)
{
return UnaryOp(PrimitiveOpType::Sqrt, operand, Dictionary(), name);
}
FunctionPtr Round(const Variable& operand, const std::wstring& name/* = L""*/)
{
return Floor(Plus(operand, Constant(NDShape({}), 0.5f)), name);
}
FunctionPtr Floor(const Variable& operand, const std::wstring& name/* = L""*/)
{
return UnaryOp(PrimitiveOpType::Floor, operand, Dictionary(), name);
}
FunctionPtr Ceil(const Variable& operand, const std::wstring& name/* = L""*/)
{
return Negate(Floor(Negate(operand)), name);
}
FunctionPtr Abs(const Variable& operand, const std::wstring& name/* = L""*/)
{
return UnaryOp(PrimitiveOpType::Abs, operand, Dictionary(), name);
}
FunctionPtr Reciprocal(const Variable& operand, const std::wstring& name/* = L""*/)
{
return UnaryOp(PrimitiveOpType::Reciprocal, operand, Dictionary(), name);
}
FunctionPtr Softmax(const Variable& operand, const std::wstring& name/* = L""*/)
{
return UnaryOp(PrimitiveOpType::Softmax, operand, Dictionary(), name);
}
FunctionPtr BinaryOp(PrimitiveOpType op, const Variable& leftOperand, const Variable& rightOperand, Dictionary&& opConfig, const std::wstring& name)
{
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(op, std::vector<Variable>({ leftOperand, rightOperand }), std::move(opConfig), name), name);
}
FunctionPtr Plus(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name/* = L""*/)
{
return BinaryOp(PrimitiveOpType::Plus, leftOperand, rightOperand, Dictionary(), name);
}
FunctionPtr Minus(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name/* = L""*/)
{
return BinaryOp(PrimitiveOpType::Minus, leftOperand, rightOperand, Dictionary(), name);
}
FunctionPtr ElementTimes(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name/* = L""*/)
{
return BinaryOp(PrimitiveOpType::ElementTimes, leftOperand, rightOperand, Dictionary(), name);
}
FunctionPtr ElementDivide(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name/* = L""*/)
{
return ElementTimes(leftOperand, Reciprocal(rightOperand), name);
}
FunctionPtr Equal(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name/* = L""*/)
{
return BinaryOp(PrimitiveOpType::Equal, leftOperand, rightOperand, Dictionary(), name);
}
FunctionPtr NotEqual(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name/* = L""*/)
{
return BinaryOp(PrimitiveOpType::NotEqual, leftOperand, rightOperand, Dictionary(), name);
}
FunctionPtr Less(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name/* = L""*/)
{
return BinaryOp(PrimitiveOpType::Less, leftOperand, rightOperand, Dictionary(), name);
}
FunctionPtr LessEqual(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name/* = L""*/)
{
return BinaryOp(PrimitiveOpType::LessEqual, leftOperand, rightOperand, Dictionary(), name);
}
FunctionPtr Greater(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name/* = L""*/)
{
return BinaryOp(PrimitiveOpType::Greater, leftOperand, rightOperand, Dictionary(), name);
}
FunctionPtr GreaterEqual(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name/* = L""*/)
{
return BinaryOp(PrimitiveOpType::GreaterEqual, leftOperand, rightOperand, Dictionary(), name);
}
FunctionPtr Times(const Variable& leftOperand, const Variable& rightOperand, size_t numOutputAxes /*= 1*/, const std::wstring& name/* = L""*/)
{
auto additionalProperties = Dictionary();
additionalProperties[L"numOutputAxes"] = numOutputAxes;
return BinaryOp(PrimitiveOpType::Times, leftOperand, rightOperand, std::move(additionalProperties), name);
}
FunctionPtr SquaredError(const Variable& prediction, const Variable& targets, const std::wstring& name/* = L""*/)
{
return BinaryOp(PrimitiveOpType::SquaredError, prediction, targets, Dictionary(), name);
}
FunctionPtr CrossEntropyWithSoftmax(const Variable& prediction, const Variable& labels, const std::wstring& name/* = L""*/)
{
return BinaryOp(PrimitiveOpType::CrossEntropyWithSoftmax, prediction, labels, Dictionary(), name);
}
FunctionPtr ClassificationError(const Variable& prediction, const Variable& labels, const std::wstring& name/* = L""*/)
{
return BinaryOp(PrimitiveOpType::ClassificationError, prediction, labels, Dictionary(), name);
}
FunctionPtr PastValue(const Variable& initialState, const Variable& operand, size_t stepSize, const std::wstring& name/* = L""*/)
{
if (operand.DynamicAxes().size() != 1)
InvalidArgument("PastValue overload that does not explicitly specify a dynamic axis can only be used for operands with exactly one dynamic axis");
auto additionalProperties = Dictionary();
additionalProperties[L"stepSize"] = DictionaryValue(stepSize);
return BinaryOp(PrimitiveOpType::PastValue, initialState, operand, std::move(additionalProperties), name);
}
FunctionPtr FutureValue(const Variable& initialState, const Variable& operand, size_t stepSize, const std::wstring& name/* = L""*/)
{
if (operand.DynamicAxes().size() != 1)
InvalidArgument("FutureValue overload that does not explicitly specify a dynamic axis can only be used for operands with exactly one dynamic axis");
auto additionalProperties = Dictionary();
additionalProperties[L"stepSize"] = DictionaryValue(stepSize);
return BinaryOp(PrimitiveOpType::FutureValue, initialState, operand, std::move(additionalProperties), name);
}
FunctionPtr ReduceSum(const Variable& operand, const std::wstring& name/* = L""*/)
{
return UnaryOp(PrimitiveOpType::ReduceSum, operand, Dictionary(), name);
}
FunctionPtr PerDimMeanVarianceNormalize(const Variable& operand, const NDArrayViewPtr& mean, const NDArrayViewPtr& invStdDev, const std::wstring& name /*= L""*/)
{
Constant meanVar(mean);
Constant invStdDevVar(invStdDev);
return ElementTimes(Minus(operand, meanVar), invStdDevVar);
}
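// --- Illustrative usage sketch (not part of the original source) ---
// Ties PerDimMeanVarianceNormalize above to ComputeInputPerDimMeansAndInvStdDevs from the reader API;
// the pair is assumed here to hold (mean, inverse standard deviation) in that order.
FunctionPtr NormalizeInputSketch(const Variable& input,
                                 const std::pair<NDArrayViewPtr, NDArrayViewPtr>& meanAndInvStdDev)
{
    return PerDimMeanVarianceNormalize(input, meanAndInvStdDev.first, meanAndInvStdDev.second);
}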
FunctionPtr Convolution(const Variable& convolutionMap,
const Variable& operand,
const NDShape& strides,
const std::vector<bool>& sharing,
const std::vector<bool>& autoPadding,
const NDShape& lowerPad,
const NDShape& upperPad,
bool transpose,
size_t maxTempMemSizeInSamples,
const std::wstring& name)
{
auto additionalProperties = Dictionary();
additionalProperties[L"strides"] = strides;
additionalProperties[L"sharing"] = AsDictionaryValueVector(sharing);
additionalProperties[L"autoPadding"] = AsDictionaryValueVector(autoPadding);
additionalProperties[L"lowerPad"] = lowerPad;
additionalProperties[L"upperPad"] = upperPad;
additionalProperties[L"transpose"] = transpose;
additionalProperties[L"maxTempMemSizeInSamples"] = maxTempMemSizeInSamples;
return BinaryOp(PrimitiveOpType::Convolution, convolutionMap, operand, std::move(additionalProperties), name);
}
FunctionPtr Pooling(const Variable& operand,
PoolingType poolingType,
const NDShape& poolingWindowShape,
const NDShape& strides,
const std::vector<bool>& autoPadding,
const NDShape& lowerPad,
const NDShape& upperPad,
const std::wstring& name)
{
auto additionalProperties = Dictionary();
additionalProperties[L"poolingType"] = (size_t)poolingType;
additionalProperties[L"poolingWindowShape"] = poolingWindowShape;
additionalProperties[L"strides"] = strides;
additionalProperties[L"autoPadding"] = AsDictionaryValueVector(autoPadding);
additionalProperties[L"lowerPad"] = lowerPad;
additionalProperties[L"upperPad"] = upperPad;
return UnaryOp(PrimitiveOpType::Pooling, operand, std::move(additionalProperties), name);
}
FunctionPtr BatchNormalization(const Variable& operand,
const Variable& scale,
const Variable& bias,
const Variable& runningMean,
const Variable& runningInvStd,
bool spacial,
double normalizationTimeConstant,
double blendTimeConstant,
double epsilon,
bool useCuDNNEngine,
const std::wstring& name)
{
auto additionalProperties = Dictionary();
additionalProperties[L"spacial"] = spacial;
additionalProperties[L"normalizationTimeConstant"] = normalizationTimeConstant;
additionalProperties[L"blendTimeConstant"] = blendTimeConstant;
additionalProperties[L"epsilon"] = epsilon;
additionalProperties[L"useCuDNNEngine"] = useCuDNNEngine;
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::BatchNormalization,
std::vector<Variable>({ operand, scale, bias, runningMean, runningInvStd }),
std::move(additionalProperties),
name),
name);
}
FunctionPtr Combine(const std::vector<FunctionPtr>& operands, const std::wstring& name/* = L""*/)
{
std::unordered_set<FunctionPtr> uniqueOperands;
std::vector<Variable> inputs;
@ -888,49 +1207,4 @@ namespace CNTK
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::Combine, inputs, Dictionary(), name), name);
}
FunctionPtr CrossEntropyWithSoftmax(const Variable& output, const Variable& labels, const std::wstring& name/* = L""*/)
{
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::CrossEntropyWithSoftmax, std::vector<Variable>({ output, labels }), Dictionary(), name), name);
}
FunctionPtr ClassificationError(const Variable& prediction, const Variable& labels, const std::wstring& name/* = L""*/)
{
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::ClassificationError, std::vector<Variable>({ prediction, labels }), Dictionary(), name), name);
}
FunctionPtr Exp(const Variable& operand, const std::wstring& name/* = L""*/)
{
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::Exp, std::vector<Variable>({ operand }), Dictionary(), name), name);
}
FunctionPtr PastValue(const Variable& initialState, const Variable& operand, size_t stepSize, const std::wstring& name/* = L""*/)
{
if (operand.DynamicAxes().size() != 1)
InvalidArgument("PastValue overload that does not explicitly specify a dynamic axis can only be used for operands with exactly one dynamic axis");
auto additionalProperties = Dictionary();
additionalProperties[L"stepSize"] = DictionaryValue(stepSize);
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::PastValue, std::vector<Variable>({ initialState, operand }), std::move(additionalProperties), name), name);
}
FunctionPtr FutureValue(const Variable& initialState, const Variable& operand, size_t stepSize, const std::wstring& name/* = L""*/)
{
if (operand.DynamicAxes().size() != 1)
InvalidArgument("FutureValue overload that does not explicitly specify a dynamic axis can only be used for operands with exactly one dynamic axis");
auto additionalProperties = Dictionary();
additionalProperties[L"stepSize"] = DictionaryValue(stepSize);
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::FutureValue, std::vector<Variable>({ initialState, operand }), std::move(additionalProperties), name), name);
}
FunctionPtr ElementTimes(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name/* = L""*/)
{
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::ElementTimes, std::vector<Variable>({ leftOperand, rightOperand }), Dictionary(), name), name);
}
FunctionPtr ReduceSum(const Variable& operand, const std::wstring& name/* = L""*/)
{
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::ReduceSum, std::vector<Variable>({ operand }), Dictionary(), name), name);
}
}
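A small composition sketch using the operator factories defined above (not part of the original source). It assumes the input and parameter Variables are created elsewhere, that each intermediate Function exposes its single output via Output(), and that the default arguments indicated in the definitions above exist in the declarations.
// Hypothetical sketch: wire a one-layer classifier out of the primitive-op factories above.
CNTK::FunctionPtr OneLayerClassifierSketch(const CNTK::Variable& features,
                                           const CNTK::Variable& labels,
                                           const CNTK::Variable& weights,
                                           const CNTK::Variable& bias)
{
    // z = Sigmoid(W * x + b); Output() extracts the single output Variable of each intermediate Function.
    auto affine = CNTK::Plus(CNTK::Times(weights, features)->Output(), bias);
    auto z = CNTK::Sigmoid(affine->Output());
    // Training criterion and evaluation metric on the prediction.
    auto loss = CNTK::CrossEntropyWithSoftmax(z->Output(), labels);
    auto error = CNTK::ClassificationError(z->Output(), labels);
    // A single composite root exposing the prediction, loss, and error as its outputs.
    return CNTK::Combine({ z, loss, error });
}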

View file

@ -10,65 +10,110 @@
#include <iterator>
#include "ComputationNetwork.h"
#include "Utils.h"
#include "ConvolveGeometry.h"
namespace CNTK
{
enum class PrimitiveOpType
enum class PrimitiveOpType : unsigned int
{
Plus,
Times,
Negate,
Sigmoid,
Tanh,
Combine,
ReLU,
Exp,
Log,
Sqrt,
Floor,
Abs,
Reciprocal,
Softmax,
Pooling,
Plus,
Minus,
ElementTimes,
Equal,
NotEqual,
Less,
LessEqual,
Greater,
GreaterEqual,
Times,
Convolution,
SquaredError,
CrossEntropyWithSoftmax,
ClassificationError,
Exp,
PastValue,
FutureValue,
ElementTimes,
ReduceSum
ReduceSum,
BatchNormalization,
Combine,
};
}
namespace std
{
template <> struct hash<CNTK::PrimitiveOpType>
{
size_t operator()(const CNTK::PrimitiveOpType& x) const
{
return std::hash<unsigned int>()((unsigned int)x);
}
};
}
namespace CNTK
{
inline const char* PrimitiveOpTypeName(PrimitiveOpType opType)
{
// TODO: Put these in table form
if (opType == PrimitiveOpType::Plus)
return "Plus";
else if (opType == PrimitiveOpType::Times)
return "Times";
else if (opType == PrimitiveOpType::Sigmoid)
return "Sigmoid";
else if (opType == PrimitiveOpType::Tanh)
return "Tanh";
else if (opType == PrimitiveOpType::Combine)
return "Combine";
else if (opType == PrimitiveOpType::CrossEntropyWithSoftmax)
return "CrossEntropyWithSoftmax";
else if (opType == PrimitiveOpType::ClassificationError)
return "ClassificationError";
else if (opType == PrimitiveOpType::Exp)
return "Exp";
else if (opType == PrimitiveOpType::PastValue)
return "PastValue";
else if (opType == PrimitiveOpType::FutureValue)
return "FutureValue";
else if (opType == PrimitiveOpType::ElementTimes)
return "ElementTimes";
else if (opType == PrimitiveOpType::ReduceSum)
return "ReduceSum";
else
static std::unordered_map<PrimitiveOpType, const char*> primitiveOpNames = {
{ PrimitiveOpType::Negate, "Negate" },
{ PrimitiveOpType::Sigmoid, "Sigmoid" },
{ PrimitiveOpType::Tanh, "Tanh" },
{ PrimitiveOpType::ReLU, "ReLU" },
{ PrimitiveOpType::Exp, "Exp" },
{ PrimitiveOpType::Log, "Log" },
{ PrimitiveOpType::Sqrt, "Sqrt" },
{ PrimitiveOpType::Floor, "Floor" },
{ PrimitiveOpType::Abs, "Abs" },
{ PrimitiveOpType::Reciprocal, "Reciprocal" },
{ PrimitiveOpType::Softmax, "Softmax" },
{ PrimitiveOpType::Pooling, "Pooling" },
{ PrimitiveOpType::Plus, "Plus" },
{ PrimitiveOpType::Minus, "Minus" },
{ PrimitiveOpType::ElementTimes, "ElementTimes" },
{ PrimitiveOpType::Equal, "Equal" },
{ PrimitiveOpType::NotEqual, "NotEqual" },
{ PrimitiveOpType::Less, "Less" },
{ PrimitiveOpType::LessEqual, "LessEqual" },
{ PrimitiveOpType::Greater, "Greater" },
{ PrimitiveOpType::GreaterEqual, "GreaterEqual" },
{ PrimitiveOpType::Times, "Times" },
{ PrimitiveOpType::Convolution, "Convolution" },
{ PrimitiveOpType::SquaredError, "SquaredError" },
{ PrimitiveOpType::CrossEntropyWithSoftmax, "CrossEntropyWithSoftmax" },
{ PrimitiveOpType::ClassificationError, "ClassificationError" },
{ PrimitiveOpType::PastValue, "PastValue" },
{ PrimitiveOpType::FutureValue, "FutureValue" },
{ PrimitiveOpType::ReduceSum, "ReduceSum" },
{ PrimitiveOpType::BatchNormalization, "BatchNormalization" },
{ PrimitiveOpType::Combine, "Combine" }
};
if (primitiveOpNames.find(opType) == primitiveOpNames.end())
LogicError("Unknown PrimitiveOpType");
return primitiveOpNames.find(opType)->second;
}
class PrimitiveFunction final : public Function
{
public:
PrimitiveFunction(PrimitiveOpType op, const std::vector<Variable>& inputs, Dictionary&& functionConfig, const std::wstring& functionName = L"")
: Function(inputs, GetOutputVariables(op, inputs, this), nullptr, functionName), m_op(op), m_functionConfig(std::move(functionConfig))
: Function(inputs, GetOutputVariables(op, inputs, this, functionConfig), nullptr, functionName), m_op(op), m_functionConfig(std::move(functionConfig))
{
}
virtual BackPropStatePtr Forward(const std::unordered_map<Variable, const ValuePtr>& /*arguments*/,
virtual BackPropStatePtr Forward(const std::unordered_map<Variable, ValuePtr>& /*arguments*/,
std::unordered_map<Variable, ValuePtr>& /*outputs*/,
const DeviceDescriptor& /*computeDevice*/,
const std::unordered_set<Variable>& /*outputsToRetainBackwardStateFor*/) override
@ -77,7 +122,7 @@ namespace CNTK
}
virtual void Backward(const BackPropStatePtr& /*state*/,
const std::unordered_map<Variable, const ValuePtr>& /*rootGradientValues*/,
const std::unordered_map<Variable, ValuePtr>& /*rootGradientValues*/,
std::unordered_map<Variable, ValuePtr>& /*backPropagatedGradientValuesForInputs*/) override
{
NOT_IMPLEMENTED;
@ -131,25 +176,28 @@ namespace CNTK
return NDShape(std::move(outputDims));
}
static NDShape TimesOpOutputShape(const NDShape& leftOperandShape, const NDShape& rightOperandShape)
static NDShape TimesOpOutputShape(const NDShape& leftOperandShape, const NDShape& rightOperandShape, size_t numOutputAxes)
{
if (rightOperandShape.NumAxes() > 2)
RuntimeError("The right operand of a times operation can have at most 2 axes");
if (numOutputAxes == 0)
InvalidArgument("Output #axes of times operation should be at least one");
size_t numOutputAxes = rightOperandShape.NumAxes();
if (numOutputAxes > leftOperandShape.NumAxes())
InvalidArgument("Output #axes of times operation can at most be the #axes of the left operand");
if (leftOperandShape.NumAxes() != 2)
RuntimeError("The left operand of a times operation must have 2 axes");
size_t numReductionAxes = leftOperandShape.NumAxes() - numOutputAxes;
std::vector<size_t> outputDims(numOutputAxes);
outputDims[0] = leftOperandShape[0];
if (numOutputAxes > 1)
outputDims[1] = rightOperandShape[1];
// The 'numReductionAxes' trailing dimensions of the left operand's shape must match the corresponding leading
// dimensions of the right operand
if (leftOperandShape[1] != rightOperandShape[0])
RuntimeError("Left operand's shape %s is not compatible with right operand's shape %s for the times operation", AsString(leftOperandShape).c_str(), AsString(rightOperandShape).c_str());
if (rightOperandShape.NumAxes() != numReductionAxes)
RuntimeError("The right operand's #axes in a times operation should equal #axes being reduced over!");
return NDShape(std::move(outputDims));
if (leftOperandShape.SubShape(numOutputAxes) != rightOperandShape)
InvalidArgument("The trailing dimensions of the left operand (%s) do not match the right operand's dimensions (%s)",
AsString(leftOperandShape.SubShape(numOutputAxes)).c_str(),
AsString(rightOperandShape).c_str());
return leftOperandShape.SubShape(0, numOutputAxes);
}
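// Shape-inference example for the above: a left operand [512 x 784] times a right operand [784] with
// numOutputAxes = 1 reduces over the single trailing axis (SubShape(1) == [784] matches the right operand)
// and yields [512]; a left operand [512 x 28 x 28] times a right operand [28 x 28] likewise yields [512],
// reducing over the two trailing axes.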
static NDShape ReductionOpOutputShape(PrimitiveOpType op, const NDShape& operandShape, const std::vector<size_t>& reductionAxes)
@ -171,8 +219,22 @@ namespace CNTK
return NDShape(std::move(outputDims));
}
static NDShape ConvolutionOpOutputShape(const NDShape& operandShape, const NDShape& kernelShape, const NDShape& outputMapCount, const NDShape& strides,
const std::vector<bool>& sharing,
std::vector<bool>& autoPad, const NDShape& lowerPad, const NDShape& upperPad,
bool transpose)
{
decltype(&Microsoft::MSR::CNTK::ConvolveGeometry::ComputeOutputShape) computeOutputShapeFunc;
if (!transpose)
computeOutputShapeFunc = &Microsoft::MSR::CNTK::ConvolveGeometry::ComputeOutputShape;
else
computeOutputShapeFunc = &Microsoft::MSR::CNTK::ConvolveGeometry::ComputeInputShape;
return AsNDShape(computeOutputShapeFunc(AsTensorShape(operandShape, true), AsTensorShape(kernelShape, true), AsTensorShape(outputMapCount, true), AsTensorShape(strides, true), sharing, autoPad, AsTensorShape(lowerPad, true), AsTensorShape(upperPad, true)));
}
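// For transpose == false this forwards to ConvolveGeometry::ComputeOutputShape (the usual per-axis
// convolution geometry derived from kernel, stride and padding); for a transposed convolution the same
// geometry is inverted via ComputeInputShape, since the deconvolution output is the shape whose forward
// convolution would have produced the given operand.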
// TODO: Reconcile this with the ComputationNode::Validate functionality in core CNTK to avoid duplication of inference logic
static std::vector<Variable> GetOutputVariables(PrimitiveOpType op, const std::vector<Variable>& inputs, Function* owner)
static std::vector<Variable> GetOutputVariables(PrimitiveOpType op, const std::vector<Variable>& inputs, Function* owner, const Dictionary& functionConfig)
{
std::vector<Variable> outputs;
@ -195,32 +257,79 @@ namespace CNTK
switch (op)
{
case PrimitiveOpType::Negate:
case PrimitiveOpType::Sigmoid:
case PrimitiveOpType::Tanh:
case PrimitiveOpType::ReLU:
case PrimitiveOpType::Exp:
case PrimitiveOpType::Log:
case PrimitiveOpType::Sqrt:
case PrimitiveOpType::Floor:
case PrimitiveOpType::Abs:
case PrimitiveOpType::Reciprocal:
case PrimitiveOpType::Softmax:
assert(inputs.size() == 1);
outputs.push_back(Variable(UnaryElementwiseOpOutputShape(inputs[0].Shape()), outputDataType, owner, outputDynamicAxes));
break;
case PrimitiveOpType::PastValue:
case PrimitiveOpType::FutureValue:
assert(inputs.size() == 2);
outputs.push_back(Variable(UnaryElementwiseOpOutputShape(inputs[1].Shape()), outputDataType, owner, outputDynamicAxes));
case PrimitiveOpType::Pooling:
{
assert(inputs.size() == 1);
auto poolingWindowsShape = functionConfig[L"poolingWindowShape"].GetValue<NDShape>();
auto strides = functionConfig[L"strides"].GetValue<NDShape>();
auto lowerPad = functionConfig[L"lowerPad"].GetValue<NDShape>();
auto upperPad = functionConfig[L"upperPad"].GetValue<NDShape>();
auto autoPadding = AsBasicElementTypeVector<bool>(functionConfig[L"autoPadding"].GetValue<std::vector<DictionaryValue>>());
outputs.push_back(Variable(ConvolutionOpOutputShape(inputs[0].Shape(), poolingWindowsShape, { 1 }, strides, { true }, autoPadding, lowerPad, upperPad, false), outputDataType, owner, outputDynamicAxes));
break;
}
case PrimitiveOpType::Plus:
case PrimitiveOpType::Minus:
case PrimitiveOpType::ElementTimes:
case PrimitiveOpType::Equal:
case PrimitiveOpType::NotEqual:
case PrimitiveOpType::Less:
case PrimitiveOpType::LessEqual:
case PrimitiveOpType::Greater:
case PrimitiveOpType::GreaterEqual:
assert(inputs.size() == 2);
outputs.push_back(Variable(BinaryElementwiseOpOutputShape(op, inputs[0].Shape(), inputs[1].Shape()), outputDataType, owner, outputDynamicAxes));
break;
case PrimitiveOpType::Times:
{
assert(inputs.size() == 2);
outputs.push_back(Variable(TimesOpOutputShape(inputs[0].Shape(), inputs[1].Shape()), outputDataType, owner, outputDynamicAxes));
// TODO: Support dynamic axes on the left operand
if (!inputs[0].DynamicAxes().empty())
LogicError("Dynamic axes are currently unsupported for left operand of a Times operation");
size_t numOutputAxes = functionConfig[L"numOutputAxes"].GetValue<size_t>();
outputs.push_back(Variable(TimesOpOutputShape(inputs[0].Shape(), inputs[1].Shape(), numOutputAxes), outputDataType, owner, outputDynamicAxes));
break;
}
case PrimitiveOpType::Convolution:
{
assert(inputs.size() == 2);
auto strides = functionConfig[L"strides"].GetValue<NDShape>();
auto lowerPad = functionConfig[L"lowerPad"].GetValue<NDShape>();
auto upperPad = functionConfig[L"upperPad"].GetValue<NDShape>();
auto sharing = AsBasicElementTypeVector<bool>(functionConfig[L"sharing"].GetValue<std::vector<DictionaryValue>>());
auto autoPadding = AsBasicElementTypeVector<bool>(functionConfig[L"autoPadding"].GetValue<std::vector<DictionaryValue>>());
bool transpose = functionConfig[L"transpose"].GetValue<bool>();
if (inputs[0].Shape().NumAxes() < inputs[1].Shape().NumAxes())
InvalidArgument("The convolution map should have at least as many axes as the shape of the input it operates on!");
NDShape outputMapCount, kernelShape;
std::tie(outputMapCount, kernelShape) = GetConvolutionOutputMapCountAndKernelShape(inputs[0].Shape(), inputs[1].Shape());
outputs.push_back(Variable(ConvolutionOpOutputShape(inputs[1].Shape(), kernelShape, outputMapCount, strides, sharing, autoPadding, lowerPad, upperPad, transpose), outputDataType, owner, outputDynamicAxes));
break;
}
case PrimitiveOpType::SquaredError:
case PrimitiveOpType::CrossEntropyWithSoftmax:
case PrimitiveOpType::ClassificationError:
{
assert(inputs.size() == 2);
if (inputs[0].Shape().NumAxes() > 1)
if ((inputs[0].Shape().NumAxes() > 2) || ((inputs[0].Shape().NumAxes() > 1) && (inputs[0].Shape()[1] != 1)))
InvalidArgument("The shape of input operands for the %s operation should have at most one axis", PrimitiveOpTypeName(op));
auto predictionShape = inputs[0].Shape();
@ -235,6 +344,11 @@ namespace CNTK
outputs.push_back(Variable(ReductionOpOutputShape(op, predictionShape, reductionAxes), outputDataType, owner, {}));
break;
}
case PrimitiveOpType::PastValue:
case PrimitiveOpType::FutureValue:
assert(inputs.size() == 2);
outputs.push_back(Variable(UnaryElementwiseOpOutputShape(inputs[1].Shape()), outputDataType, owner, outputDynamicAxes));
break;
case PrimitiveOpType::ReduceSum:
{
assert(inputs.size() == 1);
@ -249,6 +363,9 @@ namespace CNTK
outputs.push_back(Variable(ReductionOpOutputShape(op, inputs[0].Shape(), reductionAxes), outputDataType, owner, reductionOutputDynamicAxes));
break;
}
case PrimitiveOpType::BatchNormalization:
outputs.push_back(Variable(UnaryElementwiseOpOutputShape(inputs[0].Shape()), outputDataType, owner, outputDynamicAxes));
break;
case PrimitiveOpType::Combine:
outputs = inputs;
break;
@ -288,10 +405,18 @@ namespace CNTK
class CompositeFunction final : public Function
{
friend class Function;
friend class CompositeMinibatchSource;
template <typename T, typename ...CtorArgTypes>
friend inline std::shared_ptr<T> MakeSharedObject(CtorArgTypes&& ...ctorArgs);
template <typename ElementType>
friend void SaveAsLegacyModel(const FunctionPtr& rootFunction, const std::wstring& modelFile);
friend void ComputeInputPerDimMeansAndInvStdDevs(const MinibatchSourcePtr& minibatchSource,
std::unordered_map<StreamInfo, std::pair<NDArrayViewPtr, NDArrayViewPtr>>& computedMeanAndInvStdDevs,
const DeviceDescriptor& device /*= DeviceDescriptor::CPUDevice()*/);
public:
static CompositeFunctionPtr Create(const FunctionPtr& rootFunction, const std::wstring& name = L"")
{
@ -303,13 +428,13 @@ namespace CNTK
return MakeSharedObject<CompositeFunction>(rootFunction, std::move(visitedFunctions), name);
}
virtual BackPropStatePtr Forward(const std::unordered_map<Variable, const ValuePtr>& arguments,
virtual BackPropStatePtr Forward(const std::unordered_map<Variable, ValuePtr>& arguments,
std::unordered_map<Variable, ValuePtr>& outputs,
const DeviceDescriptor& computeDevice,
const std::unordered_set<Variable>& outputsToRetainBackwardStateFor) override;
virtual void Backward(const BackPropStatePtr& state,
const std::unordered_map<Variable, const ValuePtr>& rootGradientValues,
const std::unordered_map<Variable, ValuePtr>& rootGradientValues,
std::unordered_map<Variable, ValuePtr>& backPropagatedGradientValuesForInputs) override;
private:
@ -361,12 +486,13 @@ namespace CNTK
template <typename ElementType>
static void PopulateComputationNodeValue(const std::pair<Variable, ValuePtr>& variableValue, Microsoft::MSR::CNTK::ComputationNodeBasePtr& computationNode);
void PopulateNetworkInputs(const std::unordered_map<Variable, const ValuePtr>& arguments);
void PopulateNetworkInputs(const std::unordered_map<Variable, ValuePtr>& arguments);
template <typename ElementType>
static void PopulateComputationNodeGradient(const std::pair<Variable, ValuePtr>& variableGradient, Microsoft::MSR::CNTK::ComputationNodeBasePtr& computationNode);
void PopulateNetworkGradients(const std::unordered_map<Variable, const ValuePtr>& gradients);
void PopulateNetworkGradients(const std::unordered_map<Variable, ValuePtr>& gradients);
static void GetNodeOutputOrGradient(Variable var, ValuePtr& varValue, Microsoft::MSR::CNTK::ComputationNodeBasePtr& computationNode, bool getGradient);
void GetNetworkOutputs(std::unordered_map<Variable, ValuePtr>& outputs);
void GetNetworkGradients(std::unordered_map<Variable, ValuePtr>& gradients);
@ -374,7 +500,9 @@ namespace CNTK
static std::pair<std::shared_ptr<const Microsoft::MSR::CNTK::Matrix<ElementType>>, Microsoft::MSR::CNTK::MBLayoutPtr> GetCNTKImplMatrixAndMBLayoutFromValueObject(Variable var, const ValuePtr& value);
template <typename ElementType>
static ValuePtr GetValueObjectFromCNTKImplMatrixAndMBLayout(Variable var, const Microsoft::MSR::CNTK::Matrix<ElementType>& matrix, const Microsoft::MSR::CNTK::MBLayoutPtr& layout);
static ValuePtr GetValueObjectFromCNTKImplMatrixAndMBLayout(const NDShape& sampleShape, const Microsoft::MSR::CNTK::Matrix<ElementType>& matrix, const Microsoft::MSR::CNTK::MBLayoutPtr& layout, bool readOnly = true);
template <typename ElementType>
static ValuePtr GetValueObjectFromCNTKImplMatrixAndMBLayout(Variable var, const Microsoft::MSR::CNTK::Matrix<ElementType>& matrix, const Microsoft::MSR::CNTK::MBLayoutPtr& layout, bool readOnly = true);
private:


@ -0,0 +1,451 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#include "Learner.h"
#include "TensorView.h"
#include "Utils.h"
#define UPDATE_FUNCTION \
switch (smoothedGradientValue->GetDataType()) \
{ \
case DataType::Float: \
Update<float>(parameter, gradientValue, smoothedGradientValue, trainingSampleCount); \
break; \
case DataType::Double: \
Update<double>(parameter, gradientValue, smoothedGradientValue, trainingSampleCount); \
break; \
default: \
NOT_IMPLEMENTED; \
}
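// The UPDATE_FUNCTION macro dispatches to the templatized Update<ElementType>() overload of the enclosing
// learner class based on the element type of the smoothed gradient, so each concrete learner only needs to
// provide the float/double template implementations plus a one-line virtual override.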
using namespace Microsoft::MSR::CNTK;
using namespace std;
namespace CNTK
{
template <typename ElementType>
/*static*/ shared_ptr<const Matrix<ElementType>> LearnerBase::GetMatrix(const NDArrayViewPtr& arrayView)
{
return arrayView->GetMatrix<ElementType>();
}
template <typename ElementType>
/*static*/ shared_ptr<Matrix<ElementType>> LearnerBase::GetWritableMatrix(const NDArrayViewPtr& arrayView)
{
return arrayView->GetWritableMatrix<ElementType>();
}
template <typename ElementType>
/*static*/ const TensorView<ElementType>* LearnerBase::GetTensorView(const NDArrayViewPtr& arrayView)
{
return arrayView->GetTensorView<ElementType>();
}
/*static*/ bool LearnerBase::HasNan(const NDArrayViewPtr& value, const char* name)
{
switch (value->GetDataType())
{
case DataType::Float:
return value->GetMatrix<float>()->HasNan(name);
case DataType::Double:
return value->GetMatrix<double>()->HasNan(name);
default:
LogicError("Unsupported DataType %s", DataTypeName(value->GetDataType()));
}
}
/*static*/ void LearnerBase::Print(const NDArrayViewPtr& value, const char* msg)
{
switch (value->GetDataType())
{
case DataType::Float:
value->GetMatrix<float>()->Print(msg);
break;
case DataType::Double:
value->GetMatrix<double>()->Print(msg);
break;
default:
LogicError("Unsupported DataType %s", DataTypeName(value->GetDataType()));
}
}
// Clip gradients to prevent outliers.
template <typename ElementType>
void LearnerBase::ClipGradient(Matrix<ElementType>& gradient, size_t actualMBSize) const
{
if (m_additionalOptions.gradientClippingThresholdPerSample != numeric_limits<double>::infinity())
{
double maxGradientPerMB = m_additionalOptions.gradientClippingThresholdPerSample * actualMBSize;
if (m_additionalOptions.gradientClippingWithTruncation)
gradient.InplaceTruncate(ElementType(maxGradientPerMB));
else
{
// norm2 normalized
double gradientNorm = gradient.FrobeniusNorm();
if (gradientNorm > maxGradientPerMB)
{
double normFactor = maxGradientPerMB / gradientNorm;
gradient *= ElementType(normFactor);
}
}
}
}
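// Worked example: with gradientClippingThresholdPerSample = 0.001 and actualMBSize = 256,
// maxGradientPerMB = 0.256. With truncation, every gradient element is clamped to [-0.256, 0.256];
// without truncation, a gradient with Frobenius norm 1.28 is rescaled by 0.256 / 1.28 = 0.2 so that its
// norm equals the per-minibatch cap.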
// Performs additional preprocessing before calling the update method
// (gradient clipping and L2 regularization depending on the additional learning parameters).
template <typename ElementType>
void LearnerBase::PreProcess(const NDArrayViewPtr& parameterValue, const NDArrayViewPtr& gradientValue, size_t actualMBSize) const
{
const auto& gradientMatrix = gradientValue->GetWritableMatrix<ElementType>();
// clipping gradients to prevent outliers
ClipGradient<ElementType>(*gradientMatrix, actualMBSize);
// L2 regularizer
if (m_additionalOptions.l2RegularizationWeight > 0)
{
// multiply by actualMBSize so that it's invariant to minibatch size since learning rate is per sample
auto weight = ElementType(m_additionalOptions.l2RegularizationWeight * actualMBSize);
const auto& parameterMatrix = parameterValue->GetWritableMatrix<ElementType>();
Matrix<ElementType>::ScaleAndAdd(weight, *parameterMatrix, *gradientMatrix);
}
}
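// The L2 term is folded into the gradient as g += (l2RegularizationWeight * actualMBSize) * w; combined
// with the per-sample learning rate, the weight-decay contribution is therefore applied once per sample,
// independently of the minibatch size.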
// Performs additional postprocessing after the update method has been executed
// (noise injection and L1 regularization specified by the additional learning parameters).
template <typename ElementType>
void LearnerBase::PostProcess(const Parameter& parameter, const NDArrayViewPtr& gradientValue, size_t actualMBSize) const
{
const auto& parameterValue = parameter.Value();
const auto& parameterMatrix = parameterValue->GetWritableMatrix<ElementType>();
if (m_additionalOptions.gaussianNoiseInjectionStdDev > 0)
{
const auto& gradientMatrix = gradientValue->GetWritableMatrix<ElementType>();
Matrix<ElementType> sgdUpdateNoise((DEVICEID_TYPE)parameterMatrix->GetDeviceId());
// get the gradient structure since gradient is sparse
sgdUpdateNoise.SetValue(*gradientMatrix);
auto noiseStdDev = ElementType(m_additionalOptions.gaussianNoiseInjectionStdDev);
// reset its value to random
sgdUpdateNoise.SetGaussianRandomValue(ElementType(0.0), noiseStdDev);
Matrix<ElementType>::ScaleAndAdd(ElementType(1.0), sgdUpdateNoise, *parameterMatrix);
}
// L1 regularizer with proximal gradient descent method
if (m_additionalOptions.l1RegularizationWeight > 0)
{
auto learningRate = ElementType(ParameterDependentLearningRate(parameter));
// multiply by actualMBSize so that it's invariant to minibatch size since learning rate is per sample
auto weight = ElementType(learningRate * m_additionalOptions.l1RegularizationWeight * actualMBSize);
parameterValue->GetWritableMatrix<ElementType>()->InplaceSoftThreshold(weight);
}
}
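// InplaceSoftThreshold is the proximal step for the L1 penalty: each parameter element is shrunk towards
// zero by 'weight' and clipped at zero, i.e. w_i <- sign(w_i) * max(|w_i| - weight, 0), which drives small
// weights exactly to zero and yields sparse parameters.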
template <typename ElementType>
/*static*/ TensorView<ElementType>* LearnerBase::GetWritableTensorView(const NDArrayViewPtr& arrayView)
{
return arrayView->GetWritableTensorView<ElementType>();
}
LearnerBase::LearnerBase(const unordered_set<Parameter>& parameters)
: Learner(parameters),
m_learningRatePerSample(0.0),
m_sampleCount(0)
{
const unordered_set<Parameter>& parameterSet = parameters;
for (const auto& parameter : parameterSet)
{
// TODO: using the same device to allocate data for all smoothed gradients. Is this correct?
// Should the device be specified on a per-parameter basis?
NDArrayViewPtr view;
if (parameter.GetDataType() == DataType::Float)
{
view = MakeSharedObject<NDArrayView>(0.0f, parameter.Shape(), parameter.Value()->Device());
}
else
{
view = MakeSharedObject<NDArrayView>(0.0, parameter.Shape(), parameter.Value()->Device());
}
m_smoothedGradientValues.insert(make_pair(parameter, view));
m_additionalOptions.learningRateMultipliers.insert(make_pair(parameter, 1.0));
}
}
void LearnerBase::ResetSmoothedGradients()
{
for (const auto& parameter : Parameters())
{
const auto& smoothedGradientValue = m_smoothedGradientValues.at(parameter);
const auto& data = smoothedGradientValue;
switch (data->GetDataType())
{
case DataType::Float:
data->SetValue(0.0f);
break;
case DataType::Double:
data->SetValue(0.0);
break;
default:
LogicError("Unsupported DataType %s", ::CNTK::DataTypeName(data->GetDataType()));
}
}
}
/*virtual*/ bool LearnerBase::Update(const unordered_map<Parameter, NDArrayViewPtr>& gradientValues, size_t trainingSampleCount) /*override*/
{
// make sure trainingSampleCount is a valid value
assert(trainingSampleCount > 0);
for (const auto& parameter : Parameters())
{
const auto& smoothedGradientValue = m_smoothedGradientValues.at(parameter);
const auto& gradientValue = gradientValues.at(parameter);
// TODO: make this a runtime parameter.
#if DUMPOUTPUT
LOGPRINTF(stderr, "Update_%ls\n", parameter.Name().c_str());
#endif
#ifdef _DEBUG
if (HasNan(smoothedGradientValue, "TrainOneEpoch/UpdateWeights/Learner::Update(): "))
LogicError("%ls has NaNs in smoothedGradient.", parameter.Name().c_str());
#endif
#if DUMPOUTPUT
LOGPRINTF(stderr, "learnRatePerSample=%0.8f, momentum=%0.8f, actualMBSize=%ld\n",
m_learningRatePerSample, m_momentumPerSample, trainingSampleCount);
LOGPRINTF(stderr, "GradUpdateType()=%s, GradientUpdateNoiseStd()=%0.8f\n",
LearnerType().c_str(), m_GaussianNoiseInjectStd);
Print(gradientValue, "Gradient Update");
Print(smoothedGradientValue, "Smoothed Gradient Input");
#endif
UPDATE_FUNCTION;
#if DUMPOUTPUT
Print(parameterValue, "Parameter Update");
#endif
#ifdef _DEBUG
const auto& parameterValue = parameter.Value();
if (HasNan(parameterValue, "TrainOneEpoch/UpdateWeights/Learner::Update(): "))
LogicError("%ls has NaNs in parameter values after parameter update.", parameter.Name().c_str());
#endif
}
m_sampleCount += trainingSampleCount;
return false;
}
template <typename ElementType>
void LearnerBase::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const
{
const auto& parameterValue = parameter.Value();
PreProcess<ElementType>(parameterValue, gradientValue, trainingSampleCount);
Update(parameter, gradientValue, smoothedGradientValue, trainingSampleCount);
PostProcess<ElementType>(parameter, gradientValue, trainingSampleCount);
}
string LearnerBase::LearnerType() const
{
auto name = typeid(*this).name();
if (strncmp(name, "class ", 6) == 0)
{
// On Windows, the type name contains "class" prefix.
// Return the actual name, omitting the prefix.
return &name[6];
}
return name;
}
/*virtual*/ Dictionary LearnerBase::GetCheckpointState() const /*override*/
{
NOT_IMPLEMENTED; // Until the new checkpointing is fully fleshed out, nobody should be calling this.
Dictionary checkpoint;
for (const auto& parameter : Parameters())
{
// TODO: parameter name is not guaranteed to be unique. Instead, all serializable objects
// need to expose "UId" property -- a persistent unique internal name.
// Switch to UId as soon as it's available.
if (checkpoint.Contains(parameter.Name()))
{
LogicError("Parameter names must be unique");
}
const auto& smoothedGradientValue = m_smoothedGradientValues.at(parameter);
// Potentially, could store things like dimensions, element size, format, etc., but
// that seems to be redundant, since all of that is passed in the constructor.
checkpoint[parameter.Name()] = SerializeToVector(smoothedGradientValue);
}
return checkpoint;
}
/*virtual*/ void LearnerBase::RestoreFromCheckpoint(const Dictionary& checkpoint) /*override*/
{
NOT_IMPLEMENTED; // Until the new checkpointing is fully fleshed out, nobody should be calling this.
for (const auto& parameter : Parameters())
{
if (!checkpoint.Contains(parameter.Name()))
{
LogicError("Checkpoint does not contain state for parameter %ls", parameter.Name().c_str());
}
const auto& smoothedGradientValue = m_smoothedGradientValues.at(parameter);
const DictionaryValue& state = checkpoint[parameter.Name()];
const auto& data = smoothedGradientValue;
DeserializeFromVector(data, state.GetValue<vector<DictionaryValue>>());
}
}
/*virtual*/ void LearnerSGD::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const /*override*/
{
UPDATE_FUNCTION;
}
template <typename ElementType>
void LearnerSGD::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const
{
UNUSED(trainingSampleCount);
const auto& parameterValue = parameter.Value();
const auto& smoothedGradientMatrix = GetWritableMatrix<ElementType>(smoothedGradientValue);
const auto& gradientMatrix = GetWritableMatrix<ElementType>(gradientValue);
const auto& parameterMatrix = GetWritableMatrix<ElementType>(parameterValue);
const auto& learningRate = ElementType(ParameterDependentLearningRate(parameter));
// TODO: break up the NormalGrad into 3 different functions, each with its own set of parameters
// (one for vanilla SGD, the other for momentum SGD, and the third one for NAG).
smoothedGradientMatrix->NormalGrad(*gradientMatrix, *parameterMatrix,
learningRate, ElementType(m_momentumPerSample), m_useNesterovAcceleration);
}
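// Sketch of the update delegated to Matrix::NormalGrad (the exact semantics live in the matrix library):
// classical momentum SGD, roughly v <- momentum * v + learningRate * g; w <- w - v, with the Nesterov
// lookahead correction applied when m_useNesterovAcceleration is set.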
LearnerAdaGrad::LearnerAdaGrad(const unordered_set<Parameter>& parameters, bool needAveMultiplier)
: LearnerBase(parameters), m_needAveMultiplier(needAveMultiplier)
{
}
/*virtual*/ void LearnerAdaGrad::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const /*override*/
{
UPDATE_FUNCTION;
}
template <typename ElementType>
void LearnerAdaGrad::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const
{
UNUSED(trainingSampleCount);
const auto& parameterValue = parameter.Value();
const auto& smoothedGradientMatrix = GetWritableMatrix<ElementType>(smoothedGradientValue);
const auto& gradientMatrix = GetWritableMatrix<ElementType>(gradientValue);
const auto& parameterMatrix = GetWritableMatrix<ElementType>(parameterValue);
auto learningRate = ElementType(ParameterDependentLearningRate(parameter));
auto aveMultiplier = smoothedGradientMatrix->Adagrad(*gradientMatrix, m_needAveMultiplier);
Matrix<ElementType>::ScaleAndAdd(ElementType(-learningRate / aveMultiplier), *gradientMatrix, *parameterMatrix);
}
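// Sketch of the AdaGrad step: Matrix::Adagrad accumulates squared gradients in the smoothed-gradient
// buffer and rescales the gradient (roughly by 1 / sqrt(accumulated squared gradients)), returning an
// averaged multiplier when requested; the ScaleAndAdd above then applies
// w <- w - (learningRate / aveMultiplier) * rescaledGradient.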
LearnerFSAdaGrad::LearnerFSAdaGrad(const unordered_set<Parameter>& parameters)
: LearnerMomentumSGD(parameters)
{
}
/*virtual*/ void LearnerFSAdaGrad::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const /*override*/
{
UPDATE_FUNCTION;
}
template <typename ElementType>
void LearnerFSAdaGrad::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const
{
UNUSED(trainingSampleCount);
const auto& parameterValue = parameter.Value();
const auto& smoothedGradientMatrix = GetWritableMatrix<ElementType>(smoothedGradientValue);
const auto& gradientMatrix = GetWritableMatrix<ElementType>(gradientValue);
const auto& parameterMatrix = GetWritableMatrix<ElementType>(parameterValue);
//const double momentum = MomentumPerMB(m_momentumPerSample, trainingSampleCount);
auto learningRate = ElementType(ParameterDependentLearningRate(parameter));
smoothedGradientMatrix->FSAdagrad(trainingSampleCount, *gradientMatrix, *parameterMatrix,
learningRate, ElementType(m_momentumPerSample));
}
LearnerRMSProp::LearnerRMSProp(const unordered_set<Parameter>& parameters,
double gamma, double inc, double dec, double max, double min, bool needAveMultiplier)
: LearnerBase(parameters),
m_gamma(gamma), m_inc(inc), m_dec(dec), m_max(max), m_min(min),
m_needAveMultiplier(needAveMultiplier)
{
}
/*virtual*/ void LearnerRMSProp::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const /*override*/
{
UPDATE_FUNCTION;
}
template <typename ElementType>
void LearnerRMSProp::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const
{
UNUSED(trainingSampleCount);
const auto& parameterValue = parameter.Value();
const auto& smoothedGradientMatrix = GetWritableMatrix<ElementType>(smoothedGradientValue);
const auto& gradientMatrix = GetWritableMatrix<ElementType>(gradientValue);
const auto& parameterMatrix = GetWritableMatrix<ElementType>(parameterValue);
auto learningRate = ElementType(ParameterDependentLearningRate(parameter));
auto aveMultiplier = smoothedGradientMatrix->RmsProp(*gradientMatrix,
ElementType(m_gamma), ElementType(m_inc),
ElementType(m_max), ElementType(m_dec),
ElementType(m_min), m_needAveMultiplier);
Matrix<ElementType>::ScaleAndAdd(ElementType(-learningRate / aveMultiplier), *gradientMatrix, *parameterMatrix);
}
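// Sketch of the RMSProp step: Matrix::RmsProp maintains a leaky average of squared gradients governed by
// gamma and the inc/dec/max/min adaptation bounds and rescales the gradient by the inverse root of that
// average; the ScaleAndAdd above then applies w <- w - (learningRate / aveMultiplier) * rescaledGradient,
// analogous to the AdaGrad case.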
// Explicit template instantiations
template shared_ptr<Matrix<float>> LearnerBase::GetWritableMatrix<float>(const NDArrayViewPtr& arrayView);
template shared_ptr<Matrix<double>> LearnerBase::GetWritableMatrix<double>(const NDArrayViewPtr& arrayView);
LearnerPtr SGDLearner(const unordered_set<Parameter>& parameters, double learningRatePerSample)
{
return MakeSharedObject<LearnerSGD>(parameters, learningRatePerSample);
}
LearnerPtr MomentumSGDLearner(const unordered_set<Parameter>& parameters)
{
return MakeSharedObject<LearnerMomentumSGD>(parameters);
}
LearnerPtr NesterovLearner(const unordered_set<Parameter>& parameters)
{
return MakeSharedObject<LearnerNesterov>(parameters);
}
LearnerPtr AdaGradLearner(const unordered_set<Parameter>& parameters, bool needAveMultiplier)
{
return MakeSharedObject<LearnerAdaGrad>(parameters, needAveMultiplier);
}
LearnerPtr FSAdaGradLearner(const unordered_set<Parameter>& parameters)
{
return MakeSharedObject<LearnerFSAdaGrad>(parameters);
}
LearnerPtr RMSPropLearner(const unordered_set<Parameter>& parameters,
double gamma, double inc, double dec, double max, double min, bool needAveMultiplier)
{
return MakeSharedObject<LearnerRMSProp>(parameters, gamma, inc, dec, max, min, needAveMultiplier);
}
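// Illustrative usage sketch (assumes 'model' is a FunctionPtr built with this API; the learning rate is
// arbitrary):
//
//   std::unordered_set<Parameter> parameters = model->Parameters();
//   LearnerPtr learner = SGDLearner(parameters, /*learningRatePerSample=*/ 0.005);
//
// The resulting learner is handed to a Trainer, which routes the per-parameter gradients into Update();
// momentum and learning-rate schedules are set through the concrete learner classes declared in Learner.h.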
}


@ -0,0 +1,201 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#include "stdafx.h"
#include "CNTKLibrary.h"
#include <numeric>
namespace CNTK
{
// A collection of additional options that are applicable for all standard learners
// (after these options are set, they retain their value for the entire lifespan of a learner).
struct AdditionalLearningOptions
{
double l1RegularizationWeight = 0.0;
double l2RegularizationWeight = 0.0;
double gaussianNoiseInjectionStdDev = 0.0;
bool gradientClippingWithTruncation = true;
double gradientClippingThresholdPerSample = std::numeric_limits<double>::infinity();
std::unordered_map<Parameter, double> learningRateMultipliers;
};
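// Illustrative usage sketch (values are arbitrary; 'learner' stands for any LearnerBase-derived object):
//
//   AdditionalLearningOptions options;
//   options.l2RegularizationWeight = 0.0005;
//   options.gradientClippingThresholdPerSample = 0.001;
//   learner.SetAdditionalOptions(options);
//
// Note that SetAdditionalOptions below replaces the whole struct, including learningRateMultipliers,
// which the LearnerBase constructor otherwise initializes to 1.0 for every parameter.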
// An abstract base class at the root of the standard learners hierarchy
// It implements most of the learner functionality, except for the actual update function,
// and adds a few pre-/postprocessing methods (which are invoked before and after the update).
class LearnerBase : public Learner
{
public:
virtual bool Update(const std::unordered_map<Parameter, NDArrayViewPtr>& gradientValues, size_t trainingSampleCount) override final;
virtual Dictionary GetCheckpointState() const override final;
virtual void RestoreFromCheckpoint(const Dictionary& checkpoint) override final;
void SetAdditionalOptions(const AdditionalLearningOptions& additionalOptions)
{
m_additionalOptions = additionalOptions;
}
// TODO: should this be called ResetMomentum?
// needed for BlockMomentumSGD to reset SGD momentum after aggregation.
void ResetSmoothedGradients();
// TODO: move learning rate and momentum scheduling and adjustment functionality
// inside the learner and drop these setters.
void SetLearningRate(double value) { m_learningRatePerSample = value; }
protected:
LearnerBase(const std::unordered_set<Parameter>& parameters);
virtual void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const = 0;
double ParameterDependentLearningRate(const Parameter& parameter) const
{
return m_learningRatePerSample * m_additionalOptions.learningRateMultipliers.at(parameter);
}
std::string LearnerType() const;
double m_learningRatePerSample;
AdditionalLearningOptions m_additionalOptions;
std::unordered_map<Parameter, NDArrayViewPtr> m_smoothedGradientValues;
// The following four static protected methods expose private methods of NDArrayView class
// (which declares LearnerBase as friend class), so that they are available to subclasses.
template <typename ElementType>
static std::shared_ptr<const Microsoft::MSR::CNTK::Matrix<ElementType>> GetMatrix(const NDArrayViewPtr& arrayView);
template <typename ElementType>
static std::shared_ptr<Microsoft::MSR::CNTK::Matrix<ElementType>> GetWritableMatrix(const NDArrayViewPtr& arrayView);
template <typename ElementType>
static const Microsoft::MSR::CNTK::TensorView<ElementType>* GetTensorView(const NDArrayViewPtr& arrayView);
template <typename ElementType>
static Microsoft::MSR::CNTK::TensorView<ElementType>* GetWritableTensorView(const NDArrayViewPtr& arrayView);
template <typename ElementType>
void ClipGradient(Microsoft::MSR::CNTK::Matrix<ElementType>& gradient, size_t actualMBSize) const;
// Performs additional preprocessing before calling the update method
// (gradient clipping and L2 regularization depending on the additional learning parameters).
template <typename ElementType>
void PreProcess(const NDArrayViewPtr& parameterValue, const NDArrayViewPtr& gradientValue, size_t actualMBSize) const;
// Performs additional postprocessing after the update method has been executed
// (noise injection and L1 regularization specified by the additional learning parameters).
template <typename ElementType>
void PostProcess(const Parameter& parameter, const NDArrayViewPtr& gradientValue, size_t actualMBSize) const;
private:
// Templatized update function, it invokes preprocess and postprocess using the provided
// template parameter and also invokes virtual Update method implemented in one of the subclasses.
template <typename ElementType>
void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const;
// TODO: make these functions friends of NDArrayView and move to Utils?
static bool HasNan(const NDArrayViewPtr& value, const char* name);
static void Print(const NDArrayViewPtr& value, const char* msg);
size_t m_sampleCount;
};
// Vanilla gradient descent optimization algorithm.
class LearnerSGD : public LearnerBase
{
public:
LearnerSGD(const std::unordered_set<Parameter>& parameters, double learningRatePerSample = 0)
: LearnerBase(parameters), m_momentumPerSample(0.0), m_useNesterovAcceleration(false)
{
SetLearningRate(learningRatePerSample);
}
protected:
virtual void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const override;
template <typename ElementType>
void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const;
double m_momentumPerSample;
bool m_useNesterovAcceleration;
};
// SGD optimization with momentum.
class LearnerMomentumSGD : public LearnerSGD
{
public:
LearnerMomentumSGD(const std::unordered_set<Parameter>& parameters)
: LearnerSGD(parameters)
{}
void SetMomentum(double value) { m_momentumPerSample = value; }
};
// Nesterov's accelerated gradient descent.
class LearnerNesterov : public LearnerSGD
{
public:
LearnerNesterov(const std::unordered_set<Parameter>& parameters)
: LearnerSGD(parameters)
{
m_useNesterovAcceleration = true;
}
};
class LearnerAdaGrad : public LearnerBase
{
public:
LearnerAdaGrad(const std::unordered_set<Parameter>& parameters, bool needAveMultiplier);
protected:
bool m_needAveMultiplier;
virtual void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const override;
template <typename ElementType>
void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const;
};
class LearnerFSAdaGrad : public LearnerMomentumSGD
{
public:
LearnerFSAdaGrad(const std::unordered_set<Parameter>& parameters);
protected:
virtual void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const override;
template <typename ElementType>
void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const;
};
class LearnerRMSProp : public LearnerBase
{
public:
LearnerRMSProp(const std::unordered_set<Parameter>& parameters,
double gamma, double inc, double dec, double max, double min, bool needAveMultiplier);
protected:
double m_gamma;
double m_inc;
double m_dec;
double m_max;
double m_min;
bool m_needAveMultiplier;
virtual void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const override;
template <typename ElementType>
void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const;
};
}


@ -0,0 +1,246 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#include "stdafx.h"
#include "CNTKLibrary.h"
#include "Utils.h"
#include "Config.h"
#include "MinibatchSource.h"
#include "HeapMemoryProvider.h"
#include "ReaderShim.h"
#include "Function.h"
#include <tuple>
#include "ComputationNetworkBuilder.h"
using namespace Microsoft::MSR::CNTK;
namespace CNTK
{
MinibatchSourcePtr CreateCompositeMinibatchSource(const Dictionary& configuration)
{
return MinibatchSourcePtr(new CompositeMinibatchSource(configuration));
}
CompositeMinibatchSource::CompositeMinibatchSource(const Dictionary& configuration)
: m_epochEndReached(false), m_prevMinibatchSize(0), m_epochSize(SIZE_MAX)
{
ConfigParameters config;
std::wstringstream s;
for (const auto& keyValuePair : *(configuration.m_dictionaryData))
AddConfigString(s, keyValuePair.first, keyValuePair.second, 0);
config.Parse(msra::strfun::utf8(s.str()));
const wchar_t* epochSizeConfigurationKey = L"epochSize";
if (configuration.Contains(epochSizeConfigurationKey))
m_epochSize = configuration[epochSizeConfigurationKey].GetValue<size_t>();
if (m_epochSize == 0)
m_epochSize = Microsoft::MSR::CNTK::requestDataSize;
typedef Reader*(*CreateCompositeDataReaderProc)(const ConfigParameters* parameters);
CreateCompositeDataReaderProc createReaderProc = (CreateCompositeDataReaderProc)Plugin().Load(L"CompositeDataReader", "CreateCompositeDataReader");
m_compositeDataReader.reset(createReaderProc(&config));
auto compositeDataReaderStreamDescs = m_compositeDataReader->GetStreamDescriptions();
for (auto streamDesc : compositeDataReaderStreamDescs)
m_streamInfos.insert({ streamDesc->m_name, streamDesc->m_id, AsStorageFormat(streamDesc->m_storageType), AsDataType(streamDesc->m_elementType), AsNDShape(*(streamDesc->m_sampleLayout)) });
}
/*virtual*/ std::unordered_map<StreamInfo, MinibatchData> CompositeMinibatchSource::GetNextMinibatch(const std::unordered_map<StreamInfo, std::pair<size_t, size_t>>& perStreamMBSizeLimits,
const DeviceDescriptor& device /*= DeviceDescriptor::DefaultDevice()*/) /*override*/
{
std::unordered_map<StreamInfo, MinibatchData> minibatchData;
if (!m_epochEndReached)
{
// TODO: Support different minibatch sizes for different streams
size_t requestedMinibatchSizeInSamples = 0;
for (const auto& val : perStreamMBSizeLimits)
{
size_t maxNumSequencesRequested = val.second.first;
size_t maxNumSamplesRequested = val.second.second;
// TODO: Specifying minibatch size in #sequences is currently unsupported
if (maxNumSequencesRequested != 0)
LogicError("Specifying minibatch size in #sequences is currently unsupported");
if (requestedMinibatchSizeInSamples == 0)
requestedMinibatchSizeInSamples = maxNumSamplesRequested;
else
{
if (requestedMinibatchSizeInSamples != maxNumSamplesRequested)
LogicError("Different minibatch sizes across different input streams is currently unsupported!");
}
}
if (requestedMinibatchSizeInSamples == 0)
InvalidArgument("GetNextMinibatch: Requested minibatch sizes must be > 0");
if (m_prevMinibatchSize == 0)
{
// TODO: Add support for distributed reading
EpochConfiguration epochConfig = { 1, 0, requestedMinibatchSizeInSamples, m_epochSize, 0, 0 };
m_compositeDataReader->StartEpoch(epochConfig);
m_prevMinibatchSize = requestedMinibatchSizeInSamples;
}
if (requestedMinibatchSizeInSamples != m_prevMinibatchSize)
LogicError("GetNextMinibatch: Changing minibatch sizes across calls is currently unsupported");
auto compositeReaderMinibatchData = m_compositeDataReader->ReadMinibatch();
m_epochEndReached = compositeReaderMinibatchData.m_endOfEpoch;
auto compositeDataReaderStreamDescs = m_compositeDataReader->GetStreamDescriptions();
size_t numStreams = compositeDataReaderStreamDescs.size();
for (size_t i = 0; i < numStreams; ++i)
{
auto currentStreamDesc = compositeDataReaderStreamDescs[i];
auto iter = std::find_if(perStreamMBSizeLimits.begin(), perStreamMBSizeLimits.end(), [currentStreamDesc](const std::pair<StreamInfo, std::pair<size_t, size_t>>& entry) {
return entry.first.m_id == currentStreamDesc->m_id;
});
if (iter == perStreamMBSizeLimits.end())
continue;
auto& currentStreamInfo = iter->first;
auto sampleShape = AsNDShape(*(currentStreamDesc->m_sampleLayout));
ValuePtr minibatchValuePtr;
if (compositeReaderMinibatchData.m_data.empty())
{
minibatchValuePtr = MakeSharedObject<Value>(MakeSharedObject<NDArrayView>(currentStreamInfo.m_elementType, sampleShape.AppendShape({ 0, 0 }), DeviceDescriptor::CPUDevice()));
continue;
}
auto currentStreamMinibatchData = compositeReaderMinibatchData.m_data[i];
if (currentStreamDesc->m_elementType == ElementType::tfloat)
{
auto dataMatrix = std::make_shared<Matrix<float>>(CPUDEVICE);
size_t sampleSize = currentStreamDesc->m_sampleLayout->GetNumElements();
// TODO: Eliminate the unnecessary CPU to CPU copy
ReaderShim<float>::FillMatrixFromStream(currentStreamDesc->m_storageType, dataMatrix.get(), sampleSize, currentStreamMinibatchData);
minibatchValuePtr = CompositeFunction::GetValueObjectFromCNTKImplMatrixAndMBLayout<float>(sampleShape, *dataMatrix, currentStreamMinibatchData->m_layout, false);
size_t numSamples = currentStreamMinibatchData->m_layout->GetActualNumSamples();
size_t numSequences = currentStreamMinibatchData->m_layout->GetNumSequences();
minibatchData[currentStreamInfo] = { numSequences, numSamples, minibatchValuePtr };
}
else
LogicError("Input data of type other than DataType::Float is currently unsupported by the CNTK built-in composite MinibatchSource!");
}
}
return minibatchData;
}
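// Illustrative usage sketch (the 256-sample limit is arbitrary; per-sequence limits must be 0, see above):
//
//   auto source = CreateCompositeMinibatchSource(readerConfig);
//   std::unordered_map<StreamInfo, std::pair<size_t, size_t>> limits;
//   for (const auto& stream : source->StreamInfos())
//       limits[stream] = { 0 /*#sequences*/, 256 /*#samples*/ };
//   auto minibatch = source->GetNextMinibatch(limits);
//
// Each MinibatchData entry carries the number of sequences, the number of samples and the ValuePtr
// holding the data, in the initialization order used above.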
void ComputeInputPerDimMeansAndInvStdDevs(const MinibatchSourcePtr& minibatchSource,
std::unordered_map<StreamInfo, std::pair<NDArrayViewPtr, NDArrayViewPtr>>& computedMeanAndInvStdDevs,
const DeviceDescriptor& device /*= DeviceDescriptor::CPUDevice()*/)
{
typedef std::shared_ptr<ComputationNode<float>> ComputationNodePtr;
const auto& minibatchSourceStreams = minibatchSource->StreamInfos();
auto computationNetwork = std::make_shared<ComputationNetwork>(AsCNTKImplDeviceId(device));
ComputationNetworkBuilder<float> builder(*computationNetwork);
std::vector<ComputationNodeBasePtr> allInputNodes;
std::unordered_map<StreamInfo, ComputationNodeBasePtr> streamToInputNodeMap;
std::unordered_map<StreamInfo, Variable> streamToDummyInputVariableMap;
std::unordered_map<StreamInfo, ComputationNodeBasePtr> streamToMeanNodeMap;
std::unordered_map<StreamInfo, ComputationNodeBasePtr> streamToInvStdDevNodeMap;
size_t totalSizePerSample = 0;
for (auto& currentStreamKV : computedMeanAndInvStdDevs)
{
auto currentStreamInfo = currentStreamKV.first;
if (minibatchSourceStreams.find(currentStreamInfo) == minibatchSourceStreams.end())
InvalidArgument("ComputeMeanAndVariance: Stream for which mean and variance is to be computed is not supported by the specified minibatchSource");
if (currentStreamInfo.m_elementType != DataType::Float)
LogicError("Input data of type other than DataType::Float is currently unsupported by the CNTK built-in composite MinibatchSource!");
auto inputVariableShape = currentStreamInfo.m_sampleLayout;
auto inputTensorShape = AsTensorShape(inputVariableShape);
totalSizePerSample += (inputVariableShape.TotalSize() * sizeof(float));
ComputationNodePtr inputNode;
Variable inputVariable;
if (currentStreamInfo.m_storageFormat != StorageFormat::Dense)
{
inputNode = builder.CreateSparseInputNode(currentStreamInfo.m_name, inputTensorShape);
inputVariable = Variable(inputVariableShape, true, DataType::Float, currentStreamInfo.m_name);
}
else
{
inputNode = builder.CreateInputNode(currentStreamInfo.m_name, inputTensorShape);
inputVariable = Variable(inputVariableShape, DataType::Float, currentStreamInfo.m_name);
}
allInputNodes.push_back(inputNode);
streamToInputNodeMap[currentStreamInfo] = inputNode;
streamToDummyInputVariableMap[currentStreamInfo] = inputVariable;
streamToMeanNodeMap[currentStreamInfo] = builder.Mean(inputNode);
streamToInvStdDevNodeMap[currentStreamInfo] = builder.InvStdDev(inputNode);
}
computationNetwork->CompileNetwork();
computationNetwork->AllocateAllMatrices(computationNetwork->RootNodes(), {}, nullptr);
ScopedNetworkOperationMode modeGuard(computationNetwork, NetworkOperationMode::preComputing);
// initialize
auto preComputeNodes = computationNetwork->GetNodesRequiringPreComputation();
for (auto & preComputeNode : preComputeNodes)
dynamic_pointer_cast<IPreComputeNode>(preComputeNode)->MarkComputed(false /*begin accumulating*/);
const size_t maxMinibatchDataSize = (1 << 27); // 128 MB
const size_t minibatchSize = maxMinibatchDataSize / totalSizePerSample;
std::unordered_map<StreamInfo, std::pair<size_t, size_t>> minibatchSizeLimits;
for (auto& currentStreamKV : computedMeanAndInvStdDevs)
minibatchSizeLimits.insert(std::make_pair(currentStreamKV.first, std::make_pair((size_t)0, minibatchSize)));
for (;;)
{
auto minibatchData = minibatchSource->GetNextMinibatch(minibatchSizeLimits, device);
if (minibatchData.empty())
break;
for (auto& currentStreamKV : computedMeanAndInvStdDevs)
CompositeFunction::PopulateComputationNodeValue<float>({ streamToDummyInputVariableMap[currentStreamKV.first], minibatchData[currentStreamKV.first].m_data }, streamToInputNodeMap[currentStreamKV.first]);
ComputationNetwork::BumpEvalTimeStamp(allInputNodes);
computationNetwork->ForwardProp(preComputeNodes);
}
// finalize
for (auto & preComputeNode : preComputeNodes)
dynamic_pointer_cast<IPreComputeNode>(preComputeNode)->MarkComputed(true /*done accumulating*/);
// Copy out the results
for (auto& currentStreamKV : computedMeanAndInvStdDevs)
{
ValuePtr mean, invStdDev;
if (computedMeanAndInvStdDevs[currentStreamKV.first].first != nullptr)
mean = MakeSharedObject<Value>(computedMeanAndInvStdDevs[currentStreamKV.first].first);
if (computedMeanAndInvStdDevs[currentStreamKV.first].second != nullptr)
invStdDev = MakeSharedObject<Value>(computedMeanAndInvStdDevs[currentStreamKV.first].second);
CompositeFunction::GetNodeOutputOrGradient(streamToDummyInputVariableMap[currentStreamKV.first], mean, streamToMeanNodeMap[currentStreamKV.first], false /*getGradient*/);
CompositeFunction::GetNodeOutputOrGradient(streamToDummyInputVariableMap[currentStreamKV.first], invStdDev, streamToInvStdDevNodeMap[currentStreamKV.first], false /*getGradient*/);
if (computedMeanAndInvStdDevs[currentStreamKV.first].first == nullptr)
computedMeanAndInvStdDevs[currentStreamKV.first].first = mean->Data();
if (computedMeanAndInvStdDevs[currentStreamKV.first].second == nullptr)
computedMeanAndInvStdDevs[currentStreamKV.first].second = invStdDev->Data();
}
}
}


@ -0,0 +1,32 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#include "stdafx.h"
#include "CNTKLibrary.h"
#include "Utils.h"
#include "Reader.h"
namespace CNTK
{
class CompositeMinibatchSource final : public MinibatchSource
{
public:
CompositeMinibatchSource(const Dictionary& configuration);
virtual const std::unordered_set<StreamInfo>& StreamInfos() override { return m_streamInfos; }
virtual std::unordered_map<StreamInfo, MinibatchData> GetNextMinibatch(const std::unordered_map<StreamInfo, std::pair<size_t, size_t>>& perStreamMBSizeLimits,
const DeviceDescriptor& device = DeviceDescriptor::DefaultDevice()) override;
private:
std::unordered_set<StreamInfo> m_streamInfos;
std::shared_ptr<Microsoft::MSR::CNTK::Reader> m_compositeDataReader;
bool m_epochEndReached;
size_t m_prevMinibatchSize;
size_t m_epochSize;
};
}


@ -316,7 +316,17 @@ namespace CNTK
}
template <typename ElementType>
NDArrayViewPtr NDArrayView::RandomUniform(const NDShape& shape, double rangeBegin, double rangeEnd, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::DefaultDevice()*/)
/*static*/ NDArrayViewPtr NDArrayView::RandomNormal(const NDShape& shape, double mean, double stdDev, unsigned long seed, const DeviceDescriptor& device /*= DeviceDescriptor::DefaultDevice()*/)
{
auto matrixDims = GetMatrixDimensions(shape);
auto randomNormalMatrix = std::make_shared<Matrix<ElementType>>(Matrix<ElementType>::RandomGaussian(matrixDims.first, matrixDims.second, AsCNTKImplDeviceId(device), (ElementType)mean, (ElementType)stdDev, seed));
auto tensorView = new TensorView<ElementType>(randomNormalMatrix, AsTensorShape(shape));
return MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), device, StorageFormat::Dense, shape, false, tensorView);
}
template <typename ElementType>
/*static*/ NDArrayViewPtr NDArrayView::RandomUniform(const NDShape& shape, double rangeBegin, double rangeEnd, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::DefaultDevice()*/)
{
auto matrixDims = GetMatrixDimensions(shape);
auto randomUniformMatrix = std::make_shared<Matrix<ElementType>>(Matrix<ElementType>::RandomUniform(matrixDims.first, matrixDims.second, AsCNTKImplDeviceId(device), (ElementType)rangeBegin, (ElementType)rangeEnd, seed));
@ -329,6 +339,9 @@ namespace CNTK
template CNTK_API NDArrayViewPtr NDArrayView::RandomUniform<float>(const NDShape& shape, double rangeBegin, double rangeEnd, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::DefaultDevice()*/);
template CNTK_API NDArrayViewPtr NDArrayView::RandomUniform<double>(const NDShape& shape, double rangeBegin, double rangeEnd, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::DefaultDevice()*/);
template CNTK_API NDArrayViewPtr NDArrayView::RandomNormal<float>(const NDShape& shape, double mean, double stdDev, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::DefaultDevice()*/);
template CNTK_API NDArrayViewPtr NDArrayView::RandomNormal<double>(const NDShape& shape, double mean, double stdDev, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::DefaultDevice()*/);
template CNTK_API const float* NDArrayView::DataBuffer<float>() const;
template CNTK_API const double* NDArrayView::DataBuffer<double>() const;
@ -338,8 +351,10 @@ namespace CNTK
template std::shared_ptr<const Matrix<float>> NDArrayView::GetMatrix(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/) const;
template std::shared_ptr<const Matrix<double>> NDArrayView::GetMatrix(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/) const;
template std::shared_ptr<Matrix<float>> NDArrayView::GetWritableMatrix(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/);
template std::shared_ptr<Matrix<double>> NDArrayView::GetWritableMatrix(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/);
template std::shared_ptr<Matrix<float>> NDArrayView::GetWritableMatrix<float>(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/);
template std::shared_ptr<Matrix<double>> NDArrayView::GetWritableMatrix<double>(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/);
template TensorView<float>* NDArrayView::GetWritableTensorView<float>();
template TensorView<double>* NDArrayView::GetWritableTensorView<double>();
template CNTK_API NDArrayView::NDArrayView(const NDShape& viewShape, const SparseIndexType* colStarts, const SparseIndexType* rowIndices, const float* nonZeroValues, size_t numNonZeroValues, const DeviceDescriptor& device, bool readOnly/* = false*/);
template CNTK_API NDArrayView::NDArrayView(const NDShape& viewShape, const SparseIndexType* colStarts, const SparseIndexType* rowIndices, const double* nonZeroValues, size_t numNonZeroValues, const DeviceDescriptor& device, bool readOnly/* = false*/);


@ -81,6 +81,24 @@ namespace CNTK
GetMatrix()->SetValue(1);
}
size_t NDMask::MaskedCount() const
{
auto maskMatrix = GetMatrix();
std::unique_ptr<char[]> maskData(maskMatrix->CopyToArray());
return std::count_if(maskData.get(), maskData.get() + maskMatrix->GetNumElements(), [](const char& val) {
return val == 0;
});
}
// TODO: This could actually be strided?
const char* NDMask::DataBuffer() const
{
// First make sure that the underlying matrix is on the right device
auto matrix = GetMatrix();
matrix->TransferToDeviceIfNotThere(AsCNTKImplDeviceId(m_device), true);
return matrix->Data();
}
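// The mask is a Matrix<char> in which a value of 0 marks a masked (invalid) entry (MaskedCount above
// simply counts the zeros); DataBuffer returns the raw char buffer after transferring the matrix to the
// mask's own device.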
Matrix<char>* NDMask::GetMatrix() const
{
return m_matrixView.get();


@ -0,0 +1,78 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#include "stdafx.h"
#include "CNTKLibrary.h"
#include "Utils.h"
namespace CNTK
{
Trainer::Trainer(const FunctionPtr& model, const Variable& trainingLoss, const std::unordered_set<LearnerPtr>& parameterLearners)
: m_model(model), m_trainingLossVar(trainingLoss), m_parameterLearners(parameterLearners)
{
auto modelParameters = model->Parameters();
std::unordered_set<Parameter> learnerParameters;
for (const auto& learner : parameterLearners)
{
const auto& currentLearnerParameters = learner->Parameters();
for (const auto& parameter : currentLearnerParameters)
{
auto insertRetVal = learnerParameters.insert(parameter);
if (!insertRetVal.second)
InvalidArgument("Trainer::Trainer: Parameter named %S is covered by 2 different learners", parameter.Name().c_str());
}
}
if (modelParameters != learnerParameters)
InvalidArgument("Trainer::Trainer: Union of the parameters covered by the specified parameterLearnes should match the specified model's parameters");
}
bool Trainer::TrainMinibatch(const std::unordered_map<Variable, ValuePtr>& arguments, const DeviceDescriptor& computeDevice /*= DeviceDescriptor::DefaultDevice()*/)
{
std::unordered_map<Variable, ValuePtr> outputs = { { m_trainingLossVar, nullptr } };
auto backPropState = m_model->Forward(arguments, outputs, computeDevice, { m_trainingLossVar });
m_prevMinibatchTrainingLossValue = outputs.begin()->second;
ValuePtr rootGradientValue = MakeSharedObject<Value>(MakeSharedObject<NDArrayView>(m_trainingLossVar.GetDataType(), outputs.at(m_trainingLossVar)->Data()->Shape(), computeDevice), outputs.at(m_trainingLossVar)->Mask());
if (m_trainingLossVar.GetDataType() == DataType::Float)
rootGradientValue->Data()->SetValue(1.0f);
else
rootGradientValue->Data()->SetValue(1.0);
auto modelParameters = m_model->Parameters();
std::unordered_map<Variable, ValuePtr> parameterGradients;
for (const auto& parameter : modelParameters)
parameterGradients[parameter] = nullptr;
m_model->Backward(backPropState, { { m_trainingLossVar, rootGradientValue } }, parameterGradients);
bool anyUpdatesPerformed = false;
for (auto learner : m_parameterLearners)
{
std::unordered_map<Parameter, NDArrayViewPtr> learnerParameterGradients;
const auto& learnerParameters = learner->Parameters();
for (const auto& parameter : learnerParameters)
{
learnerParameterGradients[parameter] = parameterGradients[parameter]->Data();
if (parameterGradients[parameter]->Mask())
LogicError("The gradient value for a Parameter cannot have an associated mask!");
}
auto trainingLossArguments = m_trainingLossVar.Owner()->Arguments();
auto labelsVar = *(std::find_if(trainingLossArguments.begin(), trainingLossArguments.end(), [](const Variable& var) {
return var.IsInput();
}));
auto argumentValue = arguments.at(labelsVar);
auto argumentData = argumentValue->Data();
auto argumentDataShape = argumentData->Shape();
auto mask = argumentValue->Mask();
size_t numSamples = argumentDataShape[argumentDataShape.NumAxes() - 1] - ((mask != nullptr) ? mask->MaskedCount() : 0);
anyUpdatesPerformed |= learner->Update(learnerParameterGradients, numSamples);
}
return anyUpdatesPerformed;
}
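// Illustrative training-loop sketch (the model/loss construction and the variable/stream names are
// assumed to exist elsewhere):
//
//   Trainer trainer(classifierOutput, trainingLoss, { SGDLearner(classifierOutput->Parameters(), 0.005) });
//   for (size_t i = 0; i < numMinibatches; ++i)
//   {
//       auto minibatch = minibatchSource->GetNextMinibatch(limits);
//       trainer.TrainMinibatch({ { featuresVar, minibatch[featureStreamInfo].m_data },
//                                { labelsVar, minibatch[labelStreamInfo].m_data } });
//   }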
}


@ -6,31 +6,162 @@
#include "stdafx.h"
#include "CNTKLibrary.h"
#include "Utils.h"
#include "File.h"
using namespace std;
namespace CNTK
{
template <typename T>
void DictionaryValue::AllocateDataPtr(const T& value)
{
static_assert(is_same<T, NDShape>::value ||
is_same<T, wstring>::value ||
is_same<T, vector<DictionaryValue>>::value ||
is_same<T, Dictionary>::value, "AllocateDataPtr called with invalid type");
m_data.m_ptr = new T(value);
}
template <typename T>
void DictionaryValue::FreePtrAsType()
{
T* typedPtr = reinterpret_cast<T*>(m_data.m_ptr);
delete typedPtr;
m_data.m_ptr = nullptr;
}
Microsoft::MSR::CNTK::File& operator>>(Microsoft::MSR::CNTK::File& stream, DictionaryValue& us)
{
size_t version;
stream >> version;
stream >> us.m_valueType;
switch (us.ValueType())
{
case DictionaryValue::Type::Bool:
stream >> us.m_data.m_boolean;
break;
case DictionaryValue::Type::SizeT:
stream >> us.m_data.m_sizeT;
break;
case DictionaryValue::Type::Float:
stream >> us.m_data.m_float;
break;
case DictionaryValue::Type::Double:
stream >> us.m_data.m_double;
break;
case DictionaryValue::Type::NDShape:
{
size_t size;
stream >> size;
vector<size_t> dims(size);
for (auto i = 0; i < size; i++)
{
stream >> dims[i];
}
us.AllocateDataPtr(NDShape(dims));
break;
}
case DictionaryValue::Type::Vector:
{
size_t size;
stream >> size;
vector<DictionaryValue> values(size);
for (auto i = 0; i < size; i++)
{
stream >> values[i];
}
us.AllocateDataPtr(values);
break;
}
default:
NOT_IMPLEMENTED;
}
return stream;
}
Microsoft::MSR::CNTK::File& operator<<(Microsoft::MSR::CNTK::File& stream, const DictionaryValue& us)
{
stream << us.version;
stream << us.ValueType();
switch (us.ValueType())
{
case DictionaryValue::Type::Bool:
stream << us.m_data.m_boolean;
break;
case DictionaryValue::Type::SizeT:
stream << us.m_data.m_sizeT;
break;
case DictionaryValue::Type::Float:
stream << us.m_data.m_float;
break;
case DictionaryValue::Type::Double:
stream << us.m_data.m_double;
break;
case DictionaryValue::Type::NDShape:
{
NDShape* shapePtr = reinterpret_cast<NDShape*>(us.m_data.m_ptr);
auto size = shapePtr->NumAxes();
stream << size;
for (auto i = 0; i < size; i++)
{
stream << shapePtr->operator[](i);
}
break;
}
case DictionaryValue::Type::Vector:
{
vector<DictionaryValue>* vectorPtr =
reinterpret_cast<vector<DictionaryValue>*>(us.m_data.m_ptr);
auto size = vectorPtr->size();
stream << size;
for (auto i = 0; i < size; i++)
{
stream << vectorPtr->operator[](i);
}
break;
}
default:
NOT_IMPLEMENTED;
}
return stream;
}
Dictionary::Dictionary()
: m_dictionaryData(new std::unordered_map < std::wstring, DictionaryValue>)
: m_dictionaryData(new unordered_map <wstring, DictionaryValue>)
{
}
Dictionary::~Dictionary()
{
delete m_dictionaryData;
}
Dictionary::Dictionary(const Dictionary& other)
{
*this = other;
}
Dictionary& Dictionary::operator=(const Dictionary& other)
{
assert(this != &other);
m_dictionaryData.reset(new std::unordered_map<std::wstring, DictionaryValue>(*(other.m_dictionaryData)));
return *this;
}
Dictionary::Dictionary(Dictionary&& other)
: m_dictionaryData(nullptr)
{
*this = std::move(other);
*this = move(other);
}
Dictionary& Dictionary::operator=(Dictionary&& other)
{
assert(this != &other);
delete m_dictionaryData;
m_dictionaryData = other.m_dictionaryData;
other.m_dictionaryData = nullptr;
@ -51,4 +182,137 @@ namespace CNTK
{
return (m_dictionaryData->find(key) != m_dictionaryData->end());
}
Microsoft::MSR::CNTK::File& operator<<(Microsoft::MSR::CNTK::File& stream, const Dictionary& us)
{
stream << us.version;
stream << us.m_dictionaryData->size();
for (auto it = us.m_dictionaryData->begin(); it != us.m_dictionaryData->end(); ++it)
{
stream << it->first;
stream << it->second;
}
return stream;
}
Microsoft::MSR::CNTK::File& operator>>(Microsoft::MSR::CNTK::File& stream, Dictionary& us)
{
size_t version;
stream >> version;
size_t size;
stream >> size;
us.m_dictionaryData->reserve(size);
for (auto i = 0; i < size; i++)
{
wstring key;
stream >> key;
DictionaryValue value;
stream >> value;
us.m_dictionaryData->insert(make_pair(key, value));
}
return stream;
}
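A minimal round-trip sketch for the Dictionary stream operators above (the file name and options are illustrative, and only value types covered by the switch statements, such as SizeT and Double, survive serialization):
// Not part of the commit: persist a dictionary and read it back.
Dictionary dict;
dict[L"epochSize"] = (size_t)4096;
dict[L"dropoutRate"] = 0.5;
{
    Microsoft::MSR::CNTK::File stream(L"checkpoint.bin", Microsoft::MSR::CNTK::FileOptions::fileOptionsBinary | Microsoft::MSR::CNTK::FileOptions::fileOptionsWrite);
    stream << dict;     // writes version, size, then each key/value pair
}
Dictionary restored;
{
    Microsoft::MSR::CNTK::File stream(L"checkpoint.bin", Microsoft::MSR::CNTK::FileOptions::fileOptionsBinary | Microsoft::MSR::CNTK::FileOptions::fileOptionsRead);
    stream >> restored; // restored now contains epochSize and dropoutRate
}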
template <typename T>
vector<DictionaryValue> SerializeToVector(const NDArrayViewPtr& viewPtr)
{
if (viewPtr->IsSparse())
{
LogicError("Sparse NDArrayView cannot be serialized into a vector.");
}
auto numElements = viewPtr->Shape().TotalSize();
vector<DictionaryValue> values(numElements);
NDArrayViewPtr cpuDataViewPtr = viewPtr;
if ((viewPtr->Device().Type() != DeviceKind::CPU))
{
cpuDataViewPtr = MakeSharedObject<NDArrayView>(viewPtr->GetDataType(), viewPtr->Shape(), DeviceDescriptor::CPUDevice());
cpuDataViewPtr->CopyFrom(*viewPtr);
}
const T* buffer = cpuDataViewPtr->DataBuffer<T>();
for (auto i = 0; i < numElements; ++i)
{
T v = buffer[i];
values[i] = DictionaryValue(v);
}
return values;
}
template <typename T>
void DeserializeFromVector(const NDArrayViewPtr& viewPtr, const vector<DictionaryValue>& values)
{
if (viewPtr->IsSparse())
{
LogicError("Sparse NDArrayView cannot be deserialized from a vector.");
}
auto numElements = viewPtr->Shape().TotalSize();
if (values.size() != numElements)
{
LogicError("Number of elements (%lu) in the deserialized representation does not match the expected value (%lu)",
values.size(), numElements);
}
NDArrayViewPtr cpuDataViewPtr = viewPtr;
if ((viewPtr->Device().Type() != DeviceKind::CPU))
{
cpuDataViewPtr = MakeSharedObject<NDArrayView>(viewPtr->GetDataType(), viewPtr->Shape(), DeviceDescriptor::CPUDevice());
}
T* buffer = cpuDataViewPtr->WritableDataBuffer<T>();
for (auto i = 0; i < numElements; ++i)
{
buffer[i] = values[i].GetValue<T>();
}
if ((viewPtr->Device().Type() != DeviceKind::CPU))
{
viewPtr->CopyFrom(*cpuDataViewPtr);
}
}
// TODO: we store the type info for every element in the vector, which is extremely redundant.
// Instead, it'd be nice to introduce some sort of DictionaryValueVector.
vector<DictionaryValue> SerializeToVector(const NDArrayViewPtr& viewPtr)
{
switch (viewPtr->GetDataType())
{
case DataType::Float:
return SerializeToVector<float>(viewPtr);
case DataType::Double:
return SerializeToVector<double>(viewPtr);
default:
LogicError("Unsupported DataType %s", DataTypeName(viewPtr->GetDataType()));
}
}
void DeserializeFromVector(const NDArrayViewPtr& viewPtr, const vector<DictionaryValue>& values)
{
switch (viewPtr->GetDataType())
{
case DataType::Float:
DeserializeFromVector<float>(viewPtr, values);
break;
case DataType::Double:
DeserializeFromVector<double>(viewPtr, values);
break;
default:
LogicError("Unsupported DataType %s", DataTypeName(viewPtr->GetDataType()));
}
}
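A minimal sketch of how the two dispatchers above might be used together (shape and fill value invented for the example):
// Not part of the commit: round-trip a small dense CPU NDArrayView through DictionaryValues.
auto view = MakeSharedObject<NDArrayView>(DataType::Float, NDShape(std::vector<size_t>{ 2, 3 }), DeviceDescriptor::CPUDevice());
view->SetValue(1.0f);                                         // fill the 2 x 3 view with ones
std::vector<DictionaryValue> flat = SerializeToVector(view);  // six Float-typed entries
DeserializeFromVector(view, flat);                            // writes the same six values back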
template void DictionaryValue::AllocateDataPtr<NDShape>(const NDShape& value);
template void DictionaryValue::AllocateDataPtr<vector<DictionaryValue>>(const vector<DictionaryValue>& value);
template void DictionaryValue::AllocateDataPtr<wstring>(const wstring& value);
template void DictionaryValue::AllocateDataPtr<Dictionary>(const Dictionary& value);
template void DictionaryValue::FreePtrAsType<NDShape>();
template void DictionaryValue::FreePtrAsType<vector<DictionaryValue>>();
template void DictionaryValue::FreePtrAsType<wstring>();
template void DictionaryValue::FreePtrAsType<Dictionary>();
}

Просмотреть файл

@ -9,251 +9,15 @@
#include "CommonMatrix.h"
#include "TensorShape.h"
#include <string>
#include "Config.h"
#include "Reader.h"
#include "ConvolutionEngine.h"
namespace CNTK
{
// Forward declarations
class Dictionary;
class DictionaryValue
{
public:
enum class Type : unsigned int
{
None,
Bool,
SizeT,
Double,
NDShape,
Vector
};
static const char* TypeName(Type type)
{
if (type == Type::None)
return "None";
else if (type == Type::Bool)
return "Bool";
else if (type == Type::SizeT)
return "SizeT";
else if (type == Type::Double)
return "Double";
else if (type == Type::NDShape)
return "NDShape";
else if (type == Type::Vector)
return "Vector";
else
LogicError("Unknown DictionaryValue::Type");
}
public:
DictionaryValue()
: m_valueType(Type::None)
{
}
DictionaryValue(bool value)
: m_valueType(GetValueType<bool>())
{
m_data.m_boolean = value;
}
DictionaryValue(size_t value)
: m_valueType(GetValueType<size_t>())
{
m_data.m_sizeT = value;
}
DictionaryValue(double value)
: m_valueType(GetValueType<double>())
{
m_data.m_double = value;
}
template <typename T>
DictionaryValue(const T& value)
: m_valueType(GetValueType<T>())
{
static_assert(std::is_same<T, NDShape>::value ||
std::is_same<T, std::vector<DictionaryValue>>::value,
"Unsupported ValueType");
AllocateDataPtr(value);
}
DictionaryValue(const DictionaryValue& other)
: m_valueType(Type::Bool)
{
// The m_valueType must have been set to a non-ptr type to prevent an attempt to interpret
// the underlying uninitialized value as a ptr and free it.
*this = other;
}
DictionaryValue& operator=(const DictionaryValue& other)
{
if (this != &other)
{
FreeDataPtr();
m_valueType = other.m_valueType;
m_data = other.m_data;
if (other.m_valueType == Type::NDShape)
AllocateDataPtr(other.GetValue<NDShape>());
else if (other.m_valueType == Type::Vector)
AllocateDataPtr(other.GetValue<std::vector<DictionaryValue>>());
}
return *this;
}
~DictionaryValue()
{
FreeDataPtr();
}
template <typename T, typename std::enable_if<std::is_same<T, bool>::value>::type* = nullptr>
const T& GetValue() const
{
VerifyType<T>();
return m_data.m_boolean;
}
template <typename T, typename std::enable_if<std::is_same<T, size_t>::value>::type* = nullptr>
const T& GetValue() const
{
VerifyType<T>();
return m_data.m_sizeT;
}
template <typename T, typename std::enable_if<std::is_same<T, double>::value>::type* = nullptr>
const T& GetValue() const
{
VerifyType<T>();
return m_data.m_double;
}
template <typename T, typename std::enable_if<std::is_same<T, NDShape>::value || std::is_same<T, std::vector<DictionaryValue>>::value>::type* = nullptr>
const T& GetValue() const
{
VerifyType<T>();
return *(reinterpret_cast<T*>(m_data.m_ptr));
}
bool HasValue() const
{
return m_valueType != Type::None;
}
Type ValueType() const
{
return m_valueType;
}
private:
template <typename T>
static Type GetValueType()
{
static_assert(std::is_same<T, bool>::value ||
std::is_same<T, size_t>::value ||
std::is_same<T, double>::value ||
std::is_same<T, NDShape>::value ||
std::is_same<T, std::vector<DictionaryValue>>::value ||
std::is_same<T, CNTK::Dictionary>::value,
"Unsupported ValueType");
if (std::is_same<T, bool>::value)
return Type::Bool;
else if (std::is_same<T, size_t>::value)
return Type::SizeT;
else if (std::is_same<T, double>::value)
return Type::Double;
else if (std::is_same<T, NDShape>::value)
return Type::NDShape;
else if (std::is_same<T, std::vector<DictionaryValue>>::value)
return Type::Vector;
}
template <typename T>
void VerifyType() const
{
if (GetValueType<T>() != m_valueType)
RuntimeError("Reading a DictionaryValue as the wrong type; Reading as type %s when actual type is %s", typeid(T).name(), DictionaryValue::TypeName(m_valueType));
}
template <typename T>
void AllocateDataPtr(const T& value)
{
static_assert(std::is_same<T, NDShape>::value || std::is_same<T, std::vector<DictionaryValue>>::value, "AllocateDataPtr called with invalid type");
m_data.m_ptr = new T(value);
}
template <typename T>
void FreePtrAsType()
{
T* typedPtr = reinterpret_cast<T*>(m_data.m_ptr);
delete typedPtr;
m_data.m_ptr = nullptr;
}
void FreeDataPtr()
{
if (m_valueType == Type::NDShape)
FreePtrAsType<NDShape>();
else if (m_valueType == Type::Vector)
FreePtrAsType<std::vector<DictionaryValue>>();
}
private:
Type m_valueType;
union ValueData
{
bool m_boolean;
size_t m_sizeT;
double m_double;
void* m_ptr;
} m_data;
};
class Dictionary
{
public:
Dictionary();
~Dictionary();
// Disallow copy construction and assignment
Dictionary(const Dictionary&) = delete; Dictionary& operator=(const Dictionary&) = delete;
Dictionary(Dictionary&& other);
Dictionary& operator=(Dictionary&& other);
DictionaryValue& operator[](const std::wstring& key)
{
return operator[](key.c_str());
}
DictionaryValue& operator[](const wchar_t* key);
DictionaryValue operator[](const std::wstring& key) const
{
return operator[](key.c_str());
}
DictionaryValue operator[](const wchar_t* key) const;
bool Contains(const std::wstring& key) const
{
return Contains(key.c_str());
}
bool Contains(const wchar_t* key) const;
private:
std::unordered_map<std::wstring, DictionaryValue>* m_dictionaryData;
};
// Helper to get the size of an element of the specified DataType
inline size_t ElementSize(DataType dataType)
{
@ -317,14 +81,53 @@ namespace CNTK
LogicError("Unknown DataType");
}
inline Microsoft::MSR::CNTK::TensorShape AsTensorShape(const NDShape& viewShape)
inline NDShape AsNDShape(const Microsoft::MSR::CNTK::TensorShape& tensorShape)
{
// The TensorShape should be flattenable to 1D
for (size_t i = 1; i < tensorShape.GetRank(); ++i)
{
if (!tensorShape.CanFlatten(i))
InvalidArgument("AsNDShape() can only be called for TensorShapes that can be flattened to 1D");
}
return std::vector<size_t>(tensorShape.GetDims().begin(), tensorShape.GetDims().end());
}
inline DataType AsDataType(Microsoft::MSR::CNTK::ElementType readerDataType)
{
switch (readerDataType)
{
case Microsoft::MSR::CNTK::ElementType::tfloat:
return DataType::Float;
case Microsoft::MSR::CNTK::ElementType::tdouble:
return DataType::Double;
default:
LogicError("Unsupported ElementType from CNTK Reader");
}
}
inline StorageFormat AsStorageFormat(Microsoft::MSR::CNTK::StorageType readerStorageType)
{
switch (readerStorageType)
{
case Microsoft::MSR::CNTK::StorageType::dense:
return StorageFormat::Dense;
case Microsoft::MSR::CNTK::StorageType::sparse_csc:
return StorageFormat::SparseCSC;
default:
LogicError("Unsupported StorageType from CNTK Reader");
}
}
inline Microsoft::MSR::CNTK::TensorShape AsTensorShape(const NDShape& viewShape, bool preserveRank = false)
{
const size_t maxNumAxesSupportedByTensorView = 12;
if (viewShape.NumAxes() > maxNumAxesSupportedByTensorView)
LogicError("The number of requested axes exceeds the currently supported limit");
// TensorShape is required to be at least 2D
Microsoft::MSR::CNTK::SmallVector<size_t> tensorViewShape(std::max<size_t>(2, viewShape.NumAxes()));
size_t minRankSize = preserveRank ? viewShape.NumAxes() : 2;
Microsoft::MSR::CNTK::SmallVector<size_t> tensorViewShape(std::max<size_t>(minRankSize, viewShape.NumAxes()));
for (size_t i = 0; i < tensorViewShape.size(); ++i)
tensorViewShape[i] = (i < viewShape.NumAxes()) ? viewShape[i] : 1;
@ -363,4 +166,151 @@ namespace CNTK
{
return var.IsInput() && var.IsSparse();
}
std::vector<DictionaryValue> SerializeToVector(const NDArrayViewPtr& viewPtr);
void DeserializeFromVector(const NDArrayViewPtr& viewPtr, const std::vector<DictionaryValue>& values);
inline void AddIndentation(std::wstringstream& s, size_t numIndentationSpaces)
{
for (size_t i = 0; i < numIndentationSpaces; ++i)
s << L" ";
}
static const size_t perLevelIndentSize = 4;
inline void AddConfigString(std::wstringstream& s, const std::wstring& key, const DictionaryValue& value, size_t numIndentationSpaces);
inline void AddConfigString(std::wstringstream& s, const DictionaryValue& value, size_t numIndentationSpaces)
{
switch (value.ValueType())
{
case DictionaryValue::Type::Bool:
s << value.GetValue<bool>();
break;
case DictionaryValue::Type::Float:
s << value.GetValue<float>();
break;
case DictionaryValue::Type::Double:
s << value.GetValue<double>();
break;
case DictionaryValue::Type::String:
s << value.GetValue<std::wstring>();
break;
case DictionaryValue::Type::SizeT:
s << value.GetValue<size_t>();
break;
case DictionaryValue::Type::Vector:
{
const auto& valueVector = value.GetValue<std::vector<DictionaryValue>>();
s << L"(" << std::endl;
AddIndentation(s, numIndentationSpaces + perLevelIndentSize);
bool isFirst = true;
for (const auto& val : valueVector)
{
if (!isFirst)
s << L":";
else
isFirst = false;
AddConfigString(s, val, numIndentationSpaces + perLevelIndentSize);
}
AddIndentation(s, numIndentationSpaces);
s << L")";
break;
}
case DictionaryValue::Type::Dictionary:
{
const auto& valueDictionary = value.GetValue<Dictionary>();
s << L"[" << std::endl;
for (const auto& keyValuePair : *(valueDictionary.m_dictionaryData))
{
AddConfigString(s, keyValuePair.first, keyValuePair.second, numIndentationSpaces + perLevelIndentSize);
}
AddIndentation(s, numIndentationSpaces);
s << L"]";
break;
}
default:
LogicError("Unsupported DictionaryValue type");
}
}
inline void AddConfigString(std::wstringstream& s, const std::wstring& key, const DictionaryValue& value, size_t numIndentationSpaces)
{
static const size_t perLevelIndentSize = 4;
AddIndentation(s, numIndentationSpaces);
s << key << L" = ";
AddConfigString(s, value, numIndentationSpaces);
s << std::endl;
}
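As a hypothetical illustration of the text these helpers emit (not part of the commit), a dictionary holding the entries rank = 2 and dims = (3:4) would be rendered roughly as:
rank = 2
dims = (
    3:4)
with nested dictionaries wrapped in [ ... ] and each nesting level indented by four more spaces.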
template <typename T>
inline std::vector<DictionaryValue> AsDictionaryValueVector(const std::vector<T>& basicElementTypeVector)
{
static_assert(std::is_same<T, bool>::value ||
std::is_same<T, size_t>::value ||
std::is_same<T, float>::value ||
std::is_same<T, double>::value, "Unsupported ValueType");
std::vector<DictionaryValue> dictionaryValueVector;
for (auto value : basicElementTypeVector)
dictionaryValueVector.push_back(value);
return dictionaryValueVector;
}
template <typename T>
inline std::vector<T> AsBasicElementTypeVector(const std::vector<DictionaryValue>& dictionaryValueVector)
{
static_assert(std::is_same<T, bool>::value ||
std::is_same<T, size_t>::value ||
std::is_same<T, float>::value ||
std::is_same<T, double>::value, "Unsupported ValueType");
std::vector<T> basicElementTypeVector;
for (auto value : dictionaryValueVector)
basicElementTypeVector.push_back(value.GetValue<T>());
return basicElementTypeVector;
}
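A minimal round-trip sketch for the two conversion helpers above (the values are illustrative):
std::vector<double> momentums = { 0.9, 0.95, 0.99 };
std::vector<DictionaryValue> packed = AsDictionaryValueVector(momentums);  // each element becomes a Double-typed DictionaryValue
std::vector<double> unpacked = AsBasicElementTypeVector<double>(packed);   // equal to momentums again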
inline PoolingType AsPoolingType(Microsoft::MSR::CNTK::PoolKind cntkPoolingKind)
{
switch (cntkPoolingKind)
{
case Microsoft::MSR::CNTK::PoolKind::Average:
return PoolingType::Average;
case Microsoft::MSR::CNTK::PoolKind::Max:
return PoolingType::Max;
default:
LogicError("Unknown pooling type");
}
}
inline Microsoft::MSR::CNTK::PoolKind AsCNTKPoolKind(PoolingType poolingType)
{
switch (poolingType)
{
case PoolingType::Average:
return Microsoft::MSR::CNTK::PoolKind::Average;
case PoolingType::Max:
return Microsoft::MSR::CNTK::PoolKind::Max;
default:
LogicError("Unknown pooling type");
}
}
inline std::pair<NDShape, NDShape> GetConvolutionOutputMapCountAndKernelShape(const NDShape& convolutionMapShape, const NDShape& operandShape)
{
auto outputMapCount = convolutionMapShape.SubShape(0, convolutionMapShape.NumAxes() - operandShape.NumAxes());
NDShape paddedOutputMapCount(operandShape.NumAxes(), 1);
for (size_t i = 0; i < outputMapCount.NumAxes(); ++i)
paddedOutputMapCount[paddedOutputMapCount.NumAxes() - 1 - i] = outputMapCount[outputMapCount.NumAxes() - 1 - i];
//for (size_t i = 0; i < outputMapCount.NumAxes(); ++i)
// paddedOutputMapCount[i] = outputMapCount[i];
NDShape kernelShape = convolutionMapShape.SubShape(outputMapCount.NumAxes());
return{ paddedOutputMapCount, kernelShape };
}
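A hypothetical worked example for the helper above (shapes invented; the layout assumed here follows the function itself, with the leading axes giving the output map count and the trailing axes the kernel):
// convolutionMapShape = [64 x 3 x 3 x 3] (rank 4), operandShape = [224 x 224 x 3] (rank 3)
// outputMapCount       = SubShape(0, 4 - 3) -> [64]
// kernelShape          = SubShape(1)        -> [3 x 3 x 3]
// paddedOutputMapCount = [1 x 1 x 64]       (the map count lands on the last of the operand's axes)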
}

Просмотреть файл

@ -84,9 +84,15 @@ __declspec_noreturn static inline void ThrowFormatted(const char* format, ...)
// RuntimeError - throw a std::runtime_error with a formatted error string
#ifndef _MSC_VER // gcc __attribute__((format(printf())) does not percolate through variadic templates; so must go the macro route
#ifndef RuntimeError
#define RuntimeError ThrowFormatted<std::runtime_error>
#endif
#ifndef LogicError
#define LogicError ThrowFormatted<std::logic_error>
#endif
#ifndef InvalidArgument
#define InvalidArgument ThrowFormatted<std::invalid_argument>
#endif
#else
template <class... _Types>
__declspec_noreturn static inline void RuntimeError(const char* format, _Types&&... _Args)
@ -127,13 +133,11 @@ static inline void Warning(const string& message)
\
{ \
fprintf(stderr, "Inside File: %s Line: %d Function: %s -> Feature Not Implemented.\n", __FILE__, __LINE__, __FUNCTION__); \
LogicError("Inside File: %s Line: %d Function: %s -> Feature Not Implemented.\n", __FILE__, __LINE__, __FUNCTION__); \
LogicError("Inside File: %s Line: %d Function: %s -> Feature Not Implemented.", __FILE__, __LINE__, __FUNCTION__); \
\
}
#endif
}
}
}
}}}
#ifndef _MSC_VER
using Microsoft::MSR::CNTK::ThrowFormatted;
@ -579,6 +583,60 @@ struct nocase_compare
// random collection of stuff we needed at some place
// ----------------------------------------------------------------------------
// Array class
template <class T>
class ArrayRef
{
T* elements; // Array of type T
size_t count;
public:
ArrayRef(T* elementsIn, size_t sizeIn)
{
elements = elementsIn;
count = sizeIn;
}
// TODO: Copy Constructor
ArrayRef(const ArrayRef& other) = delete;
// TODO: Move Constructor
ArrayRef(ArrayRef&& other) = delete;
// TODO: Assignment operator
ArrayRef& operator=(const ArrayRef& rhs) = delete;
// TODO: Move assignment operator
ArrayRef& operator=(ArrayRef&& rhs) = delete;
size_t size() const { return count; }
T* data() const { return elements; }
T operator[](size_t i) const
{
if (i >= size())
LogicError("ArrayRef: index overflow");
return elements[i];
}
T& operator[](size_t i)
{
if (i >= count)
LogicError("ArrayRef: index overflow");
return elements[i];
}
const T* begin() const
{
return data();
}
const T* end() const
{
return data() + size();
}
};
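A minimal usage sketch for ArrayRef (the buffer is illustrative; ArrayRef is a non-owning view, so the vector must outlive it):
std::vector<float> activations(128, 0.0f);
ArrayRef<float> view(activations.data(), activations.size());
float total = 0.0f;
for (float v : view)   // begin()/end() above make range-for work
    total += v;
view[0] = 1.0f;        // bounds-checked element access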
// TODO: maybe change to type id of an actual thing we pass in
// TODO: is this header appropriate?
template <class C>

Просмотреть файл

@ -988,11 +988,10 @@ public:
return defaultValue;
}
ConfigValue Find(const std::string& name,
const char* defaultvalue = NULL) const
// Look up a variable through the nested hierarchy. If not found, return false, and 'result' is untouched.
bool TryFind(const std::string& name, ConfigValue& result, const char* defaultvalue = NULL) const
{
auto iter = find(name);
ConfigValue result;
// if we aren't found, or they want the default value
// TODO: What the hell is this?
@ -1002,13 +1001,15 @@ public:
if (iter == end() && m_parent != NULL)
{
result = m_parent->Find(name, defaultvalue);
return true;
}
else if (defaultvalue != NULL)
{
// no parent, so use default value
std::string fullName = m_configName + ":" + name;
result = ConfigValue(defaultvalue, fullName, this);
}
return true;
}
}
else
{
@ -1016,10 +1017,19 @@ public:
rhs = this->ResolveVariables(rhs);
std::string fullName = m_configName + ":" + name;
result = ConfigValue(rhs, fullName, this);
}
return result;
return true;
}
return false; // not found
}
// Look up a variable using TryFind() above. If not found, return empty string.
ConfigValue Find(const std::string& name, const char* defaultvalue = NULL) const
{
ConfigValue result;
TryFind(name, result, defaultvalue); // (if returns false, we return an empty ConfigValue)
return result;
}
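A small sketch of how the two lookups above differ (the config instance and key are illustrative):
ConfigValue lr;
bool found = config.TryFind("learningRate", lr);  // explicit hit/miss signal, nothing thrown
if (!found)
    lr = config.Find("learningRate", "0.01");     // legacy path: empty result or the supplied default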
// ResolveVariablesInSingleLine - In this method we replace all substrings of 'configLine' of the form "$varName$"
// (where varName is a variable name), with the value of the "varName" variable in config.
// We search up the config tree for the value, and we throw an error if we don't find it.
@ -1037,10 +1047,7 @@ public:
{
// ensure that this method was called on a single line (eg, no newline characters exist in 'configLine').
if (configLine.find_first_of("\n") != std::string::npos)
{
LogicError(
"\"ResolveVariablesInSingleLine\" shouldn't be called with a string containing a newline character");
}
LogicError("ResolveVariablesInSingleLine() should not be called with a string containing a newline character");
std::string newConfigLine = StripComments(configLine);
std::size_t start = newConfigLine.find_first_of(openBraceVar);
@ -1073,27 +1080,25 @@ public:
// in nested dictionaries, this is not working.
if (varName.empty())
{
RuntimeError("$$ is not allowed. Parsing of string failed: %s:%s",
RuntimeError("$$ is not allowed. Parsing of string failed: %s:%s",
m_configName.c_str(),
newConfigLine.c_str());
}
// Note that this call to "Find" can trigger further substitutions of the form $varName2$ -> varValue2,
// thus making this search process recursive.
std::string varValue = this->Find(varName);
if (varValue.empty())
ConfigValue varConfigValue;
const bool foundValue = this->TryFind(varName, varConfigValue);
if (!foundValue)
{
RuntimeError("No variable found with the name %s. Parsing of string failed: %s:%s",
RuntimeError("No variable found with the name %s. Parsing of string failed: %s:%s",
varName.c_str(), m_configName.c_str(),
newConfigLine.c_str());
}
if (varValue.find_first_of("\n") != std::string::npos)
{
LogicError(
"Newline character cannot be contained in the value of a variable which is resolved using $varName$ feature");
}
std::string varValue = varConfigValue;
if (varValue.find_first_of("\n") != std::string::npos)
LogicError("Newline characters are not allowed in the value of a variable which is resolved using $varName$ feature");
// Replace $varName$ with 'varValue'. Then continue the search for
// other variables in 'newConfigLine' string, starting at the point

Просмотреть файл

@ -282,7 +282,7 @@ class VariableSchema : public std::vector<VariableLayout>
Values<ElemType> CreateBuffers(const std::vector<size_t>& maxLengths)
{
if (maxLengths.size() != size())
throw std::exception("Expected max lengths for all variables.");
throw std::runtime_error("Expected max lengths for all variables.");
Values<ElemType> buffers(size());
for (size_t i = 0; i < size(); ++i)

Просмотреть файл

@ -134,4 +134,5 @@ public:
return randomizationrange == randomizeDisable;
}
};
} } }
}}}

Просмотреть файл

@ -29,7 +29,8 @@ public:
runtime_error(msg)
{
}
virtual void PrintError(const std::wstring& linePrefix) const = 0;
virtual std::wstring GetError(const std::wstring& /*linePrefix*/) const = 0;
virtual void PrintError(const std::wstring& /*linePrefix*/) const = 0;
};
// -----------------------------------------------------------------------
@ -619,9 +620,9 @@ public:
{
}
// ConfigArray(ConfigValuePtr && val) : firstIndex(0), values(std::vector<ConfigValuePtr>{ move(val) }) { }
pair<int, int> GetIndexRange() const
pair<int, int> GetIndexBeginEnd() const
{
return make_pair(firstIndex, firstIndex + (int) values.size() - 1);
return make_pair(firstIndex, firstIndex + (int)values.size());
}
// for use as a plain array: get size and verify that index range starts with 0
template <typename FAILFN>

Просмотреть файл

@ -411,7 +411,7 @@ static inline void byteswap(V &v) throw()
// execute a block with retry
// Block must be restartable.
// Use this when writing small files to those unreliable Windows servers.
// Use this when writing/reading small files to those unreliable Windows servers.
// TODO: This will fail to compile under VS 2008--we need an #ifdef around this
template <typename FUNCTION>
static void attempt(int retries, const FUNCTION &body)

Просмотреть файл

@ -592,7 +592,8 @@ void fgetfile(const std::wstring& pathname, std::vector<char>& buffer);
void fgetfile(FILE* f, std::vector<char>& buffer);
namespace msra { namespace files {
void fgetfilelines(const std::wstring& pathname, std::vector<char>& readbuffer, std::vector<std::string>& lines);
void fgetfilelines(const std::wstring& pathname, std::vector<char>& readbuffer, std::vector<std::string>& lines, int numberOfTries = 1);
static inline std::vector<std::string> fgetfilelines(const std::wstring& pathname)
{
std::vector<char> buffer;
@ -600,7 +601,7 @@ static inline std::vector<std::string> fgetfilelines(const std::wstring& pathnam
fgetfilelines(pathname, buffer, lines);
return lines;
}
std::vector<char*> fgetfilelines(const std::wstring& pathname, std::vector<char>& readbuffer);
std::vector<char*> fgetfilelines(const std::wstring& pathname, std::vector<char>& readbuffer, int numberOfTries = 1);
}}

Просмотреть файл

@ -1251,7 +1251,7 @@ public:
// BUGBUG: we only really support one archive file at this point
// read the TOC in one swoop
std::vector<char> textbuffer;
auto toclines = msra::files::fgetfilelines(tocpath, textbuffer);
auto toclines = msra::files::fgetfilelines(tocpath, textbuffer, 3);
// parse it one by one
size_t archiveindex = SIZE_MAX; // its index

Просмотреть файл

@ -16,6 +16,7 @@
#endif
#define _CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES 1
#include "Basics.h"
#include "basetypes.h" //for attemp()
#include "fileutil.h"
#include "ProgressTracing.h"
@ -1632,6 +1633,11 @@ static size_t fgetfilechars(const std::wstring& path, vector<char>& buffer)
return len;
}
static void fgetfilechars(const std::wstring& path, vector<char>& buffer, size_t& len)
{
len = fgetfilechars(path, buffer);
}
template <class LINES>
static void strtoklines(char* s, LINES& lines)
{
@ -1639,10 +1645,14 @@ static void strtoklines(char* s, LINES& lines)
lines.push_back(p);
}
void msra::files::fgetfilelines(const std::wstring& path, vector<char>& buffer, std::vector<std::string>& lines)
void msra::files::fgetfilelines(const std::wstring& path, vector<char>& buffer, std::vector<std::string>& lines, int numberOfTries)
{
// load it into RAM in one huge chunk
const size_t len = fgetfilechars(path, buffer);
size_t len = 0;
msra::util::attempt(numberOfTries, [&]() // (can be reading from network)
{
// load it into RAM in one huge chunk
fgetfilechars(path, buffer, len);
});
// parse into lines
lines.resize(0);
@ -1651,11 +1661,15 @@ void msra::files::fgetfilelines(const std::wstring& path, vector<char>& buffer,
}
// same as above but returning const char* (avoiding the memory allocation)
vector<char*> msra::files::fgetfilelines(const wstring& path, vector<char>& buffer)
vector<char*> msra::files::fgetfilelines(const wstring& path, vector<char>& buffer, int numberOfTries)
{
// load it into RAM in one huge chunk
const size_t len = fgetfilechars(path, buffer);
size_t len = 0;
msra::util::attempt(numberOfTries, [&]() // (can be reading from network)
{
// load it into RAM in one huge chunk
fgetfilechars(path, buffer, len);
});
// parse into lines
vector<char*> lines;
lines.reserve(len / 20);

Просмотреть файл

@ -18,6 +18,7 @@
#include "PreComputeNodes.h"
#include "EvaluationNodes.h"
#include "SpecialPurposeNodes.h"
#include "DeprecatedNodes.h" // (for SaveToDbnFile(), which is also deprecated)
#include "MPIWrapper.h" // TODO: does not belong here
#include <string>
#include <vector>
@ -391,13 +392,38 @@ void ComputationNetwork::Read(const wstring& fileName)
// node construction
// -----------------------------------------------------------------------
// non-static version needed because it accesses m_randomSeedOffset
// Excessively used by SimpleNetworkBuilder, but always after CreateLearnableParameter(), so we should really absorb it there
template <class ElemType>
void ComputationNetwork::InitLearnableParameters(const ComputationNodeBasePtr& node, const bool uniformInit, const unsigned long randomSeed, const ElemType initValueScale, bool initOnCPUOnly)
// helper of InitLearnableParameters()
// Note: This should really be done through an interface without <ElemType> that LearnableParameter would derive from.
// However, this is only for NDL (which is deprecated), so I'd rather not pollute the code with more interfaces just for a deprecated cause.
template<class ElemType>
static bool TryPostInitParameters(const ComputationNodeBasePtr& node, const wchar_t* initString, double initValue, unsigned long randomSeed, bool initOnCPUOnly)
{
auto learnableParameterNode = dynamic_pointer_cast<LearnableParameter<ElemType>>(node);
learnableParameterNode->InitRandom(uniformInit, randomSeed + GetRandomSeedOffset(), initValueScale, initOnCPUOnly);
if (!learnableParameterNode)
return false;
learnableParameterNode->PostInitParameters(initString, (ElemType) initValue, randomSeed, initOnCPUOnly);
return true;
}
// non-static version needed because it accesses m_randomSeedOffset
void ComputationNetwork::InitLearnableParameters(const ComputationNodeBasePtr& node,
const wchar_t* initString, // "uniform"|"gaussian"|"fixedValue"
double initValue, // scale | scale | value
unsigned long randomSeed /*= 0*/,
bool initOnCPUOnly /*= false*/) const
{
randomSeed += GetRandomSeedOffset();
if (TryPostInitParameters<float> (node, initString, initValue, randomSeed, initOnCPUOnly) ||
TryPostInitParameters<double>(node, initString, initValue, randomSeed, initOnCPUOnly))
return;
LogicError("InitLearnableParameters: Input node is not a LearnableParameter<float or double>");
}
// non-static version needed because it accesses m_randomSeedOffset
// Legacy version that is for random only.
void ComputationNetwork::RandomInitLearnableParameters(const ComputationNodeBasePtr& node, const bool uniformInit, const unsigned long randomSeed, const double initValueScale, bool initOnCPUOnly) const
{
InitLearnableParameters(node, uniformInit ? L"uniform" : L"gaussian", initValueScale, randomSeed, initOnCPUOnly);
}
bool ComputationNetwork::IsTypicalCriterionNode(ComputationNodeBasePtr nodePtr)
@ -714,35 +740,22 @@ void ComputationNetwork::DescribeNetworkUsingDot(list<ComputationArc>& arcs,
File fstream(outFile, FileOptions::fileOptionsText | FileOptions::fileOptionsWrite);
// get precompute node
vector<ComputationNodeBasePtr> PreComputedNodes;
vector<ComputationNodeBasePtr> preComputedNodes;
vector<ComputationNodeBasePtr> pastValueNodes;
vector<ComputationNodeBasePtr> futureValueNodes;
vector<ComputationNodeBasePtr> learnableParameters;
vector<ComputationNodeBasePtr> allnodes = GetAllNodes();
for (const auto& n : allnodes)
{
if (n->RequiresPreCompute())
PreComputedNodes.push_back(n);
}
preComputedNodes.push_back(n);
// get PastValue node
vector<ComputationNodeBasePtr> pastValueNodes;
for (const auto& n : allnodes)
{
if (n->OperationName() == OperationNameOf(PastValueNode) || n->OperationName() == L"Delay")
const auto operationName = n->OperationName();
if (operationName == OperationNameOf(PastValueNode) || operationName == L"Delay"/*legacy*/)
pastValueNodes.push_back(n);
}
// get FuturetValue node
vector<ComputationNodeBasePtr> futureValueNodes;
for (const auto& n : allnodes)
{
if (n->OperationName() == OperationNameOf(FutureValueNode))
else if (operationName == OperationNameOf(FutureValueNode))
futureValueNodes.push_back(n);
}
// get learnableParameters
vector<ComputationNodeBasePtr> learnableParameters;
for (const auto& n : allnodes)
{
if (n->OperationName() == OperationNameOf(LearnableParameter))
else if (operationName == OperationNameOf(LearnableParameter))
learnableParameters.push_back(n);
}
@ -763,7 +776,7 @@ void ComputationNetwork::DescribeNetworkUsingDot(list<ComputationArc>& arcs,
// criteria
fstream << FormSpecialNodes(dotcfg.m_CriteriaStyle, m_criterionNodes);
// pre-compute nodes
fstream << FormSpecialNodes(dotcfg.m_PrecomputingNodeStyle, PreComputedNodes);
fstream << FormSpecialNodes(dotcfg.m_PrecomputingNodeStyle, preComputedNodes);
// PastValue nodes
fstream << FormSpecialNodes(dotcfg.m_pastValueNodeStyle, pastValueNodes);
// FutureValue nodes
@ -1062,10 +1075,12 @@ void ComputationNetwork::PerformSVDecomposition(const map<wstring, float>& SVDCo
wstring rightChildName = name + L"_V";
shared_ptr<ComputationNode<ElemType>> pLeft = AddNodeToNetWithElemType(New<LearnableParameter<ElemType>>(m_deviceId, leftChildName, m, r));
shared_ptr<ComputationNode<ElemType>> pRight = AddNodeToNetWithElemType(New<LearnableParameter<ElemType>>(m_deviceId, rightChildName, r, n));
InitLearnableParameters(pLeft, L"fixedValue", 0); // follow the protocol; otherwise deferred initialization will overwrite the SVD values in validation
InitLearnableParameters(pRight, L"fixedValue", 0);
// TODO: We should be able to move instead of copy but it currently isn't straightforward
// due to redU and redVT being slices
pLeft->ValueAsMatrix() = redU.DeepClone();
pLeft->ValueAsMatrix() = redU.DeepClone();
pRight->ValueAsMatrix() = redVT.DeepClone();
// Step 3. Change the network hierarchy to include the SVD nodes
@ -1111,7 +1126,7 @@ public:
~DbnLayer() {};
};
// Save network in the format of the Microsoft-internal legacy "DBN.exe" tool (this function is not useful outside of Microsoft)
// Save network in the format of the Microsoft-internal legacy "DBN.exe" tool (this function is not useful outside of Microsoft).
template <class ElemType>
void ComputationNetwork::SaveToDbnFile(ComputationNetworkPtr net, const std::wstring& fileName) const
{
@ -1463,7 +1478,6 @@ void ComputationNetwork::SaveToDbnFile(ComputationNetworkPtr net, const std::wst
PutTag("EDBN");
}
template void ComputationNetwork::InitLearnableParameters<float>(const ComputationNodeBasePtr& node, const bool uniformInit, const unsigned long randomSeed, const float initValueScale, bool initOnCPUOnly);
template void ComputationNetwork::Read<float>(const wstring& fileName);
template void ComputationNetwork::ReadPersistableParameters<float>(File& fstream, bool create);
template void ComputationNetwork::PerformSVDecomposition<float>(const map<wstring, float>& SVDConfig, size_t alignedsize);
@ -1473,7 +1487,6 @@ template void ComputationNetwork::SetSeqParam<float>(ComputationNetworkPtr net,
const double& amf, const double& lmf, const double& wp, const double& bMMIfactor, const bool& sMBR);
template void ComputationNetwork::SaveToDbnFile<float>(ComputationNetworkPtr net, const std::wstring& fileName) const;
template void ComputationNetwork::InitLearnableParameters<double>(const ComputationNodeBasePtr& node, const bool uniformInit, const unsigned long randomSeed, const double initValueScale, bool initOnCPUOnly);
template void ComputationNetwork::Read<double>(const wstring& fileName);
template void ComputationNetwork::ReadPersistableParameters<double>(File& fstream, bool create);
template void ComputationNetwork::PerformSVDecomposition<double>(const map<wstring, float>& SVDConfig, size_t alignedsize);

Просмотреть файл

@ -332,14 +332,15 @@ public:
// node construction
// -----------------------------------------------------------------------
// non-static version needed because it accesses m_randomSeedOffset
// Excessively used by SimpleNetworkBuilder, but always after CreateLearnableParameter(), so we should really absorb it there
template <class ElemType>
// this function is only for use by NDL (deprecated)
void InitLearnableParameters(const ComputationNodeBasePtr& node,
const bool uniformInit,
const unsigned long randomSeed,
const ElemType initValueScale,
bool initOnCPUOnly = false);
const wchar_t* initString, // "uniform"|"gaussian"|"fixedValue"
double initValue, // scale | scale | value
unsigned long randomSeed = 0,
bool initOnCPUOnly = false) const;
// non-static version needed because it accesses m_randomSeedOffset
// Legacy version that is for random only.
void RandomInitLearnableParameters(const ComputationNodeBasePtr& node, const bool uniformInit, const unsigned long randomSeed, const double initValueScale, bool initOnCPUOnly = false) const;
template <typename N>
static shared_ptr<N> AsNodePtr(const ComputationNodeBasePtr& inode)
@ -522,6 +523,8 @@ public:
}
const std::vector<ComputationNodeBasePtr>& RootNodes() const { return m_allRoots; }
// these are specified as such by the user
const std::vector<ComputationNodeBasePtr>& FeatureNodes() const { return m_featureNodes ; }
const std::vector<ComputationNodeBasePtr>& LabelNodes() const { return m_labelNodes ; }
@ -751,7 +754,7 @@ public:
while (!result.second/*if already there*/ && result.first->second != node)
{
if (!makeUniqueName || node->NodeName().find_first_of(L".[]") == wstring::npos)
RuntimeError("AddNodeToNetIfNotYet: Duplicated name for %ls %ls operation.", node->NodeName().c_str(), node->OperationName().c_str());
RuntimeError("AddNodeToNetIfNotYet: Duplicated name for %ls %ls operation (%d vs. %d).", node->NodeName().c_str(), node->OperationName().c_str(), (int)node->m_uniqueNumericId, (int)result.first->second->m_uniqueNumericId);
node->SetName(L"_" + node->NodeName());
result = m_nameToNodeMap.insert(make_pair(node->NodeName(), node));
}
@ -1034,7 +1037,7 @@ public:
// data members
// -----------------------------------------------------------------------
unsigned long GetRandomSeedOffset()
unsigned long GetRandomSeedOffset() const
{
return m_randomSeedOffset;
}

Просмотреть файл

@ -106,13 +106,13 @@ void ComputationNetwork::FormRecurrentLoops(const ComputationNodeBasePtr& rootNo
assert(node->m_numNonDelayedParentsInLoop == 0); // (in PurgeStateForFormingRecurrentLoops())
}
for (let& node : nestedNodes)
{
{
for (auto& input : node->GetInputs())
{
{
if (input->m_loopId == node->m_loopId && GetRecurrenceSteppingDirection(node) == 0/*not a Delay node*/)
input->m_numNonDelayedParentsInLoop++; // count #parents of 'input' that are not delay nodes
}
}
}
// re-traverse the graph for all nestedNodes, starting with the first
// Then update m_nestedNodes with the re-traversed order.

Просмотреть файл

@ -76,7 +76,7 @@ void ComputationNetwork::CopySubTree(const ComputationNetwork& fromNet,
ComputationNodeBasePtr fromRoot = fromNet.GetNodeFromName(fromName);
for (const auto& fromNode : GetEvalOrder(fromRoot)) // BUGBUG: This probably will fail because the precomputed eval orders are invalid at this point.
for (const auto& fromNode : fromNet.GetEvalOrder(fromRoot)) // BUGBUG: This probably will fail because the precomputed eval orders are invalid at this point.
{
wstring fromNodeName = fromNode->NodeName();
wstring toNodeName = toNamePrefix + fromNodeName;

Просмотреть файл

@ -885,9 +885,9 @@ void ComputationNetwork::AllocateAllMatrices(const std::vector<ComputationNodeBa
if (performingBackPropagation)
{
if (outputValueNeededDuringBackProp.find(input) == outputValueNeededDuringBackProp.end())
outputValueNeededDuringBackProp[input] = input->OutputUsedInComputingInputNodesGradients();
outputValueNeededDuringBackProp[input] = input->NeedsGradient() && input->OutputUsedInComputingInputNodesGradients();
outputValueNeededDuringBackProp[input] |= node->InputUsedInComputingInputNodesGradients(i);
outputValueNeededDuringBackProp[input] |= (node->NeedsGradient() && node->InputUsedInComputingInputNodesGradients(i));
}
else
{

Просмотреть файл

@ -1,21 +1,12 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<ClCompile Include="..\Common\File.cpp">
<Filter>Common</Filter>
</ClCompile>
<ClCompile Include="..\Common\fileutil.cpp">
<Filter>Common</Filter>
</ClCompile>
<ClCompile Include="ComputationNode.cpp">
<Filter>Nodes</Filter>
</ClCompile>
<ClCompile Include="stdafx.cpp">
<Filter>Misc</Filter>
</ClCompile>
<ClCompile Include="..\Common\TimerUtility.cpp">
<Filter>Common</Filter>
</ClCompile>
<ClCompile Include="..\Common\BestGpu.cpp">
<Filter>GPU Interfacing</Filter>
</ClCompile>

Просмотреть файл

@ -380,4 +380,295 @@ public:
ScriptableObjects::ConfigurableRuntimeTypeRegister::Add<ComputationNetworkWithEdits> registerComputationNetworkWithEdits(L"ComputationNetworkWithEdits");
// ===================================================================
// CloneFunctionConfigLambda -- lambda to produce a clone of a network
// - creates a BrainScript function that carbon-copies a subsection of an existing network
// - the copy can be shallow or deep, where a deep copy gets its own copy of LearnableParameters
// - a shallow copy (parameters="shared") is a copy of all nodes that depend on the specified input(s),
// while all other nodes are shared from the original network section
// - a deep copy (parameters="lernable" or "constant") also copies all reachable LearnableParameters and their dependents
// - Input() nodes not listed as `inputNodes` are always shared
// - the source network may be a different network, e.g. loaded with BS.Network.Load()
// - a deep copy can be read-only (parameters="constant")
// - Note: multiple uses of the lambda will not share read-only parameters. This is trickier to implement than one might expect.
// - example use cases:
// - adaptation (KL): a frozen read-only copy of the starting model is used as a KL-regularizer
// - adaptation (DLR): an injected input transform is trained while the network is fixed
// - image: lower layers of ImageNet networks serve as immutable feature extractors for another image task
// - DSSM: applying the same network subsection to two inputs
// Usage:
// f = CloneFunction (inputNodes, outputNodes, parameters="learnable" /*|"constant"|"shared"*/)
// Parameters:
// - inputNodes: single node or array of nodes that will become parameters of the function.
// Commonly, this list will include all Input()s that the outputNode(s) depend on.
// - outputNodes: single node or dictionary of nodes that the function will emit
// Example:
// # create a BS function by copying a piece of network
// net = CloneFunction (network.features, network.logP)
// # apply the copy to a new input
// out = net (myFeatures)
// # This will create a copy of the subsection from network.features to network.logP
// # where all links to network.features get replaced by links to myFeatures.
// Example with multiple input and output nodes:
// # create a BS function by copying a piece of network
// # This specific example converts a network back into a BrainScript function.
// # It passes two input nodes --> the BS function will have 2 inputs;
// # and it passes a record of output nodes --> the BS function will return a record with the same member names
// network = BS.Network.Load ("some.dnn")
// net = CloneFunction ((network.features:network.labels), [ ce = network.ce ; errs = network.errs ])
// # create a network from the BS function
// features = Input (13)
// labels = Input (42)
// out = net (features, labels)
// criterionNodes = (out.ce)
// evaluationNodes = (out.errs)
// A specific example: Adapting a network, while using the original network as a regularizer (KLD)
// # load network
// network = BS.Network.Load ("some.dnn")
// # create a trainable clone and a read-only reference clone
// adaptNet = CloneFunction (network.features, [ z = network.z ], readOnly=false)
// # create a read-only clone
// refNet = CloneFunction (network.features, [ z = network.z ], readOnly=true)
// # create the main network
// features = Input (42)
// labels = Input (9000)
// z = adaptNet (features).z
// zRef = refNet (features).z
// # training criterion
// refWeight = 0.9
// kldLabels = labels * (1-refWeight) + Softmax (zRef) * refWeight # interpolate with ref output
// ce = CrossEntropyWithSoftmax (z, kldLabels)
// errs = ErrorPrediction (z, labels)
// criterionNodes = (ce)
// evaluationNodes = (errs)
// ===================================================================
class CloneFunctionConfigLambda : public ConfigLambda
{
// how we treat the parameters in the clone
enum class ParameterTreatment
{
learnable, // parameters are copied and kept trainable
constant, // parameters are copied and made immutable (e.g. for use of this as a fixed feature extractor)
shared // parameters are shared with where they came from (e.g. for parallel identical paths through a network)
};
public:
// -----------------------------------------------------------------------
// construction
// -----------------------------------------------------------------------
// Executing this function from BrainScript merely sets up a lambda, but does not actually create any clone.
// This is so that the function can be called multiple times in order to create multiple clones.
CloneFunctionConfigLambda(const IConfigRecordPtr configp) :
ConfigLambda(CreateParamNames(*configp), NamedParams(), [this](vector<ConfigValuePtr> &&args, NamedParams &&namedArgs, const std::wstring &exprName){ return this->DoClone(args, exprName); })
{
let& config = *configp;
// input nodes
inputNodes = GetInputNodes(config);
// output nodes
let outputNodesParam = config[L"outputNodes"]; // can be a node or a record
if (outputNodesParam.Is<ComputationNodeBase>()) // scalar case: result is a single node
outputNodes[L""] = outputNodesParam.AsPtr<ComputationNodeBase>(); // indicated by a "" node name in outputNodes[]
else // multi-valued case: result is a record of nodes
{
let& outputNodesRecord = outputNodesParam.AsRef<IConfigRecord>();
for (let& nodeName : outputNodesRecord.GetMemberIds())
outputNodes[nodeName] = outputNodesRecord[nodeName].AsPtr<ComputationNodeBase>();
if (outputNodes.empty())
InvalidArgument("CloneFunction: At least one output nodes must be specified.");
}
// treatment of parameters
wstring parametersOption = config[L"parameters"];
if (parametersOption == L"learnable") parameterTreatment = ParameterTreatment::learnable;
else if (parametersOption == L"constant") parameterTreatment = ParameterTreatment::constant;
else if (parametersOption == L"shared") parameterTreatment = ParameterTreatment::shared;
else InvalidArgument("CloneFunction: 'parameters' option must be 'learnable', 'constant', or 'shared'.");
// determine which nodes must be cloned
// - intersection of:
// - all indirect inputs of the specified outputs
// - all dependents of leaves
// - where leaves are:
// - specified inputs
// - unless parameters="shared": all parameters the specified outputs depend on
// determine all indirect inputs of the specified outputs
vector<ComputationNodeBasePtr> roots;
for (let& outputNodeKV : outputNodes)
roots.push_back(outputNodeKV.second);
let allInputs = ComputationNodeBase::EnumerateNodes(roots);
// take the chance to validate inputNodes
let allInputsSet = set<ComputationNodeBasePtr>(allInputs.begin(), allInputs.end());
for (let& input : inputNodes)
if (allInputsSet.find(input) == allInputsSet.end())
InvalidArgument("CloneFunction: No specified output depends on the specified input %ls.", input->NodeDescription().c_str());
// TODO: Is this really always an error? Are there valid cases where one would over-specify possible input nodes, even if they are not used/needed?
// determine all leaves and their dependents
dependentSet = set<ComputationNodeBasePtr>(inputNodes.begin(), inputNodes.end()); // start with the specified inputs
// determine all leaves and their dependents
for (let& node : allInputs)
{
// add parameters that are to be cloned to dependent set
if (parameterTreatment != ParameterTreatment::shared && node->Is<IFreezable>())
dependentSet.insert(node);
// if at least one input is in the dependent set then this node is, too
else
for (let& input : node->GetInputs())
if (dependentSet.find(input) != dependentSet.end())
dependentSet.insert(node);
}
#if 0
for (let& node : dependentSet)
fprintf(stderr, "CloneFunction: cloning %ls\n", node->NodeDescription().c_str());
#endif
// ensure none of the specified inputs reference back into the cloned set
// The function we extract must be separable.
for (let& input : inputNodes)
for (let& node : ComputationNodeBase::EnumerateNodes(vector<ComputationNodeBasePtr>{input})) // check all indirect inputs of each specified input
{
let iter = dependentSet.find(input);
if (iter != dependentSet.end() && *iter != input)
InvalidArgument("CloneFunction: specified function input %ls recursively depends on %ls inside the function.", input->NodeDescription().c_str(), node->NodeDescription().c_str());
}
}
private:
// get the input nodes from the config
static vector<ComputationNodeBasePtr> GetInputNodes(const IConfigRecord& config)
{
return ScriptableObjects::ConfigArray::FlattenedVectorFrom<ComputationNodeBasePtr>(config[L"inputNodes"]);
}
// create an array of parameter names for all inputs
// These names are never actually used, but required by the ConfigLambda constructor, and maybe useful for debugging.
static vector<wstring> CreateParamNames(const IConfigRecord& config)
{
let inputNodes = GetInputNodes(config);
vector<wstring> paramNames(inputNodes.size());
for (size_t i = 0; i < paramNames.size(); i++)
paramNames[i] = msra::strfun::wstrprintf(L"input_%d", (int)i);
return paramNames;
}
private:
// -----------------------------------------------------------------------
// the cloning operation itself
// -----------------------------------------------------------------------
// execute the lambda
// This will clone all nodes that the outputNodes depend on, and rewire inputs matching inputNodes to inputArgs.
ConfigValuePtr DoClone(const vector<ConfigValuePtr>& inputValues, const std::wstring& exprName)
{
// resolve the input arguments
vector<ComputationNodeBasePtr> inputs;
for (let& inputValue : inputValues)
inputs.push_back(inputValue.ResolveValue());
assert(inputValues.size() == inputNodes.size()); // (this should have been checked by BrainScript)
// do some logging
fprintf(stderr, "CloneFunction: ");
for (size_t i = 0; i < inputs.size(); i++)
fprintf(stderr, "%s%ls : %ls", i == 0 ? "(" : ", ", inputs[i]->NodeName().c_str(), inputs[i]->OperationName().c_str());
fprintf(stderr, ") -> ");
let singleOutput = outputNodes.size() == 1 && outputNodes.begin()->first.empty();
if (singleOutput)
fprintf(stderr, "%ls\n", outputNodes.begin()->second->NodeDescription().c_str());
else
{
fprintf(stderr, "[\n");
for (let& outputNodesKV : outputNodes)
fprintf(stderr, " %ls = %ls : %ls\n", outputNodesKV.first.c_str(), outputNodesKV.second->NodeName().c_str(), outputNodesKV.second->OperationName().c_str());
fprintf(stderr, "]\n");
}
// clone everything in the dependent set
// - specified inputs get mapped to actual parameters
// - all others get duplicated
// Note that at this point, the "shared" option has already been considered,
// and is reflected in whether parameters are included or not in 'dependentSet'.
map<ComputationNodeBasePtr, ComputationNodeBasePtr> clonedNodes;
size_t numCloned = 0;
for (size_t i = 0; i < inputNodes.size(); i++)
clonedNodes[inputNodes[i]] = inputs[i];
for (let& node : dependentSet)
{
// if already there then it's an input that we just mapped above
if (clonedNodes.find(node) != clonedNodes.end())
continue;
// clone
ComputationNodeBasePtr newNode;
let newName = exprName + L"." + node->GetName();
newNode = node->Duplicate(newName, CopyNodeFlags::copyNodeAll);
// make it read-only if desired
if (parameterTreatment == ParameterTreatment::constant && newNode->Is<IFreezable>())
newNode->As<IFreezable>()->FreezeParameters();
// and that's our cloned node
clonedNodes[node] = newNode;
numCloned++;
}
#if 0
for (let& nodeKV : clonedNodes)
fprintf(stderr, "CloneFunction: cloning %ls -> %ls (%d -> %d)\n", nodeKV.first->NodeDescription().c_str(), nodeKV.second->NodeDescription().c_str(), (int)nodeKV.first->m_uniqueNumericId, (int)nodeKV.second->m_uniqueNumericId);
#endif
// all cloned nodes' inputs must be redirected if they reference a node that has been cloned as well
size_t numRelinks = 0; // (statistics: how many inputs have we relinked?)
for (let& clonedNodesKV : clonedNodes)
{
let& node = clonedNodesKV.second;
let& inputs = node->GetInputs();
for (size_t i = 0; i < inputs.size(); i++)
{
fprintf(stderr, "%ls.inputs[%d] = %ls (%d)", node->NodeName().c_str(), (int)i, inputs[i]->NodeName().c_str(), (int)inputs[i]->m_uniqueNumericId);
let iter = clonedNodes.find(inputs[i]);
if (iter == clonedNodes.end())
continue;
// input is also a cloned node: relink
node->SetInput(i, iter->second);
fprintf(stderr, " ==> %ls (%d)\n", inputs[i]->NodeName().c_str(), (int)inputs[i]->m_uniqueNumericId);
numRelinks++;
}
}
fprintf(stderr, "CloneFunction: Cloned %d nodes and relinked %d inputs.\n", (int)numCloned, (int)numRelinks);
// return the result
// - if outputNodes was specified as a single node, return a single node
// - if specified as a record, then return a record with the specified names
if (singleOutput)
{
return NodeToConfigValuePtr(clonedNodes.find(outputNodes.begin()->second)->second);
}
else
{
auto record = make_shared<ConfigRecord>(nullptr, [](const std::wstring & msg){ RuntimeError("CloneFunction: %ls", msg.c_str()); });
for (let& outputNodesKV : outputNodes)
record->Add(outputNodesKV.first, [](const wstring&){}, move(NodeToConfigValuePtr(clonedNodes.find(outputNodesKV.second)->second)));
auto valuep = ConfigValuePtr(record, [](const std::wstring &) { LogicError("CloneFunction: Unexpected failure."); }, exprName);
return valuep;
}
}
ConfigValuePtr NodeToConfigValuePtr(ComputationNodeBasePtr node)
{
assert(node);
auto valuep = ConfigValuePtr(node, [](const std::wstring &) { LogicError("CloneFunction: Unexpected failure."); }, node->NodeName());
return valuep;
}
private:
// parameters
vector<ComputationNodeBasePtr> inputNodes;
map<wstring, ComputationNodeBasePtr> outputNodes;
ParameterTreatment parameterTreatment;
// other
set<ComputationNodeBasePtr> dependentSet; // set of nodes that outputNodes depend on
};
ScriptableObjects::ConfigurableRuntimeTypeRegister::Add<CloneFunctionConfigLambda> registerCloneFunctionConfigLambda(L"CloneFunctionConfigLambda");
}}}

Просмотреть файл

@ -38,7 +38,8 @@
#define CNTK_MODEL_VERSION_7 7 // ElemType tag in model file
#define CNTK_MODEL_VERSION_8 8 // DynamicAxis for inputs
#define CNTK_MODEL_VERSION_9 9 // Transpose flag in ConvolutionNode to support deconvolution.
#define CURRENT_CNTK_MODEL_VERSION CNTK_MODEL_VERSION_9
#define CNTK_MODEL_VERSION_10 10 // Learning rate multiplier for input nodes.
#define CURRENT_CNTK_MODEL_VERSION CNTK_MODEL_VERSION_10
extern bool g_shareNodeValueMatrices;
@ -184,7 +185,7 @@ protected: // TODO: should be fully encapsulated here
bool m_needsGradient; // true if this node or any children need a gradient to be computed (for own consumption or propagation to somewhere in the child tree)
bool m_valueSharable; // a flag is needed for memory share.
// If it is false (e.g., learnableParameters/InputValue and those nodes are solely induced by learnableParameters),
// If it is false (e.g., LearnableParameters/InputValue and those nodes are solely induced by LearnableParameters),
// it will never be released to memory pool
private:
bool m_isPartOfLoop; // true if this loop is part of a recurrent loop
@ -289,6 +290,9 @@ public:
m_gradientInitialized(false), m_nodeName(name == L"" ? CreateUniqNodeName() : name)
{
// TODO: should m_learningRateMultiplier be set to 0? Or should every node have a way to add its own say on the learning rate for all its inputs?
// we store a unique numeric number for every node that is constructed, as a debugging aid
static size_t uniqueNumericId = 0;
m_uniqueNumericId = uniqueNumericId++;
}
virtual ~ComputationNodeBase()
{
@ -429,7 +433,18 @@ private:
{
if (HasMBLayout())
LogicError("%ls: Minibatch data cannot be interpreted as a single 2D tensor.", NodeDescription().c_str());
else if (m_sampleLayout.GetRank() < 1 || m_sampleLayout.GetRank() > 2) // note: scalars are not stored as tensors of rank 0, but rather as 1-dim vectors. TODO: clean this up some day
bool notFlattenableTo2D = false;
for (size_t i = 2; i < m_sampleLayout.GetRank(); ++i)
{
if (!m_sampleLayout.CanFlatten(i))
{
notFlattenableTo2D = true;
break;
}
}
if (m_sampleLayout.GetRank() < 1 || ((m_sampleLayout.GetRank() > 2) && notFlattenableTo2D)) // note: scalars are not stored as tensors of rank 0, but rather as 1-dim vectors. TODO: clean this up some day
LogicError("%ls: Sample [%s] is not a column vector or matrix (1D or 2D tensor).", NodeDescription().c_str(), string(m_sampleLayout).c_str());
}
public:
@ -441,7 +456,11 @@ public:
size_t GetAsMatrixNumCols() const
{
CheckTensorIsMatrix();
return m_sampleLayout.GetRank() > 1 ? m_sampleLayout[1] : 1; // a column vector is also a Matrix
auto flattenedLayout = m_sampleLayout;
if (flattenedLayout.GetRank() > 2)
flattenedLayout.FlattenTo2DInPlace(1, "GetAsMatrixNumCols()");
return flattenedLayout.GetRank() > 1 ? flattenedLayout[1] : 1; // a column vector is also a Matrix
}
// setting/updating the dimensions of the node
@ -574,8 +593,8 @@ public:
else // a whole vector
{
ScriptableObjects::ConfigArrayPtr inputsArray = *inputsArg;
const auto range = inputsArray->GetIndexRange();
for (int i = range.first; i <= range.second; i++) // pull them. This will resolve all of them.
const auto range = inputsArray->GetIndexBeginEnd();
for (int i = range.first; i < range.second; i++) // pull them. This will resolve all of them.
inputs.push_back(inputsArray->At(i, [](const wstring&) { LogicError("GetInputs: out of bounds index while iterating??"); }));
}
}
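
The change from GetIndexRange to GetIndexBeginEnd switches the loop from an inclusive [first, last] range (hence the old <=) to the usual half-open [begin, end) convention (hence <). A trivial stand-alone illustration with a plain vector instead of a ConfigArrayPtr:

#include <cstdio>
#include <vector>

int main()
{
    std::vector<int> inputs = { 10, 20, 30 };

    // inclusive range [first, last], as with the old GetIndexRange(): note the <=
    for (int i = 0; i <= (int)inputs.size() - 1; i++)
        std::printf("%d ", inputs[i]);
    std::printf("\n");

    // half-open range [begin, end), as with GetIndexBeginEnd(): note the <
    for (int i = 0; i < (int)inputs.size(); i++)
        std::printf("%d ", inputs[i]);
    std::printf("\n");
}
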
@ -833,6 +852,8 @@ public:
// Helper that returns [a x b x c], including dynamic axes.
const std::string ShapeDescription() const;
// debugging helper
size_t m_uniqueNumericId; // (a unique handle for debugging)
protected:
// -----------------------------------------------------------------------
@ -1891,6 +1912,13 @@ public:
struct IRecurrentNode { virtual int GetRecurrenceSteppingDirection() const = 0; };
// =======================================================================
// IFreezable -- nodes that have parameters that can be frozen
// e.g. if a trained model is to be used as a fixed feature extractor for another
// =======================================================================
struct IFreezable { virtual void FreezeParameters() { } };
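
A hedged, stand-alone sketch of how the IFreezable hook is meant to be used. FakeParameterNode is hypothetical; the real LearnableParameter::FreezeParameters() later in this commit freezes by setting the learning-rate multiplier to 0:

#include <cstdio>
#include <memory>
#include <vector>

struct IFreezable { virtual void FreezeParameters() { } virtual ~IFreezable() { } };

// Hypothetical parameter-holding node: freezing just disables its updates.
class FakeParameterNode : public IFreezable
{
public:
    void FreezeParameters() override { m_learningRateMultiplier = 0; }
    float LearningRateMultiplier() const { return m_learningRateMultiplier; }
private:
    float m_learningRateMultiplier = 1;
};

int main()
{
    auto param = std::make_shared<FakeParameterNode>();
    std::vector<std::shared_ptr<IFreezable>> cloned = { param };
    for (auto& node : cloned)       // e.g. what CloneFunction(..., parameters="constant") would do
        node->FreezeParameters();
    std::printf("lr multiplier after freeze: %g\n", param->LearningRateMultiplier()); // prints 0
}
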
// =======================================================================
// PreComputedNodeBase -- interface implemented by ComputationNodes that precompute
// TODO: We can use this interface in more places.

View file

@ -139,6 +139,16 @@ public:
fstream << "PoolKind: " << (int)m_poolKind << "\n";
}
TensorShape KernelShape() const { return m_kernelShape; }
TensorShape Strides() const { return m_stride; }
std::vector<bool> Sharing() const { return m_sharing; }
std::vector<bool> AutoPad() const { return m_autoPad; }
TensorShape LowerPad() const { return m_lowerPad; }
TensorShape UpperPad() const { return m_upperPad; }
bool Transpose() const { return m_transpose; }
size_t MaxTempMemSizeInSamples() const { return m_maxTempMemSizeInSamples; }
PoolKind PoolingKind() const { return m_poolKind; }
protected:
TensorShape m_kernelShape;
TensorShape m_mapCount;
@ -148,7 +158,7 @@ protected:
TensorShape m_lowerPad;
TensorShape m_upperPad;
PoolKind m_poolKind;
bool m_transpose;
bool m_transpose; // means de-convolution ...I think
ImageLayoutKind m_imageLayout;
size_t m_maxTempMemSizeInSamples;
@ -339,6 +349,10 @@ public:
size_t mapCount = m_mapCount.GetNumElements();
size_t weightCols = kW * kH * inDims.m_numChannels;
// if mapCount is 0 then take it from the input matrix
if (mapCount == 0)
mapCount = Input(0)->GetAsMatrixNumRows();
// check/infer input [0] (weights)
// BUGBUG: For now, we treat the weights as a 2D matrix. They should be a tensor proper.
Input(0)->ValidateInferInputDimsFrom(TensorShape(mapCount, weightCols));

View file

@ -61,4 +61,109 @@ public:
template class SumColumnElementsNode<float>;
template class SumColumnElementsNode<double>;
// -----------------------------------------------------------------------
// (deprecated) PerDimMeanVarNormalizationNode (feature, mean, invStdDev)
// Computes
// output = (feature - mean) .* invStdDev
// where mean and invStdDev are meant to be single elements while features
// is minibatch data.
// Deprecated since it can be trivially expressed in BrainScript.
// -----------------------------------------------------------------------
template <class ElemType>
class PerDimMeanVarNormalizationNode : public ComputationNode<ElemType>, public NumInputs<3>
{
typedef ComputationNode<ElemType> Base;
UsingComputationNodeMembersBoilerplate;
static const std::wstring TypeName()
{
return L"PerDimMeanVarNormalization";
}
public:
DeclareConstructorFromConfigWithNumInputs(PerDimMeanVarNormalizationNode);
PerDimMeanVarNormalizationNode(DEVICEID_TYPE deviceId, const wstring& name)
: Base(deviceId, name)
{
}
virtual void /*ComputationNode::*/ BackpropTo(const size_t /*inputIndex*/, const FrameRange&) override
{
InvalidArgument("PerDimMeanVarNormalizationNode should only be called in the evaluation stage. Is any of its descendents a learnable parameter that requires gradient?");
}
virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override
{
size_t rank = DetermineElementwiseTensorRank();
auto output = ValueTensorFor(rank, fr);
auto input = Input(0)->ValueTensorFor(rank, fr);
auto mean = Input(1)->ValueTensorFor(rank, fr.AllowBroadcast());
auto invStdDev = Input(2)->ValueTensorFor(rank, fr.AllowBroadcast());
output.AssignDifferenceOf(input, mean); // output = input - mean
output.AssignElementwiseProductOf(output, invStdDev); // output *= invStdDev
}
virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override
{
Base::Validate(isFinalValidationPass);
InferMBLayoutFromInputsForStandardCase(isFinalValidationPass);
Input(1)->ValidateInferInputDimsFrom(Input(0)->GetSampleLayout());
Input(2)->ValidateInferInputDimsFrom(Input(0)->GetSampleLayout());
#if 1
// support for legacy models when the mean and variance vectors were stored as column vectors (N,1)
// This code will copy the shape of Input(0) (source) to Input(1) and Input(2) (target) if:
// 1. The source is a 3-tensor with shape 1x1xM
// 2. The target is a vector (i.e., a 2-tensor with shape Nx1)
// 3. Both targets have the same number of elements
// 4. The number of elements in the target (N) is the same as the number of elements in the source (M)
// Note: This is somewhat ugly [Jasha Droppo].
auto dimsA = Input(0)->GetSampleLayout().GetDims();
auto dimsB = Input(1)->GetSampleLayout().GetDims();
auto dimsC = Input(2)->GetSampleLayout().GetDims();
if (
// Test condition 1.
(dimsA.size() == 3 && dimsA[0] == 1 && dimsA[1] == 1) &&
// Test condition 2.
(dimsB.size() == 2 && dimsB[1] == 1) &&
(dimsC.size() == 2 && dimsC[1] == 1) &&
// Test condition 3. and condition 4.
(dimsB[0] == dimsC[0] && dimsB[0] == dimsA[2])
)
{
// for error messages
string dimsBstring = string(Input(1)->GetSampleLayout());
string dimsCstring = string(Input(2)->GetSampleLayout());
// reshape Input(1)
Input(1)->SetDims(TensorShape(dimsA), false);
fprintf(stderr, "\n%ls %ls operation: For legacy compatibility, the sample layout of second input (%ls %ls operation) was patched to [%s] (from [%s])\n",
NodeName().c_str(), OperationName().c_str(), Input(1)->NodeName().c_str(), Input(1)->OperationName().c_str(), string(Input(1)->GetSampleLayout()).c_str(), dimsBstring.c_str());
// reshape Input(2)
Input(2)->SetDims(TensorShape(dimsA), false);
fprintf(stderr, "\n%ls %ls operation: For legacy compatibility, the sample layout of third input (%ls %ls operation) was patched to [%s] (from [%s])\n",
NodeName().c_str(), OperationName().c_str(), Input(2)->NodeName().c_str(), Input(2)->OperationName().c_str(), string(Input(2)->GetSampleLayout()).c_str(), dimsCstring.c_str());
}
#endif
if (isFinalValidationPass)
{
if (!Input(0)->GetSampleLayout().IsElementwiseCompatibleWith(Input(1)->GetSampleLayout()) || !Input(0)->GetSampleLayout().IsElementwiseCompatibleWith(Input(2)->GetSampleLayout()))
InvalidArgument("PerDimMeanVarNormalizationNode: All inputs should have same sample layout.");
}
SetDims(Input(0));
}
};
template class PerDimMeanVarNormalizationNode<float>;
template class PerDimMeanVarNormalizationNode<double>;
}}}

View file

@ -18,47 +18,107 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// TODO: add -Node to the class name
// -----------------------------------------------------------------------
// BUGBUG: If called after random init, this will reset to 0.
// TODO: Need to remember the init parameters, and do it here.
template <class ElemType>
void LearnableParameter<ElemType>::InitShape(const TensorShape& shape)
{
SetDims(shape, false);
UpdateFunctionValuesSize(); // this allocates the matrix
Value().SetValue(0); // TODO: invalidate instead
Value().Invalidate();
}
// constructor from config
// Parameterization is a little wicked. An older version required specifying the type of initialization
// ("uniform|fixedValue|gaussian|fromFile|fromLiteral") and then a parameter with a matching name.
// Now the matching parameter alone is sufficient, making it less verbose.
// - init="uniform|gaussian" (random init, scaled by arg initValueScale)
// - init="zero"
// - initValue=scalar --> initialize from this value
// - initValue=array or nested array --> initialize from this value, infer dimensions --TODO: not implemented yet
// - initFromFilePath="..." --> read from a data file. This infers the dimensions from the file.
// deprecated:
// - init="fixedValue", value from 'value' --deprecated in favor of just specifying initValue
// - init="fromFile", value from 'initFromFilePath' --deprecated in favor of just specifying 'initFromFilePath'
// - init="fromLiteral", value from 'initFromLiteral' --deprecated in favor of initValue=array expression
// The forms that infer the dimensions have different BrainScript names. TODO: need one for fromFile
// TODO: All forms that require specified dimensions but contain zeroes (to be updated by graph)
// will need to do deferred initialization, or have a way to repeat it.
template <class ElemType>
LearnableParameter<ElemType>::LearnableParameter(const ScriptableObjects::IConfigRecordPtr configp) :
LearnableParameter(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"shape"))
{
// TODO: Change dimensions to take a generic tensor instead. That will be a (minor) breaking change that will require fix-ups when converting from NDL to BrainScript.
AttachInputsFromConfig(configp, this->GetExpectedNumInputs());
// parameters[rows, [cols=1]] plus other optional parameters (learningRateMultiplier=[1|0|float], init=[uniform|gaussian|fixedvalue], initValueScale=[1|float], value=[0|float])
AttachInputsFromConfig(configp, this->GetExpectedNumInputs()); // (we have none; this checks that none are provided)
// Parameter{dims, other optional parameters: learningRateMultiplier=[1|0|float], init=[uniform|gaussian|], initValueScale=[1|float], initValue=[''|float], initFromFilePath=[''|string]}
// constant vs. parameter (with optional LR scaling)
if (configp->Exists(L"learningRateMultiplier"))
SetLearningRateMultiplier(configp->Get(L"learningRateMultiplier"));
else if (configp->Exists(L"needsGradient") || configp->Exists(L"needGradient") || configp->Exists(L"computeGradient"))
InvalidArgument("Deprecated parameter names needsGradient|needGradient|computeGradient are not supported in BrainScript. Use learningRateMultiplier instead.");
// initialization
wstring initString = configp->Get(L"init");
if (initString == L"fixedValue")
Value().SetValue((ElemType) configp->Get(L"value"));
else if (initString == L"uniform" || initString == L"gaussian")
wstring initFromFilePath = configp->Get(L"initFromFilePath");
let& initValue = configp->Get(L"initValue"); // may be empty string, scalar, or array
// infer the type of the initial value from what other optional args are given
if (initString.empty())
{
// TODO: add these options also to old NDL
if (!initFromFilePath.empty()) // 'initFromFilePath' given --> initialize from file
initString = L"fromFile"; // (note: this is only used internally; external use is deprecated)
else if (!initValue.Is<ScriptableObjects::String>()) // 'initValue' given (not an empty string) --> initialize from value
{
if (initValue.Is<ScriptableObjects::Double>())
initString = L"fromValue"; // (note: this is only used internally)
else if (initValue.Is<ScriptableObjects::ConfigArray>())
initString = L"fromValueArray"; // (note: this is only used internally)
else
InvalidArgument("'initValue' must be numerical");
}
else if (!initValue.AsRef<ScriptableObjects::String>().empty()) // it's a string: must be empty
InvalidArgument("LearnableParameter: 'initValue' must be an empty string or not a string.");
else // no pertinent optional arguments given: default to 'uniform'
initString = L"uniform"; // default is uniform
}
// deferred variants
// Deferred means that this kind of initialization is allowed when some dimensions are unspecified, and thus happens during Validate().
if (initString == L"uniform" || initString == L"gaussian") // random init
{
m_initString = initString;
// TODO: add more randomization types, and use a more meaningful scaling
// Keras uses "normal" instead of "gaussian". We can use that here too to denote the one with sane scaling, and deprecate "gaussian" with a warning.
static unsigned long randomSeed = 1;
int forcedRandomSeed = configp->Get(L"randomSeed"); // forcing a specific random seed is useful for testing to get repeatable initialization independent of evaluation order
InitRandom((initString == L"uniform"), forcedRandomSeed < 0 ? randomSeed++ : (unsigned long) forcedRandomSeed, configp->Get(L"initValueScale"), configp->Get(L"initOnCPUOnly"));
m_randomSeed = forcedRandomSeed < 0 ? randomSeed++ : (unsigned long)forcedRandomSeed;
m_initValueScale = configp->Get(L"initValueScale");
m_initOnCPUOnly = configp->Get(L"initOnCPUOnly");
}
else if (initString == L"fromFile")
else if (initString == L"zero")
{
m_initString = L"fromValue";
m_initValue = 0;
}
else if (initString == L"fromValue") // from 'initValue'
{
m_initString = initString;
m_initValue = initValue;
}
// non-deferred variants
// Here the dimensions are always known at this point, so we don't need/want to save all those parameters.
else if (initString == L"fromValueArray") // from 'initValue' which has array form
InvalidArgument("'initValue' for arrays not yet implemented"); // array not yet implemented
else if (initString == L"fromFile") // load from 'iniFromFilePath'
{
wstring initFromFilePath = configp->Get(L"initFromFilePath");
if (initFromFilePath.empty())
RuntimeError("initFromFilePath parameter must be provided when using \"fromFile\" initialization method");
InitFromFile(initFromFilePath);
m_initString.clear();
}
else if (initString == L"fromLiteral")
// legacy
else if (initString == L"fixedValue") // deprecated. Use initValue=... instead
{
m_initString = L"fromValue";
m_initValue = (ElemType)configp->Get(L"value");
}
else if (initString == L"fromLiteral") // deprecated. Use initValue=array instead
{
wstring initFromLiteral = configp->Get(L"initFromLiteral");
if (initFromLiteral.empty())
@ -66,9 +126,49 @@ LearnableParameter<ElemType>::LearnableParameter(const ScriptableObjects::IConfi
size_t numRows, numCols;
auto array = File::LoadMatrixFromStringLiteral<ElemType>(msra::strfun::utf8(initFromLiteral), numRows, numCols);
InitFromArray(array, numRows, numCols);
m_initString.clear();
}
else
RuntimeError("init must be one of the values of [ uniform | gaussian | fixedValue | fromFile ]");
// initialize
// This will be repeated if the matrix gets resized due to dimension inference.
LazyInitParameters();
if (!m_initString.empty())
fprintf(stderr, "%ls: Initializating Parameter[%s] as %ls later when dimensions are fully known.\n", NodeDescription().c_str(), string(GetSampleLayout()).c_str(), m_initString.c_str());
}
// variant of above from NDL. Must be called right after plain constructor.
// This overwrites any pending deferred initialization with a new one.
// Initialization is done immediately if all dimensions are already known, otherwise kept pending.
template <class ElemType>
void LearnableParameter<ElemType>::PostInitParameters(const wstring& initString, // "uniform"|"gaussian"|"fixedValue"
ElemType initValue, // scale | scale | value
unsigned long randomSeed /*= 0*/,
bool initOnCPUOnly /*= false*/)
{
if (initString == L"uniform" || initString == L"gaussian") // random init
{
m_initString = initString;
m_randomSeed = randomSeed;
m_initValueScale = initValue;
m_initOnCPUOnly = initOnCPUOnly;
}
else if (initString == L"fixedValue") // from constant value
{
m_initString = L"fromValue";
m_initValue = initValue;
}
else
LogicError("PostInitParameters: invalid init string '%ls'", m_initString.c_str());
// initialize
// This will be repeated if the matrix gets resized due to dimension inference.
LazyInitParameters();
if (!m_initString.empty())
fprintf(stderr, "%ls: Initializating Parameter[%s] as %ls later when dimensions are fully known.\n", NodeDescription().c_str(), string(GetSampleLayout()).c_str(), m_initString.c_str());
}
// initialize with random numbers
@ -162,9 +262,25 @@ void LearnableParameter<ElemType>::InitFromArray(const std::vector<ElemType>& ar
VerifyDataSize(Value()); // sanity check
}
// TODO: Move this error check there, since this is called only from one place.
template <class ElemType>
void LearnableParameter<ElemType>::ReviseFromFile(const std::wstring& reviseFromFilePath)
{
try
{
InitFromFile(reviseFromFilePath);
}
catch (const std::exception & e)
{
RuntimeError("ReviseFromFile: Failed to reload %ls %ls operation from file %ls: %s", NodeName().c_str(), OperationName().c_str(), reviseFromFilePath.c_str(), e.what());
}
}
template <class ElemType>
void LearnableParameter<ElemType>::Save(File& fstream) const /*override*/
{
if (!m_initString.empty())
LogicError("LearnableParameter: Cannot Save() before deferred initialization has completed.");
Base::Save(fstream);
fstream << m_learningRateMultiplier;
m_sampleLayout.Save(fstream);
@ -204,12 +320,31 @@ void LearnableParameter<ElemType>::Load(File& fstream, size_t modelVersion) /*ov
LoadValue(fstream);
SetDims(sampleLayout, false); // note: call this after LoadValue() since LoadValue() overwrites m_sampleLayout
VerifyDataSize(Value()); // sanity check
m_initString.clear(); // deferred initialization not possible after loading
}
template <class ElemType>
/*virtual*/ void LearnableParameter<ElemType>::CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const /*override*/
{
Base::CopyTo(nodeP, newName, flags);
if (flags & CopyNodeFlags::copyNodeValue)
{
auto node = dynamic_pointer_cast<LearnableParameter<ElemType>>(nodeP);
node->m_initString = m_initString;
node->m_randomSeed = m_randomSeed;
node->m_initValueScale = m_initValueScale;
node->m_initOnCPUOnly = m_initOnCPUOnly;
node->m_initValue = m_initValue;
}
}
// computation functions don't do anything for parameter nodes
template <class ElemType>
/*virtual*/ void LearnableParameter<ElemType>::UpdateFunctionMBSize() /*override*/
{
if (!m_initString.empty())
LogicError("LearnableParameter: Deferred initialization has not been completed until first call to UpdateFunctionMBSize().");
}
template <class ElemType>
@ -226,18 +361,70 @@ template <class ElemType>
template <class ElemType>
/*virtual*/ void LearnableParameter<ElemType>::Validate(bool isFinalValidationPass) /*override*/
{
//fprintf(stderr, "Validate %ls: called in init state '%ls' with dims [%s]\n", NodeDescription().c_str(), m_initString.c_str(), string(GetSampleLayout()).c_str());
Base::Validate(isFinalValidationPass);
m_pMBLayout = nullptr; // this node does not hold mini-batch data
// lazy init if we got a dimension now
#if 0 // fake old buggy behavior before deferred initialization
if (isFinalValidationPass && !m_initString.empty() && (m_initString != L"fromValue" || m_initValue != 0))
{
fprintf(stderr, "Validate: deferred '%ls' initialization patched to fromValue 0 for back compat\n", m_initString.c_str());
m_initString = L"fromValue";
m_initValue = 0;
}
#endif
#if 0
// We call this here and in Validate(true), since we don't know which gets called first.
// TODO: Actually this should never be needed, because each time dimensions change, we init.
// So if we get here without fully-known dimensions, this call won't do anything either.
if (isFinalValidationPass)
LazyInitParameters();
#endif
}
// deferred initialization
// We support a feature that some dimensions can be specified as 0, and get inferred.
// This is only possible for initialization methods that do not come with their own dimensions
// (such as initialization from an array literal).
// When initialization succeeded (all dimensions known), the pending initialization is cleared.
// This is called from constructor and InferInputDimsFrom().
// BUGBUG: We cannot really enforce the calling sequence. Save() verifies that this has been cleared.
// Note that this may be called AFTER Validate(true) (still during validation, but after final validation of this node).
template <class ElemType>
void LearnableParameter<ElemType>::LazyInitParameters()
{
// if no lazy init pending then we are done
if (m_initString.empty())
return;
// if not all dimensions are known yet, we cannot proceed: keep it pending
if (GetSampleLayout().GetNumElements() == 0)
return;
// OK, proceed
if (m_initString == L"fromValue")
{
fprintf(stderr, "%ls: Initializing Parameter[%s] <- %f.\n", NodeDescription().c_str(), string(GetSampleLayout()).c_str(), m_initValue);
Value().SetValue(m_initValue);
}
else if (m_initString == L"uniform" || m_initString == L"gaussian")
{
fprintf(stderr, "%ls: Initializing Parameter[%s] <- %ls(seed=%d, scale=%f, onCPU=%s).\n", NodeDescription().c_str(), string(GetSampleLayout()).c_str(), m_initString.c_str(), (int)m_randomSeed, m_initValueScale, m_initOnCPUOnly ? "true" : "false");
InitRandom((m_initString == L"uniform"), m_randomSeed, m_initValueScale, m_initOnCPUOnly);
}
else
LogicError("LearnableParameter: Invalid value of m_initString '%ls' for deferred initialization for %ls.", m_initString.c_str(), NodeDescription().c_str());
// and remember that we are done
m_initString.clear();
}
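
A self-contained sketch of the deferred-initialization protocol described above, with hypothetical names (LazyParam, RequestInit, InferShape): the pending request is remembered as a string, replayed whenever the shape changes, and cleared once every dimension is known. CNTK's real code follows the same protocol through m_initString and LazyInitParameters():

#include <cstdio>
#include <string>
#include <utility>
#include <vector>

class LazyParam
{
public:
    void RequestInit(std::string how)         { m_pendingInit = std::move(how); TryInit(); }
    void InferShape(std::vector<size_t> dims) { m_dims = std::move(dims);       TryInit(); }
private:
    void TryInit()
    {
        if (m_pendingInit.empty())
            return;                              // nothing pending
        size_t numElements = m_dims.empty() ? 0 : 1;
        for (size_t d : m_dims)
            numElements *= d;
        if (numElements == 0)
            return;                              // some dimension still unknown: keep it pending
        std::printf("initializing %zu elements as '%s'\n", numElements, m_pendingInit.c_str());
        m_pendingInit.clear();                   // done; a Save() could now assert this is empty
    }
    std::string m_pendingInit;
    std::vector<size_t> m_dims;
};

int main()
{
    LazyParam p;
    p.RequestInit("uniform");    // shape unknown -> stays pending
    p.InferShape({ 0, 128 });    // a 0 dimension remains -> still pending
    p.InferShape({ 512, 128 });  // fully known -> initialization runs now
}
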
// called from ComputationNode::ValidateInferInputDimsFrom()
// In case of an error, this function just backs out without updating.
// The caller must verify the dimensions.
// This is a bit weird since it is called after this node has been Validated once.
// BUGBUG: This will clear out any random initialization to 0. So currently this is not usable for most cases.
template <class ElemType>
void LearnableParameter<ElemType>::InferInputDimsFrom(const TensorShape& otherShape)
{
//fprintf(stderr, "InferInputDimsFrom %ls: called in init state '%ls' with dims [%s], offered new dims [%s]\n", NodeDescription().c_str(), m_initString.c_str(), string(GetSampleLayout()).c_str(), string(otherShape).c_str());
const auto& thisShape = GetSampleLayout();
// see where we stand with our shape
@ -248,7 +435,10 @@ void LearnableParameter<ElemType>::InferInputDimsFrom(const TensorShape& otherSh
// infer at least one dimension
if (otherShape.GetRank() == 0 || otherShape.GetNumElements() == 0)
return; // LogicError("ValidateInferInputDimsFrom: Inferred dimensions must not be empty.");
if (m_initString.empty())
LogicError("InferInputDimsFrom: Attempted to infer dimensions, with initialization completed or no deferred initialization pending.");
// if no dimensions have been set at all, copy otherShape
// Don't verify dimensions in this case, because the node may have explicitly been defined as a vector of 0 elements.
bool hasAnyDim = false;
@ -266,7 +456,20 @@ void LearnableParameter<ElemType>::InferInputDimsFrom(const TensorShape& otherSh
newDims[i] = otherShape[i];
InitShape(TensorShape(newDims));
}
fprintf(stderr, "%ls %ls operation: Tensor shape was inferred as [%s].\n", NodeName().c_str(), OperationName().c_str(), string(GetSampleLayout()).c_str());
fprintf(stderr, "%ls operation: Tensor shape was inferred as [%s].\n", NodeDescription().c_str(), string(GetSampleLayout()).c_str());
// initialize the values
// We call this here and in Validate(true), since we don't know which gets called first.
// Note: It seems that this is not necessary, and that Validate(true) is only called after inference.
#if 0 // fake old buggy behavior before deferred initialization
if (m_initString != L"fromValue" || m_initValue != 0)
{
fprintf(stderr, "InferInputDimsFrom: deferred '%ls' initialization patched to fromValue 0 for back compat\n", m_initString.c_str());
m_initString = L"fromValue";
m_initValue = 0;
}
#endif
LazyInitParameters();
}
template <class ElemType>
@ -286,6 +489,12 @@ template <class ElemType>
PrintNodeValuesToFile(printValues, printMetadata, fstream);
}
template <class ElemType>
/*virtual*/ void LearnableParameter<ElemType>::FreezeParameters() /*override*/ // from IFreezable
{
SetLearningRateMultiplier(0);
}
template class LearnableParameter<float>;
template class LearnableParameter<double>;

View file

@ -21,7 +21,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// -----------------------------------------------------------------------
template <class ElemType>
class LearnableParameter : public ComputationNode<ElemType>, public NumInputs<0>
class LearnableParameter : public ComputationNode<ElemType>, public NumInputs<0>, public IFreezable
{
typedef ComputationNode<ElemType> Base; UsingComputationNodeMembersBoilerplate;
static const std::wstring TypeName() { return L"LearnableParameter"; }
@ -29,69 +29,57 @@ class LearnableParameter : public ComputationNode<ElemType>, public NumInputs<0>
void InitShape(const TensorShape& shape);
public:
LearnableParameter(DEVICEID_TYPE deviceId, const wstring& name)
: Base(deviceId, name)
// this constructor is always run (all other constructors call this one)
LearnableParameter(DEVICEID_TYPE deviceId, const wstring& name) :
Base(deviceId, name)
{
SetLearningRateMultiplier(1.0f); // enable normal learning by default
MarkValueNonSharable();
m_initString = L"fromValue"; // default init is with 0; typically overwritten
m_initValue = 0;
}
LearnableParameter(DEVICEID_TYPE deviceId, const wstring& name, const TensorShape& shape)
: Base(deviceId, name)
LearnableParameter(DEVICEID_TYPE deviceId, const wstring& name, const TensorShape& shape) :
LearnableParameter(deviceId, name)
{
SetLearningRateMultiplier(1.0f);
MarkValueNonSharable();
InitShape(shape);
LazyInitParameters();
}
LearnableParameter(DEVICEID_TYPE deviceId, const wstring& name, size_t rows, size_t cols)
: LearnableParameter(deviceId, name, TensorShape(rows, cols))
LearnableParameter(DEVICEID_TYPE deviceId, const wstring& name, size_t rows, size_t cols) :
LearnableParameter(deviceId, name, TensorShape(rows, cols))
{
}
LearnableParameter(const ScriptableObjects::IConfigRecordPtr configp);
// initialize with random numbers
// if 'initOnCPUOnly' then always init on CPU, making initialization consistent across both (for testing)
void InitRandom(const bool uniformInit, const unsigned long randomSeed, const ElemType initValueScale, bool initOnCPUOnly);
// initialize after plain constructor; for use by NDL
void PostInitParameters(const std::wstring& initString, // "uniform"|"gaussian"|"fixedValue"
ElemType initValue, // scale | scale | value
unsigned long randomSeed = 0,
bool initOnCPUOnly = false);
// initialize by reading a matrix from a text file
void InitFromFile(const std::wstring& initFromFilePath);
private:
// initialize with random numbers
// If 'initOnCPUOnly' then always init on CPU, making initialization consistent across both (for testing).
void InitRandom(const bool uniformInit, const unsigned long randomSeed, const ElemType initValueScale, bool initOnCPUOnly);
// helper to initialize from a matrix read from a text file or a string literal
void InitFromArray(const std::vector<ElemType>& array, size_t numRows, size_t numCols);
// deferred initialization
void LazyInitParameters();
public:
// reload parameters from file
// This is called from MEL.
// TODO: Move this error check there, since this is called only from one place.
void ReviseFromFile(const std::wstring& reviseFromFilePath)
{
#if 1
try
{
InitFromFile(reviseFromFilePath);
}
catch(const std::exception & e)
{
RuntimeError("ReviseFromFile: Failed to reload %ls %ls operation from file %ls: %s", NodeName().c_str(), OperationName().c_str(), reviseFromFilePath.c_str(), e.what());
}
#else
size_t numRows, numCols;
auto array = File::LoadMatrixFromTextFile<ElemType>(reviseFromFilePath, numRows, numCols);
size_t nRows, nCols;
DetermineDataSize(nRows, nCols); // BUGBUG: private
if (numRows != nRows || numCols != nCols)
{
RuntimeError("Error in ReviseFromFile for node %ls using file %ls: original size (%d x %d) vs current size (%d x %d)",
m_nodeName.c_str(), reviseFromFilePath.c_str(), (int) nRows, (int) nCols, (int) numRows, (int) numCols);
}
Value().SetValue(numRows, numCols, m_deviceId, array.data(), matrixFlagNormal);
VerifyDataSize(Value()); // sanity check
#endif
}
void ReviseFromFile(const std::wstring& reviseFromFilePath);
virtual void Save(File& fstream) const override;
virtual void Load(File& fstream, size_t modelVersion) override;
virtual void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override;
// computation functions don't do anything for parameter nodes
virtual void UpdateFunctionMBSize() override;
virtual void /*ComputationNode::*/ ForwardProp(const FrameRange&) override;
@ -106,6 +94,17 @@ public:
void InferInputDimsFrom(const TensorShape& otherShape);
virtual void DumpNodeInfo(const bool printValues, const bool printMetadata, File& fstream) const override;
// called from CloneFunction(..., parameters="constant")
virtual void FreezeParameters() override; // from IFreezable
private:
// init parameters for deferred initialization (which happens in Validate())
std::wstring m_initString; // if non-empty then deferred initialization is needed. Gets cleared upon completion of deferred init.
unsigned long m_randomSeed;
ElemType m_initValueScale;
bool m_initOnCPUOnly;
ElemType m_initValue;
};
// -----------------------------------------------------------------------
@ -162,7 +161,7 @@ class InputValueBase : public ComputationNode<ElemType>, public NumInputs<0>, pu
typedef ComputationNode<ElemType> Base;
UsingComputationNodeMembers;
void Init(const TensorShape& sampleLayout, bool isSparse, const std::wstring axisName)
void Init(const TensorShape& sampleLayout, bool isSparse, const std::wstring axisName, float learningRateMultiplier = 0)
{
m_isSparse = isSparse;
MarkValueNonSharable();
@ -171,7 +170,7 @@ class InputValueBase : public ComputationNode<ElemType>, public NumInputs<0>, pu
SetDims(sampleLayout, HasMBLayout()); // also called when reloading a file. Then we have an MBLayout, otherwise not yet
UpdateFunctionValuesSize(); // we must allocate the matrix so that the readers get objects with valid row dimensions (some readers expect that)
SetLearningRateMultiplier(0);
SetLearningRateMultiplier(learningRateMultiplier);
m_dynamicAxisNodeName = axisName;
}
@ -225,9 +224,9 @@ protected:
Init(ImageDimensions::AsTensorShape(configp->Get(L"imageWidth"), configp->Get(L"imageHeight"), configp->Get(L"imageChannels"), ImageLayoutKindFrom(configp->Get(L"imageLayout"))), isSparse, axisName);
}
public:
virtual const std::wstring GetRequestedDynamicAxis() const { return m_dynamicAxisNodeName; }
public:
virtual void Save(File& fstream) const override
{
Base::Save(fstream);
@ -239,6 +238,8 @@ public:
unsigned int nrAxes = 1;
fstream << nrAxes;
fstream << m_dynamicAxisNodeName;
fstream << m_learningRateMultiplier;
}
virtual void Load(File& fstream, size_t modelVersion) override
@ -268,7 +269,12 @@ public:
}
else
m_dynamicAxisNodeName = L""; // Use default
Init(sampleLayout, m_isSparse, m_dynamicAxisNodeName);
float learningRateMultiplier = 0;
if (modelVersion >= CNTK_MODEL_VERSION_10)
fstream >> learningRateMultiplier;
Init(sampleLayout, m_isSparse, m_dynamicAxisNodeName, learningRateMultiplier);
}
// InputValue must not resize its inputs because that might destroy it. It should already have the correct size.

View file

@ -463,6 +463,8 @@ public:
Base::AllocateGradientMatricesForInputs(matrixPool);
}
size_t OutputRank() const { return m_outputRank; }
private:
size_t m_outputRank;
};

View file

@ -376,117 +376,12 @@ private:
template class InvStdDevNode<float>;
template class InvStdDevNode<double>;
// -----------------------------------------------------------------------
// PerDimMeanVarNormalizationNode (feature, mean, invStdDev)
// Computes
// output = (feature - mean) .* invStdDev
// where mean and invStdDev are meant to be single elements while features
// is minibatch data.
// TODO: Why do we need this? Why not use Plus and ElementTimes?
// -----------------------------------------------------------------------
template <class ElemType>
class PerDimMeanVarNormalizationNode : public ComputationNode<ElemType>, public NumInputs<3>
{
typedef ComputationNode<ElemType> Base;
UsingComputationNodeMembersBoilerplate;
static const std::wstring TypeName()
{
return L"PerDimMeanVarNormalization";
}
public:
DeclareConstructorFromConfigWithNumInputs(PerDimMeanVarNormalizationNode);
PerDimMeanVarNormalizationNode(DEVICEID_TYPE deviceId, const wstring& name)
: Base(deviceId, name)
{
}
virtual void /*ComputationNode::*/ BackpropTo(const size_t /*inputIndex*/, const FrameRange&) override
{
InvalidArgument("PerDimMeanVarNormalizationNode should only be called in the evaluation stage. Is any of its descendents a learnable parameter that requires gradient?");
}
virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override
{
size_t rank = DetermineElementwiseTensorRank();
auto output = ValueTensorFor(rank, fr);
auto input = Input(0)->ValueTensorFor(rank, fr);
auto mean = Input(1)->ValueTensorFor(rank, fr.AllowBroadcast());
auto invStdDev = Input(2)->ValueTensorFor(rank, fr.AllowBroadcast());
output.AssignDifferenceOf(input, mean); // output = input - mean
output.AssignElementwiseProductOf(output, invStdDev); // output *= invStdDev
}
virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override
{
Base::Validate(isFinalValidationPass);
InferMBLayoutFromInputsForStandardCase(isFinalValidationPass);
Input(1)->ValidateInferInputDimsFrom(Input(0)->GetSampleLayout());
Input(2)->ValidateInferInputDimsFrom(Input(0)->GetSampleLayout());
#if 1
// support for legacy models when the mean and variance vectors were stored as column vectors (N,1)
// This code will copy the shape of Input(0) (source) to Input(1) and Input(2) (target) if:
// 1. The source is a 3-tensor with shape 1x1xM
// 2. The target is a vector (i.e., a 2-tensor with shape Nx1)
// 3. Both targets have the same number of elements
// 4. The number of elements in the target (N) is the same as the number of elements in the source (M)
// Note: This is somewhat ugly [Jasha Droppo].
auto dimsA = Input(0)->GetSampleLayout().GetDims();
auto dimsB = Input(1)->GetSampleLayout().GetDims();
auto dimsC = Input(2)->GetSampleLayout().GetDims();
if (
// Test condition 1.
(dimsA.size() == 3 && dimsA[0] == 1 && dimsA[1] == 1) &&
// Test condition 2.
(dimsB.size() == 2 && dimsB[1] == 1) &&
(dimsC.size() == 2 && dimsC[1] == 1) &&
// Test condition 3. and condition 4.
(dimsB[0] == dimsC[0] && dimsB[0] == dimsA[2])
)
{
// for error messages
string dimsBstring = string(Input(1)->GetSampleLayout());
string dimsCstring = string(Input(2)->GetSampleLayout());
// reshape Input(1)
Input(1)->SetDims(TensorShape(dimsA), false);
fprintf(stderr, "\n%ls %ls operation: For legacy compatibility, the sample layout of second input (%ls %ls operation) was patched to [%s] (from [%s])\n",
NodeName().c_str(), OperationName().c_str(), Input(1)->NodeName().c_str(), Input(1)->OperationName().c_str(), string(Input(1)->GetSampleLayout()).c_str(), dimsBstring.c_str());
// reshape Input(2)
Input(2)->SetDims(TensorShape(dimsA), false);
fprintf(stderr, "\n%ls %ls operation: For legacy compatibility, the sample layout of third input (%ls %ls operation) was patched to [%s] (from [%s])\n",
NodeName().c_str(), OperationName().c_str(), Input(2)->NodeName().c_str(), Input(2)->OperationName().c_str(), string(Input(2)->GetSampleLayout()).c_str(), dimsCstring.c_str());
}
#endif
if (isFinalValidationPass)
{
if (!Input(0)->GetSampleLayout().IsElementwiseCompatibleWith(Input(1)->GetSampleLayout()) || !Input(0)->GetSampleLayout().IsElementwiseCompatibleWith(Input(2)->GetSampleLayout()))
InvalidArgument("PerDimMeanVarNormalizationNode: All inputs should have same sample layout.");
}
SetDims(Input(0));
}
};
template class PerDimMeanVarNormalizationNode<float>;
template class PerDimMeanVarNormalizationNode<double>;
// -----------------------------------------------------------------------
// PerDimMeanVarDeNormalizationNode (feature, mean, invStdDev)
// Computes
// output = feature ./ invStdDev + mean
// with parameters the same as PerDimMeanVarNormalizationNode.
// TODO: Why do we need this? Why not use Plus and ElementDividedBy?
// TODO: Deprecate like PerDimMeanVarNormalizationNode as soon as we have a test case. Or just delete it.
// -----------------------------------------------------------------------
template <class ElemType>

View file

@ -464,6 +464,9 @@ public:
LogicError("Unrecognized direction in DelayedValueNodeBase");
}
int TimeStep() const { return m_timeStep; }
ElemType InitialActivationValue() const { return m_initialActivationValue; }
protected:
ElemType m_initialActivationValue; // starting value for hidden activation vector at boundary
Matrix<ElemType> m_delayedValue; // saves the activation of the previous step that this node points to

View file

@ -34,9 +34,9 @@ template <class ElemType>
if (flags & CopyNodeFlags::copyNodeValue)
{
auto node = dynamic_pointer_cast<ReduceElementsNode<ElemType>>(nodeP);
node->m_axis = m_axis;
node->m_operation = m_operation;
node->m_op = m_op;
node->m_axis = m_axis;
node->m_operation = m_operation;
node->m_reductionOp = m_reductionOp;
}
}
@ -64,7 +64,7 @@ template <class ElemType>
auto input = Input(0)->ValueTensorFor(rank, fr);
// the actual operation is a Copy with reduction, where the magic is in the reduction op
result.DoUnaryOpOf(0, input, 1, ElementWiseOperator::opCopy, m_op);
result.DoUnaryOpOf(0, input, 1, ElementWiseOperator::opCopy, m_reductionOp);
// note: we can implement "Mean" by passing 1/dim for alpha
}
@ -79,13 +79,46 @@ template <class ElemType>
auto sliceInputGrad = Input(0)->GradientTensorFor(rank, fr); // ...to this one
// gradients are not as simple as passing an op-code, unfortunately
switch (m_op)
switch (m_reductionOp)
{
case ElementWiseOperator::opSum:
// "Sum": broadcast the gradient
sliceInputGrad.AddCopyOf(sliceOutputGrad);
break;
case ElementWiseOperator::opLogSum:
{
auto input = Input(inputIndex)->ValueTensorFor(rank, fr);
auto output = ValueTensorFor(rank, fr.AllowBroadcast());
// Let: f(x, y, z) = log(exp x + exp y + exp z)
// For the derivative we get:
// df / dx = exp(x)/exp(f)
// = exp(x - f)
sliceInputGrad.AddElementwiseProductWithExpOfDiffOf(sliceOutputGrad, input, output);
}
break;
case ElementWiseOperator::opMin:
case ElementWiseOperator::opMax:
auto input = Input(inputIndex)->ValueTensorFor(rank, fr);
auto output = ValueTensorFor(rank, fr.AllowBroadcast());
// POTENTIAL PROBLEM:
// For ReduceMin/Max there are combinations of input values where the gradient is not defined because the function has an edge at these points.
// E.g. for ReduceMin this is the case when the minimum input value is attained by several inputs at the same time.
// In these cases there is no correct gradient. The question is whether this could lead to any problems.
// Let's look at two scenarios where this might happen:
//
// * Scenario 1: The input comes from a layer of nodes like e.g. ReLU and some of them might operate in the regime where they clip to a constant value.
// In this case it's not a problem that the input gradient is somewhat ill-defined, as the derivative of the input nodes concerned will be zero anyway.
//
// * Scenario 2: The input data comes directly from the training data. Here bad gradients don't matter, as we wouldn't want to propagate gradients to the training data.
//
// So, as we don't have a better solution yet and it probably has no impact, let's stay with the current solution.
// Also note that for Clip, Min, Max, and ReLU we have the same kind of problem.
sliceInputGrad.AddCopyIfEqualOf(input, output, sliceOutputGrad);
break;
// more coming
// "LogPlus": softmax
@ -93,18 +126,18 @@ template <class ElemType>
// df / dx_i = 1 / (sum_j exp x_j) * exp x_i = (Softmax(x))_i = exp(x_i - ReduceLogPlus(x))
// targetGradient = gradientFromTop .* Exp (inputValue - outputValue) --TODO: verify
// i.e. compute difference of input and output, then Exp in-place. No, would need temp memory. So needs its own opcode AddScaledExpOfDiff(). Ternary.
// "Max": Copy the gradient only to the max value. targetGradient += gradientFromTop .* (outputValue == inputValue). Needs its own opcode. --TODO : verify
}
}
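
The opLogSum branch relies on the identity d/dx_i log(sum_j exp x_j) = exp(x_i - f), i.e. the softmax of the inputs. A stand-alone sanity check of that identity on plain scalars (no CNTK tensors involved), comparing the analytic gradient against a central finite difference:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

static double LogSumExp(const std::vector<double>& x)
{
    double m = *std::max_element(x.begin(), x.end()); // subtract the max for numerical stability
    double s = 0;
    for (double v : x)
        s += std::exp(v - m);
    return m + std::log(s);
}

int main()
{
    std::vector<double> x = { 0.5, -1.0, 2.0 };
    const double f = LogSumExp(x);
    const double eps = 1e-6;
    for (size_t i = 0; i < x.size(); ++i)
    {
        double analytic = std::exp(x[i] - f);          // the gradient factor the node propagates
        std::vector<double> xp = x, xm = x;
        xp[i] += eps;
        xm[i] -= eps;
        double numeric = (LogSumExp(xp) - LogSumExp(xm)) / (2 * eps);
        std::printf("i=%zu  analytic=%.6f  numeric=%.6f\n", i, analytic, numeric);
    }
}
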
template <class ElemType>
/*virtual*/ bool ReduceElementsNode<ElemType>::OutputUsedInComputingInputNodesGradients() const /*override*/
{
switch (m_op)
switch (m_reductionOp)
{
case ElementWiseOperator::opSum: return false;
// will be different e.g. for LogPlus, Max, and Min
case ElementWiseOperator::opSum: return false;
case ElementWiseOperator::opLogSum: return true;
case ElementWiseOperator::opMin: return true;
case ElementWiseOperator::opMax: return true;
}
LogicError("Should not get here.");
}
@ -112,25 +145,31 @@ template <class ElemType>
template <class ElemType>
/*virtual*/ bool ReduceElementsNode<ElemType>::InputUsedInComputingInputNodesGradients(size_t inputIndex) const /*override*/
{
switch (m_op)
switch (m_reductionOp)
{
case ElementWiseOperator::opSum: return false;
// will be different for LogPlus, Max, and Min
case ElementWiseOperator::opSum: return false;
case ElementWiseOperator::opLogSum: return true;
case ElementWiseOperator::opMin: return true;
case ElementWiseOperator::opMax: return true;
}
LogicError("Should not get here.");
}
// map the operation specific as a string to an ElementWiseOperator to pass to
// map the operation specified as a string to an ElementWiseOperator value.
template <class ElemType>
void ReduceElementsNode<ElemType>::ValidateOp()
{
#if 1 // legacy with initial experiments, delete this soon
if (m_operation == L"Plus") m_op = ElementWiseOperator::opSum;
if (m_operation == L"Plus") m_reductionOp = ElementWiseOperator::opSum;
else
#endif
if (m_operation == L"Sum") m_op = ElementWiseOperator::opSum;
if (m_operation == L"Sum") m_reductionOp = ElementWiseOperator::opSum;
else if (m_operation == L"LogSum") m_reductionOp = ElementWiseOperator::opLogSum;
else if (m_operation == L"Min") m_reductionOp = ElementWiseOperator::opMin;
else if (m_operation == L"Max") m_reductionOp = ElementWiseOperator::opMax;
// more here
else InvalidArgument("%ls was given an invalid operation code '%ls'. Allowed are: 'Sum'. And a few more soon.", NodeDescription().c_str(), m_operation.c_str());
else InvalidArgument("%ls was given an invalid operation code '%ls'. Allowed are: 'Sum', 'Max', 'Min'.", NodeDescription().c_str(), m_operation.c_str());
}
template <class ElemType>

View file

@ -196,7 +196,7 @@ class ReduceElementsNode : public ComputationNode<ElemType>, public NumInputs<1>
void ValidateOp();
public:
ReduceElementsNode(DEVICEID_TYPE deviceId, const wstring& name, const std::wstring& operation = std::wstring(), int axis = 0) :
Base(deviceId, name), m_operation(operation), m_axis(axis), m_op((ElementWiseOperator)-1/*invalid*/)
Base(deviceId, name), m_operation(operation), m_axis(axis), m_reductionOp((ElementWiseOperator)-1/*invalid*/)
{
if (!m_operation.empty()) // verify validity already here out of courtesy (would otherwise be caught in Validate())
ValidateOp();
@ -220,7 +220,7 @@ public:
private:
int m_axis;
std::wstring m_operation; // the operation as a string, e.g. "Sum", see ValidateOp()
ElementWiseOperator m_op; // the operation mapped to our internal opCode
ElementWiseOperator m_reductionOp; // the reduction operation mapped to our internal opCode
};
// -----------------------------------------------------------------------

View file

@ -28,7 +28,7 @@ TraceNode<ElemType>::TraceNode(const ScriptableObjects::IConfigRecordPtr configp
m_message = (const std::wstring&)configp->Get(L"say");
m_logFirst = configp->Get(L"logFirst");
m_logFrequency = configp->Get(L"logFrequency");
m_logGradientToo = false; // configp->Get(L"logGradientToo"); not yet implemented
m_logGradientToo = configp->Get(L"logGradientToo");
m_formattingOptions = WriteFormattingOptions(*configp);
m_onlyUpToRow = configp->Get(L"onlyUpToRow");
m_onlyUpToT = configp->Get(L"onlyUpToT");
@ -75,7 +75,31 @@ template <class ElemType>
auto result = ValueTensorFor(rank, fr);
auto input = Input(0)->ValueTensorFor(rank, fr);
result.AssignCopyOf(input);
// log the content
// do the tracing
Log(fr, false/*means log value*/);
}
template <class ElemType>
/*virtual*/ void TraceNode<ElemType>::BackpropTo(const size_t inputIndex, const FrameRange& fr) /*override*/
{
assert(inputIndex == 0); inputIndex;
size_t rank = DetermineElementwiseTensorRank();
auto sliceOutputGrad = GradientTensorFor(rank, fr); // propagate from this one...
auto sliceInputGrad = Input(0)->GradientTensorFor(rank, fr); // ...to this one
sliceInputGrad.AddCopyOf(sliceOutputGrad);
// do the tracing
if (m_logGradientToo)
Log(fr, true/*means log gradient*/);
}
// log value or gradient
template <class ElemType>
/*virtual*/ void TraceNode<ElemType>::Log(const FrameRange& fr, bool logGradientInstead) const
{
if (m_numMBsRun == 1)
{
const auto prologue = m_formattingOptions.Processed(NodeName(), m_formattingOptions.prologue, m_numMBsRun);
@ -94,30 +118,18 @@ template <class ElemType>
let timeRange = fr.GetTimeRange();
fprintf(stderr, "------- Trace["); // --- for better visual separability from actual content
if (fr.IsAllFrames())
fprintf(stderr, "*");
else if (timeRange.second == timeRange.first+1)
fprintf(stderr, "%d", (int)timeRange.first);
;
else if (timeRange.second == timeRange.first + 1)
fprintf(stderr, "%d", (int)timeRange.first);
else if (timeRange.second > timeRange.first + 1)
fprintf(stderr, "%d..%d", (int)timeRange.first, (int)timeRange.second-1);
fprintf(stderr, "] %ls --> %s\n", m_message.c_str(), Input(0)->FormatOperationPrototype("").c_str());
fprintf(stderr, "] %ls %s--> %s\n", m_message.c_str(), logGradientInstead ? "(gradient) " : "", Input(0)->FormatOperationPrototype("").c_str());
Input(0)->WriteMinibatchWithFormatting(stderr, fr, m_onlyUpToRow, m_onlyUpToT, m_formattingOptions.transpose, m_formattingOptions.isCategoryLabel, m_formattingOptions.isSparse, m_labelMapping,
sequenceSeparator, sequencePrologue, sequenceEpilogue, elementSeparator, sampleSeparator,
valueFormatString, /*outputGradient=*/false);
valueFormatString, logGradientInstead);
}
}
template <class ElemType>
/*virtual*/ void TraceNode<ElemType>::BackpropTo(const size_t inputIndex, const FrameRange& fr) /*override*/
{
assert(inputIndex == 0); inputIndex;
size_t rank = DetermineElementwiseTensorRank();
auto sliceOutputGrad = GradientTensorFor(rank, fr); // propagate from this one...
auto sliceInputGrad = Input(0)->GradientTensorFor(rank, fr); // ...to this one
sliceInputGrad.AddCopyOf(sliceOutputGrad);
}
template <class ElemType>
/*virtual*/ void TraceNode<ElemType>::Validate(bool isFinalValidationPass) // override
{

View file

@ -47,6 +47,9 @@ public:
virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; }
virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; }
private:
void Log(const FrameRange& fr, bool logGradientInstead) const;
private:
// configuration
std::wstring m_message;

View file

@ -1534,8 +1534,8 @@ template class DropoutNode<float>;
template class DropoutNode<double>;
// -----------------------------------------------------------------------
// BatchNormalizationNode (input, scale, bias, runMean, runInvStdDev, spatial,
// normalizationTimeConstant = 0, blendTimeConstant = 0,
// BatchNormalizationNode (input, scale, bias, runMean, runInvStdDev,
// spatial, normalizationTimeConstant = 0, blendTimeConstant = 0,
// epsilon = 0.00001,
// useCntkEngine = true, imageLayout = 'cudnn')
//
@ -1553,51 +1553,48 @@ template class DropoutNode<double>;
// where gamma and beta are trainable parameters(represented as LearnableParameter).
//
// * input is the input of the batch normalization node
// * scale is a LearnableParameter that stores scale vector(gamma term in the equation above).
// * bias is a LearnableParameter that stores bias vector(beta term). scale and bias must have the same dimensions which must be equal
// * scale is a LearnableParameter that stores scale vector (gamma term in the equation above).
// * bias is a LearnableParameter that stores bias vector (beta term). scale and bias must have the same dimensions which must be equal
// to the input dimensions in case of spatial = false or number of output convolution feature maps in case of spatial = true.
// * runMean is the running mean which is used during evaluation phase and might be used during training as well.
// It is represented as a LearnableParameter with the same dimensions as scale and bias.
// * runInvStdDev is the running inverse square root of variance (so InvStdDev = 1 / sqrt(var + epsilon)).
// It is represented as a LearnableParameter with the same dimensions as scale and bias.
// * spatial is a flag that specifies whether to compute mean / var for each feature in a minibatch independently or, in case of convolutional layers, per feature map.
// TODO: This must be configured in a generic fashion where tensor axes are chosen along which parameters are tied.
// * normalizationTimeConstant is the time constant which is used to compute running average of mean and variance.
// Value 0 (default) means there will be no exponential smoothing and running mean / variance will always have values computed for the last seen mininbatch.
// Value 1#INF (infinity)means running values are "frozen" (i.e.will not be updated).
// Value 0 (default) means there will be no exponential smoothing and running mean/variance will always have values computed for the last seen minibatch.
// Value 1#INF (infinity) means running values are "frozen" (i.e. will not be updated).
// * blendTimeConstant is the time constant which allows specifying how much of the running mean / var should be "blended" into the mean / var of the current minibatch.
// Value 0 (default) means no blending will happen and only the current minibatch statistics will be used.
// Value 1#INF (infinity)means only running mean / var will be used(this is used, for example, in evaluation phase).
// Value 1#INF (infinity) means only running mean / var will be used (this is used, for example, in the evaluation phase).
// * epsilon is a conditioner constant used in computing InvStdDev
// * useCntkEngine is a boolean flag that specifies which batch normalization implementation to use : CNTK or cuDNN - based.
// * imageLayout is the image layout.Only cudnn is supported.
// * useCntkEngine is a boolean flag that specifies which batch normalization implementation to use: CNTK or cuDNN-based.
// * imageLayout is the image layout. Only cudnn is supported at present.
// -----------------------------------------------------------------------
template <class ElemType>
class BatchNormalizationNode : public ComputationNode<ElemType>, public NumInputs<5>
class BatchNormalizationNode : public ComputationNodeNonLooping<ElemType>, public NumInputs<5>, public IFreezable
{
typedef ComputationNode<ElemType> Base;
UsingComputationNodeMembersBoilerplate;
static const std::wstring TypeName()
{
return L"BatchNormalization";
}
typedef ComputationNodeNonLooping<ElemType> Base; UsingComputationNodeMembersBoilerplate;
static const std::wstring TypeName() { return L"BatchNormalization"; }
public:
BatchNormalizationNode(DEVICEID_TYPE deviceId, const wstring& name)
: Base(deviceId, name), m_spatial(false), m_normTimeConst(0), m_blendTimeConst(0), m_epsilon(0), m_useCntkEngine(true),
BatchNormalizationNode(DEVICEID_TYPE deviceId, const wstring& name) :
Base(deviceId, name), m_spatial(false), m_normTimeConst(0), m_blendTimeConst(0), m_epsilon(0), m_useCntkEngine(true),
m_mbCount(0), m_imageLayoutKind(ImageLayoutKind::CHW)
{
}
BatchNormalizationNode(DEVICEID_TYPE deviceId, const wstring& name, bool spatial, double normalizationTimeConstant, double blendTimeConstant,
double epsilon, bool useCntkEngine, ImageLayoutKind imageLayoutKind)
: Base(deviceId, name), m_spatial(spatial), m_normTimeConst(normalizationTimeConstant), m_blendTimeConst(blendTimeConstant),
m_epsilon(epsilon), m_useCntkEngine(useCntkEngine), m_imageLayoutKind(imageLayoutKind), m_mbCount(0)
double epsilon, bool useCntkEngine, ImageLayoutKind imageLayoutKind) :
Base(deviceId, name), m_spatial(spatial), m_normTimeConst(normalizationTimeConstant), m_blendTimeConst(blendTimeConstant),
m_epsilon(epsilon), m_useCntkEngine(useCntkEngine), m_imageLayoutKind(imageLayoutKind), m_mbCount(0)
{
}
BatchNormalizationNode(const ScriptableObjects::IConfigRecordPtr configp)
: BatchNormalizationNode(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"spatial"),
configp->Get(L"normalizationTimeConstant"), configp->Get(L"blendTimeConstant"),
configp->Get(L"epsilon"), configp->Get(L"useCntkEngine"),
ImageLayoutKindFrom(configp->Get(L"imageLayout")))
BatchNormalizationNode(const ScriptableObjects::IConfigRecordPtr configp) :
BatchNormalizationNode(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"spatial"),
configp->Get(L"normalizationTimeConstant"), configp->Get(L"blendTimeConstant"),
configp->Get(L"epsilon"), configp->Get(L"useCntkEngine"),
ImageLayoutKindFrom(configp->Get(L"imageLayout")))
{
AttachInputsFromConfig(configp, this->GetExpectedNumInputs());
}
@ -1689,46 +1686,110 @@ public:
}
}
void BackpropTo(const size_t inputIndex, const FrameRange& fr) override
private: // time-constant conversions
// map time constants to exp avg factor
// This is the factor for the current MB's estimate (1-factor is used for the previous value of the running stats).
double ComputeExpAvgFactor() const
{
// in inference mode, only use long-term mean and do not update running estimates
if (!Environment().IsTraining())
return 0; // (m_normTimeConst == infinity) no new contribution from current minibatch
// REVIEW alexeyk: hack, m_normTimeConst < 0 is used to denote corpus-level statistics (without forgetting factor).
if (m_normTimeConst < 0)
return 1.0 / (1.0 + m_mbCount); // (this is the hack case) TODO: verify this formula; shouldn't we use #samples instead of MB count?
// Convert to per-minibatch factor. The limit, positive infinity, means that running mean/var parameters are "frozen"
// that is, do not require updates.
// The code below special-cases two boundary cases, but those are just the limit cases of the main formula.
double numSamples = (double)GetMBLayout()->GetActualNumSamples();
if (!isfinite(m_normTimeConst)) // infinite
return 0; // no new contribution from current minibatch (infinitely long memory)
else if (m_normTimeConst > 0) // not zero
return 1.0 - exp(-numSamples / m_normTimeConst); // interpolate expAvgFactor * MB stats + (1-expAvgFactor) * prev running stats
else // zero
return 1.0; // don't use running stats at all
}
// map sample count to blend factor
// This is the interpolation weight for the running statistics (the current MB statistics are weighted with 1-this).
double ComputeBlendFactor() const
{
// in inference mode, only use long-term mean and do not update running estimates
if (!Environment().IsTraining())
return 1.0; // (m_blendTimeConst == infinity) estimate is taken 100% from the long-term running estimate
// convert to blend factor (= weight for running stats)
// The code below special-cases two boundary cases, but those are just the limit cases of the main formula.
double numSamples = (double)GetMBLayout()->GetActualNumSamples();
if (!isfinite(m_blendTimeConst)) // infinite weight for prior stats
return 1.0; // only use running statistics
else if (m_blendTimeConst > 0) // not zero
return m_blendTimeConst / (m_blendTimeConst + numSamples); // interpolate blendFactor * running stats + (1-blendFactor) * MB stats
else // zero
return 0; // no weight for prior stats, only use MB stats
}
public:
// Note: This function assumes that inputIndex=0 is called before the others.
// BUGBUG: The node should not make assumptions in which order the inputs' derivates are computed. It currently assumes to start with 0.
// BUGBUG: If the input has no learnables (e.g. using BN instead of corpus mean/var norm), this will not be called for inputIndex=0 at all.
virtual void BackpropToNonLooping(size_t inputIndex) override
{
FrameRange fr(Input(0)->GetMBLayout());
if (inputIndex == 0) // derivative with respect to the input.
{
auto sliceOutputGrad = GradientFor(fr);
auto sliceInputValue = Input(0)->ValueFor(fr);
const Matrix<ElemType>& scale = Input(1)->Value();
const Matrix<ElemType>& bias = Input(2)->Value();
const Matrix<ElemType>& runMean = Input(3)->Value();
const Matrix<ElemType>& runInvStdDev = Input(4)->Value();
auto sliceInputGrad = Input(0)->GradientFor(fr);
m_dScale->Resize(scale);
// The mean used in Forward() is either saveMean or runMean.
// This is decided by the engine, which communicates back the decision by returning
// an empty saveMean in case runMean should be used. Likewise for stddev.
let& actualMean = !m_saveMean->IsEmpty() ? *m_saveMean : runMean; // empty if only the running mean is used
let& actualInvStdDev = !m_saveInvStdDev->IsEmpty() ? *m_saveInvStdDev : runInvStdDev;
m_dScale->Resize(scale); // gradients for scale and bias get stored here
m_dBias->Resize(bias);
double blendFactor = ComputeBlendFactor(); // interpolation weight for the running statistics (the current MB statistics are weighted with 1-this)
// Compute all derivatives in one step. Save derivatives with respect to scale and bias in temp matrices.
m_bnEng->Backward(sliceInputValue, sliceOutputGrad, sliceInputGrad, scale,
*m_saveMean, *m_saveInvStdDev, *m_dScale, *m_dBias);
m_bnEng->Backward(sliceInputValue, sliceOutputGrad, // (in) input from below, gradient from above
sliceInputGrad, // (out) gradient for data input goes here
scale, // (in) out of scale and bias, only scale is needed in gradient propagation
blendFactor, // (in) smoothing weight for running stats (1=use only running stats)
actualMean, actualInvStdDev, // (in) actual mean/stddev values used in ForwardProp()
*m_dScale, *m_dBias); // (out) gradients for scale and bias
}
else if (inputIndex == 1) // derivative with respect to the scale
{
// Derivative with respect to the scale was precomputed during input derivative computation.
Matrix<ElemType>& grad = Input(1)->Gradient();
grad.SetValue(grad.GetNumRows(), grad.GetNumCols(), grad.GetDeviceId(), m_dScale->Data());
// BUGBUG: ^^ This should add the gradient, not overwrite it.
}
else if (inputIndex == 2) // derivative with respect to the bias
{
// Derivative with respect to the bias was precomputed during input derivative computation.
Matrix<ElemType>& grad = Input(2)->Gradient();
grad.SetValue(grad.GetNumRows(), grad.GetNumCols(), grad.GetDeviceId(), m_dBias->Data());
// BUGBUG: ^^ Also here, this should add the gradient, not overwrite it.
}
// No derivatives with respect to running mean and InvStdDev.
}
virtual bool OutputUsedInComputingInputNodesGradients() const override
{
// The BatchNormalizationNode does not require its output value for computing
// the gradients of its input nodes
return false;
}
virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; }
void ForwardProp(const FrameRange& fr) override
virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override
{
FrameRange fr(Input(0)->GetMBLayout());
Matrix<ElemType> sliceInputValue = Input(0)->ValueFor(fr);
const Matrix<ElemType>& scale = Input(1)->Value();
@ -1744,42 +1805,16 @@ public:
Matrix<ElemType> sliceOutputValue = ValueFor(fr);
double expAvgFactor;
double blendFactor;
if (!Environment().IsTraining())
{
expAvgFactor = 0;
blendFactor = 1.0;
// determine the factors from the time constants
double expAvgFactor = ComputeExpAvgFactor(); // weight for the new MB statistics in the running estimate. The previous value of the running statistics is kept with weight (1-this)
double blendFactor = ComputeBlendFactor(); // interpolation weight for the running statistics (the current MB statistics are weighted with 1-this)
m_saveMean->Resize(0, 0);
m_saveInvStdDev->Resize(0, 0);
}
else
{
double numSamples = (double)GetMBLayout()->GetActualNumSamples();
if (m_normTimeConst > 0)
{
// Convert to per-minibatch factor. Treat positivie infinity as if running mean/var parameters are "frozen"
// that is, do not require updates.
expAvgFactor = !isfinite(m_normTimeConst) ? 0 : (1.0 - exp(-numSamples / m_normTimeConst));
}
else
{
// REVIEW alexeyk: hack, m_normTimeConst < 0 is used to compute CMA.
expAvgFactor = (m_normTimeConst < 0) ? (1.0 / (1.0 + m_mbCount)) : 1.0;
}
if (!isfinite(m_blendTimeConst))
blendFactor = 1.0;
else
blendFactor = m_blendTimeConst > 0 ? (m_blendTimeConst / (m_blendTimeConst + numSamples)) : 0;
m_saveMean->Resize(runMean);
m_saveInvStdDev->Resize(runMean);
}
m_bnEng->Forward(sliceInputValue, scale, bias, expAvgFactor, blendFactor, runMean, runInvStdDev,
sliceOutputValue, m_epsilon, *m_saveMean, *m_saveInvStdDev);
m_bnEng->Forward(/*in=*/ sliceInputValue, scale, bias, // (in)
expAvgFactor, blendFactor,
runMean, runInvStdDev, // (in/out) running estimates, updated from the current MB mean/stddev
/*out=*/ sliceOutputValue, // (out) batch-normalized output value
m_epsilon,
*m_saveMean, *m_saveInvStdDev); // (out) actual interpolated mean/stddev values. Note: unused/empty for blendFactor==1 for CNTK engine
m_mbCount++;
}
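// Editorial sketch (not in the original source) of what the engine's Forward() is expected to compute, per the comments above:
//   runMean  <- expAvgFactor * mbMean + (1 - expAvgFactor) * runMean     // running-estimate update
//   usedMean <- blendFactor * runMean + (1 - blendFactor) * mbMean       // statistics actually used (likewise for stddev)
//   y        <- scale .* (x - usedMean) .* usedInvStdDev + bias          // per-element normalization
// saveMean/saveInvStdDev return the "used" statistics so that backprop can reuse them (empty if only running stats were used).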
@ -1820,25 +1855,25 @@ public:
void RequestMatricesBeforeForwardProp(MatrixPool& matrixPool) override
{
Base::RequestMatricesBeforeForwardProp(matrixPool);
RequestMatrixFromPool(m_saveMean, matrixPool);
RequestMatrixFromPool(m_saveInvStdDev, matrixPool);
}
RequestMatrixFromPool(m_saveMean, matrixPool);
RequestMatrixFromPool(m_saveInvStdDev, matrixPool);
}
void RequestMatricesBeforeBackprop(MatrixPool& matrixPool) override
{
Base::RequestMatricesBeforeBackprop(matrixPool);
RequestMatrixFromPool(m_dScale, matrixPool);
RequestMatrixFromPool(m_dBias, matrixPool);
}
RequestMatrixFromPool(m_dScale, matrixPool);
RequestMatrixFromPool(m_dBias, matrixPool);
}
void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool) override
{
Base::ReleaseMatricesAfterBackprop(matrixPool);
ReleaseMatrixToPool(m_saveMean, matrixPool);
ReleaseMatrixToPool(m_saveInvStdDev, matrixPool);
ReleaseMatrixToPool(m_dScale, matrixPool);
ReleaseMatrixToPool(m_dBias, matrixPool);
}
ReleaseMatrixToPool(m_saveMean, matrixPool);
ReleaseMatrixToPool(m_saveInvStdDev, matrixPool);
ReleaseMatrixToPool(m_dScale, matrixPool);
ReleaseMatrixToPool(m_dBias, matrixPool);
}
void SetNormalizationTimeConstants(double normalizationTimeConstant, double prevNormalizationTimeConstant,
double blendTimeConstant, double prevBlendTimeConstant)
@ -1851,6 +1886,20 @@ public:
m_blendTimeConst = blendTimeConstant;
}
// called from CloneFunction(..., parameters="constant")
// Once called, this node is put into inference mode.
virtual void FreezeParameters() override // from IFreezable
{
m_normTimeConst = std::numeric_limits<double>::infinity();
m_blendTimeConst = std::numeric_limits<double>::infinity();
}
double NormalizationTimeConstant() const { return m_normTimeConst; }
double BlendTimeConstant() const { return m_blendTimeConst; }
bool Spatial() const { return m_spatial; }
double Epsilon() const { return m_epsilon; }
bool UseCNTKEngine() const { return m_useCntkEngine; }
private:
// Old versioning - do not use. Do not remove until we're sure there are no old models around.
struct VersionInfo
@ -1865,36 +1914,51 @@ private:
VersionInfo m_version;
private:
// --- configuration parameters
// Determines whether to use per-activation (used after non-convolutional layers like fully connected)
// or spatial (used after convolutional layers).
// TODO: This should not be a config option, but rather inferred from dimensions of the Parameters.
bool m_spatial;
// Time constant for running mean and variance.
// Time constant for estimating the running mean and variance.
// This is the time constant of a low-pass filter.
// If 0, running mean and variance just remember the last minibatch.
// If infinity, running mean and variance are not updated, like in inference mode.
double m_normTimeConst;
// Time constant for blending running mean/var and current minibatch mean/var.
// The main idea is to represent current minibatch statistics as MAP estimate, linear interpolation
// of smoothed and minibatch statistics.
// Equivalent sample count for blending running mean/var and current minibatch mean/var.
// Roughly, this specifies how many samples the running statistics are "worth",
// relative to the current minibatch statistics.
// If 0, only use the current MB statistics. If infinity, use only the running mean, like in inference mode.
// The main idea is to estimate the mean/variance as a MAP estimate using the running mean/var as a prior.
// This should make the method more robust to the case of very small minibatches,
// and also provides a meaningful interpretation of inference mode, where only the prior is used.
// Effectively, this ends up in a linear interpolation of running and minibatch statistics.
// The idea is due to Frank Seide et al.
// It should also work well in data parallelism scenario
// as opposed to plain vanilla BN implementation which would require aggregation of statistics
// from all nodes.
// It should also work well in a data-parallelism scenario, as opposed to a plain vanilla BN implementation,
// which would require aggregation of statistics from all nodes.
// REVIEW alexeyk: if this works, document it properly in Wiki.
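// (Editorial note: with n = #samples in the minibatch, the blended estimate is
//    (m_blendTimeConst * runningStat + n * mbStat) / (m_blendTimeConst + n),
//  which equals blendFactor * runningStat + (1 - blendFactor) * mbStat with
//  blendFactor = m_blendTimeConst / (m_blendTimeConst + n), i.e. a MAP estimate with the running
//  statistics acting as a prior worth m_blendTimeConst samples.)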
double m_blendTimeConst;
// Epsilon used to compute inverse std deviation.
double m_epsilon;
// Whether to use CNTK or cuDNN BN implementation.
bool m_useCntkEngine;
// Layout (e.g. CHW).
ImageLayoutKind m_imageLayoutKind;
// --- working variables
// Minibatch count, used to compute cumulative moving average.
size_t m_mbCount;
// Stores pre-computed on forward pass mean values that are used in gradient computation.
// Interpolated actual mean/stddev values. Pre-computed on forward pass, also used in gradient computation.
shared_ptr<Matrix<ElemType>> m_saveMean;
// Stores pre-computed on forward pass InvStdDev values that are used in gradient computation.
shared_ptr<Matrix<ElemType>> m_saveInvStdDev;
// Stores scale derivatives
// Temp buffer for scale and bias derivatives. Only used in BackpropTo(), carrying info from first call to subsequent calls.
// Not used for blendFactor=1 in CNTK engine.
shared_ptr<Matrix<ElemType>> m_dScale;
// Stores bias derivatives.
shared_ptr<Matrix<ElemType>> m_dBias;
std::unique_ptr<BatchNormEngine<ElemType>> m_bnEng;

View file

@ -321,15 +321,17 @@ void CNTKEvalExtended<ElemType>::ForwardPassT(const std::vector<ValueBuffer<Elem
RuntimeError("Expected %d outputs, but got %d.", (int)m_outputNodes.size(), (int)outputs.size());
size_t i = 0;
for (auto& input : m_inputMatrices)
for (auto& inputNode : m_inputNodes)
{
// const cast: The matrix class takes this over without copying and could theoretically change the contents,
// though it doesn't in this case.
auto& buffer = const_cast<ValueBuffer<ElemType, ValueContainer>&>(inputs[i]);
shared_ptr<Matrix<ElemType>> matrix = dynamic_pointer_cast<Matrix<ElemType>>(input.second.matrix);
auto matrix = dynamic_pointer_cast<Matrix<ElemType>>(inputNode->ValuePtr());
auto type = matrix->GetMatrixType();
size_t numRows = input.second.sampleLayout.GetNumElements();
size_t numRows = inputNode->GetSampleLayout().GetNumElements();
if (buffer.m_buffer.data() == nullptr)
RuntimeError("Input %ls: Buffer is not allocated.", m_inputNodes[i]->GetName().c_str());
if (type == MatrixType::DENSE)
{
if (buffer.m_buffer.size() % numRows != 0)
@ -340,8 +342,12 @@ void CNTKEvalExtended<ElemType>::ForwardPassT(const std::vector<ValueBuffer<Elem
}
else if (type == MatrixType::SPARSE)
{
if (buffer.m_colIndices.data() == nullptr)
RuntimeError("Input %ls: Due to sparse input format, expected colIndices array, but was nullptr.", m_inputNodes[i]->GetName().c_str());
if (buffer.m_indices.data() == nullptr)
RuntimeError("Input %ls: Due to sparse input format, expected Indices array, but was nullptr.", m_inputNodes[i]->GetName().c_str());
if (buffer.m_colIndices.size() < 2)
RuntimeError("Input %ls: Expected at least one element.", m_inputNodes[i]->GetName().c_str());
RuntimeError("Input %ls: Expected at least one element (2 entries in colIndices array).", m_inputNodes[i]->GetName().c_str());
if (buffer.m_colIndices[0] != 0)
RuntimeError("Input %ls: First element of column indices must be 0", m_inputNodes[i]->GetName().c_str());
if (buffer.m_colIndices[buffer.m_colIndices.size() - 1] != buffer.m_indices.size())
@ -352,8 +358,8 @@ void CNTKEvalExtended<ElemType>::ForwardPassT(const std::vector<ValueBuffer<Elem
int numCols = type == MatrixType::DENSE ? buffer.m_buffer.size() / numRows : buffer.m_colIndices.size() - 1;
assert(numCols >= 1);
input.second.pMBLayout->Init(1, numCols);
input.second.pMBLayout->AddSequence(0, 0, 0, numCols);
inputNode->GetMBLayout()->Init(1, numCols);
inputNode->GetMBLayout()->AddSequence(0, 0, 0, numCols);
if (type == MatrixType::DENSE)
matrix->SetValue(numRows, numCols, matrix->GetDeviceId(), buffer.m_buffer.data(), matrixFlagNormal);
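// Editorial illustration (not part of this commit) of the buffer layout the validation above expects,
// for one input with numRows = 3; the field names are the ones used in this function:
//   dense:  m_buffer     = { 1, 2, 3,  4, 5, 6 }   // numCols = m_buffer.size() / numRows = 2
//   sparse (CSC), same 3x2 content with two nonzeros (1 at row 0/col 0, 6 at row 2/col 1):
//           m_buffer     = { 1, 6 }                // nonzero values
//           m_indices    = { 0, 2 }                // row index of each nonzero
//           m_colIndices = { 0, 1, 2 }             // numCols + 1 entries; starts at 0, last equals m_indices.size()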

View file

@ -14,6 +14,11 @@
#include <msclr\marshal_cppstd.h>
#include "CNTKException.h"
#pragma warning(push)
#pragma warning(disable : 4793) // Function compiled as native
#include "Basics.h"
#include "ScriptableObjects.h"
#pragma warning(pop)
#include "EvalCommon.h"
#include "Eval.h"
@ -250,7 +255,14 @@ public:
outputNodeNames.push_back(context.marshal_as<std::wstring>(output));
}
m_eval->StartForwardEvaluation(outputNodeNames);
try
{
m_eval->StartForwardEvaluation(outputNodeNames);
}
catch (const exception& ex)
{
throw GetCustomException(ex);
}
}
//
@ -354,6 +366,11 @@ private:
{
return gcnew CNTKBadAllocException(gcnew System::String(ex.what()));
}
else if (dynamic_cast<const ScriptableObjects::ScriptingException*>(&ex) != nullptr) // Includes derived classes
{
const auto& err = dynamic_cast<const ScriptableObjects::ScriptingException&>(ex);
return gcnew CNTKLogicErrorException(gcnew System::String(wstrprintf(L"%ls\n%ls", utf16(err.what()).c_str(), err.GetError(L"").c_str()).c_str()), nullptr);
}
else
{
return gcnew CNTKException(gcnew System::String(ex.what()));

View file

@ -56,6 +56,8 @@
</ClCompile>
<Link>
<AdditionalLibraryDirectories>$(OutDir)</AdditionalLibraryDirectories>
<AdditionalDependencies>EvalDLL.lib;Common.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<DelayLoadDLLs>EvalDll.dll</DelayLoadDLLs>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(DebugBuild)">
@ -66,10 +68,6 @@
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<DelayLoadDLLs>
</DelayLoadDLLs>
<AdditionalDependencies Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;EvalDLL.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies Condition="'$(Configuration)|$(Platform)'=='Debug_CpuOnly|x64'">kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;EvalDLL.lib;%(AdditionalDependencies)</AdditionalDependencies>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(ReleaseBuild)">
@ -79,10 +77,6 @@
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<DelayLoadDLLs>
</DelayLoadDLLs>
<AdditionalDependencies Condition="'$(Configuration)|$(Platform)'=='Release|x64'">kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;EvalDLL.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies Condition="'$(Configuration)|$(Platform)'=='Release_CpuOnly|x64'">kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;EvalDLL.lib;%(AdditionalDependencies)</AdditionalDependencies>
</Link>
</ItemDefinitionGroup>
<ItemGroup>

View file

@ -25,8 +25,6 @@ void BatchNormEngine<ElemType>::Forward(const Mat& in, const Mat& scale, const M
assert(m_inOutT.GetNumElements() == bias.GetNumRows());
assert(m_inOutT.GetNumElements() == runMean.GetNumRows());
assert(m_inOutT.GetNumElements() == runInvStdDev.GetNumRows());
assert(saveMean.GetNumElements() == 0 || m_inOutT.GetNumElements() == saveMean.GetNumRows());
assert(saveInvStdDev.GetNumElements() == 0 || m_inOutT.GetNumElements() == saveInvStdDev.GetNumRows());
}
else
{
@ -34,26 +32,35 @@ void BatchNormEngine<ElemType>::Forward(const Mat& in, const Mat& scale, const M
assert((m_inOutT.GetNumElements() % bias.GetNumRows()) == 0);
assert((m_inOutT.GetNumElements() % runMean.GetNumRows()) == 0);
assert((m_inOutT.GetNumElements() % runInvStdDev.GetNumRows()) == 0);
assert(saveMean.GetNumElements() == 0 || (m_inOutT.GetNumElements() % saveMean.GetNumRows()) == 0);
assert(saveInvStdDev.GetNumElements() == 0 || (m_inOutT.GetNumElements() % saveInvStdDev.GetNumRows()) == 0);
}
assert(scale.GetNumCols() == 1);
assert(bias.GetNumCols() == 1);
assert(runMean.GetNumCols() == 1);
assert(runInvStdDev.GetNumCols() == 1);
assert(saveMean.GetNumElements() == 0 || saveMean.GetNumCols() == 1);
assert(saveInvStdDev.GetNumElements() == 0 || saveInvStdDev.GetNumCols() == 1);
EnsureCompatible();
ForwardCore(in, scale, bias, expAvgFactor, blendFactor, runMean, runInvStdDev, out, epsilon, saveMean, saveInvStdDev);
if (!m_spatial)
{
assert(saveMean.GetNumElements() == 0 || m_inOutT.GetNumElements() == saveMean.GetNumRows());
assert(saveInvStdDev.GetNumElements() == 0 || m_inOutT.GetNumElements() == saveInvStdDev.GetNumRows());
}
else
{
assert(saveMean.GetNumElements() == 0 || (m_inOutT.GetNumElements() % saveMean.GetNumRows()) == 0);
assert(saveInvStdDev.GetNumElements() == 0 || (m_inOutT.GetNumElements() % saveInvStdDev.GetNumRows()) == 0);
}
assert(saveMean.GetNumElements() == 0 || saveMean.GetNumCols() == 1);
assert(saveInvStdDev.GetNumElements() == 0 || saveInvStdDev.GetNumCols() == 1);
}
template <class ElemType>
void BatchNormEngine<ElemType>::Backward(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale,
void BatchNormEngine<ElemType>::Backward(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor,
const Mat& saveMean, const Mat& saveInvStdDev, Mat& scaleGrad, Mat& biasGrad)
{
EnsureCompatible();
BackwardCore(in, srcGrad, grad, scale, saveMean, saveInvStdDev, scaleGrad, biasGrad);
BackwardCore(in, srcGrad, grad, scale, blendFactor, saveMean, saveInvStdDev, scaleGrad, biasGrad);
}
template <class ElemType>
@ -88,10 +95,10 @@ protected:
in.BatchNormalizationForward(scale, bias, expAvgFactor, blendFactor, runMean, runInvStdDev, out, epsilon, saveMean, saveInvStdDev);
}
void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, const Mat& saveMean, const Mat& saveInvStdDev,
void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, const Mat& saveMean, const Mat& saveInvStdDev,
Mat& scaleGrad, Mat& biasGrad) override
{
srcGrad.BatchNormalizationBackward(in, grad, scale, saveMean, saveInvStdDev, scaleGrad, biasGrad);
srcGrad.BatchNormalizationBackward(in, grad, scale, blendFactor, saveMean, saveInvStdDev, scaleGrad, biasGrad);
}
};
@ -128,4 +135,4 @@ std::unique_ptr<BatchNormEngine<ElemType>> BatchNormEngine<ElemType>::Create(DEV
template class BatchNormEngine<float>;
template class BatchNormEngine<double>;
} } }
}}}

View file

@ -37,7 +37,7 @@ public:
void Forward(const Mat& in, const Mat& scale, const Mat& bias, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runInvStdDev,
Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev);
void Backward(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, const Mat& saveMean, const Mat& saveInvStdDev,
void Backward(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, const Mat& saveMean, const Mat& saveInvStdDev,
Mat& scaleGrad, Mat& biasGrad);
static std::unique_ptr<BatchNormEngine<ElemType>> Create(DEVICEID_TYPE deviceId, const TensorShape& inOutT,
@ -55,10 +55,11 @@ protected:
virtual void EnsureCompatible() = 0;
// saveMean/saveInvStdDev return the actual mean/stddev used for normalization, except for blendFactor=1, these are unused and untouched
virtual void ForwardCore(const Mat& in, const Mat& scale, const Mat& bias, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runInvStdDev,
Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev) = 0;
virtual void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, const Mat& saveMean, const Mat& saveInvStdDev,
virtual void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, const Mat& saveMean, const Mat& saveInvStdDev,
Mat& scaleGrad, Mat& biasGrad) = 0;
protected:
@ -70,4 +71,4 @@ protected:
#pragma warning(pop)
} } }
}}}

View file

@ -9,6 +9,7 @@
#include <emmintrin.h>
#include <tmmintrin.h>
#include <immintrin.h>
#include <smmintrin.h>
#include <assert.h>
#include <cstdint>
#include <iostream>

View file

@ -32,8 +32,10 @@
#include <vld.h>
#endif
#pragma warning(disable : 4100) // unreferenced formal parameter; "struct TensorOpReduction<ElemType, OPFN, typename ReductionOp, N, -1>" triggers this
#pragma warning(disable : 4127) // conditional expression is constant; "if (sizeof(ElemType)==sizeof(float))" triggers this
#pragma warning(disable : 4702) // unreachable code; triggered for unknown reasons
#pragma warning(disable : 4244) // conversion from 'double' to 'float', possible loss of data
#pragma warning(disable : 4702) // unreachable code; triggered for unknown reasons
#ifdef USE_ACML
// Download ACML 5.3.1 (e.g., acml5.3.1-ifort64.exe) or above
@ -4418,13 +4420,16 @@ void CPUMatrix<ElemType>::BatchNormalizationForward(const CPUMatrix<ElemType>& s
CPUMatrix<ElemType>& runMean, CPUMatrix<ElemType>& runInvStdDev, CPUMatrix<ElemType>& out, double epsilon,
CPUMatrix<ElemType>& saveMean, CPUMatrix<ElemType>& saveInvStdDev) const
{
UNUSED(epsilon); UNUSED(saveMean); UNUSED(saveInvStdDev);
UNUSED(epsilon);
assert((GetNumRows() % scale.GetNumRows()) == 0);
if (expAvgFactor != 0 || blendFactor != 1)
RuntimeError("Batch normalization training on CPU is not yet implemented.");
saveMean.Resize(0, 0); // only doing inference: these two are not produced
saveInvStdDev.Resize(0, 0);
bool spatial = GetNumRows() != scale.GetNumRows();
if (spatial)
{
@ -4453,10 +4458,11 @@ void CPUMatrix<ElemType>::BatchNormalizationForward(const CPUMatrix<ElemType>& s
}
template <class ElemType>
void CPUMatrix<ElemType>::BatchNormalizationBackward(const CPUMatrix<ElemType>& in, CPUMatrix<ElemType>& grad, const CPUMatrix<ElemType>& scale, const CPUMatrix<ElemType>& saveMean, const CPUMatrix<ElemType>& saveInvStdDev,
void CPUMatrix<ElemType>::BatchNormalizationBackward(const CPUMatrix<ElemType>& in, CPUMatrix<ElemType>& grad, const CPUMatrix<ElemType>& scale, double blendFactor,
const CPUMatrix<ElemType>& saveMean, const CPUMatrix<ElemType>& saveInvStdDev,
CPUMatrix<ElemType>& scaleGrad, CPUMatrix<ElemType>& biasGrad) const
{
UNUSED(in); UNUSED(grad); UNUSED(scale); UNUSED(saveMean); UNUSED(saveInvStdDev); UNUSED(scaleGrad); UNUSED(biasGrad);
UNUSED(in); UNUSED(grad); UNUSED(scale); UNUSED(blendFactor); UNUSED(saveMean); UNUSED(saveInvStdDev); UNUSED(scaleGrad); UNUSED(biasGrad);
RuntimeError("Batch normalization training on CPU is not yet implemented.");
}
@ -6042,35 +6048,38 @@ int CPUMatrix<ElemType>::SetNumThreads(int numThreads)
// perform loop over reduction index m
// This function is declared inside a wrapper struct to allow partial specialization (m = -1).
template <class ElemType, typename OPFN, size_t N, int m>
template <class ElemType, typename OPFN, typename ReductionOp, size_t N, int m>
struct TensorOpReduction
{
// reduction case (non-reduction case is specialized)
static inline ElemType Loop(array<ElemType*, N> pointers, const OPFN& opfn,
static inline ElemType Loop(array<ElemType*, N> pointers, const OPFN& opfn, const ReductionOp& reductionOp,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides)
{
array<ptrdiff_t, N - 1> strides; // N-1 because last one is the result pointer, which is unused in reduction
for (size_t i = 0; i < N - 1; i++) // N = a small constant, this will be unrolled
strides[i] = reducingStrides[i][(size_t) m];
double /*ElemType*/ aggregate = 0;
for (size_t dim = reducingOpDims[(size_t) m]; dim-- > 0;)
double aggregate = TensorOpReduction<ElemType, OPFN, ReductionOp, N, m - 1>::Loop(pointers, opfn, reductionOp, reducingOpDims, reducingStrides);
for (size_t dim = reducingOpDims[(size_t)m] - 1; dim-- > 0;)
{
// need to descend into one loop deeper
aggregate += TensorOpReduction<ElemType, OPFN, N, m - 1>::Loop(pointers, opfn, reducingOpDims, reducingStrides);
// advance the pointers
for (size_t i = 0; i < N - 1; i++)
pointers[i] += strides[i]; // note: last pointer (result) is unused and untouched here
// need to descend into one loop deeper
aggregate = reductionOp(aggregate, TensorOpReduction<ElemType, OPFN, ReductionOp, N, m - 1>::Loop(pointers, opfn, reductionOp, reducingOpDims, reducingStrides));
}
return (ElemType) aggregate;
// Actually it would be nicer to return double, but we keep ElemType so that tests don't return different numbers than the previous implementation.
return static_cast<ElemType>(aggregate);
}
};
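// Editorial illustration (not part of this commit): the recursion above is the strided, N-operand generalization
// of the following scalar reduction over a single dimension. 'ReduceOneDimExample' is a hypothetical helper that
// simplifies the real signature (here opfn takes a single pointer instead of an array of pointers); 'reductionOp'
// is any associative pairwise op (sum, log-sum, min, max). Assumes dim >= 1, like the rewritten loop above.
template <class ElemType, typename OPFN, typename ReductionOp>
static ElemType ReduceOneDimExample(const ElemType* p, ptrdiff_t stride, size_t dim, const OPFN& opfn, const ReductionOp& reductionOp)
{
    double aggregate = opfn(p);                                    // seed the aggregate with the first element
    for (size_t i = 1; i < dim; i++)                               // fold the remaining elements pairwise
        aggregate = reductionOp(aggregate, opfn(p + i * stride));
    return static_cast<ElemType>(aggregate);
}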
// perform loop over reduction index m
// This is the specialized version for m = -1, which terminates the recursion.
template <class ElemType, typename OPFN, size_t N>
struct TensorOpReduction<ElemType, OPFN, N, -1>
template <class ElemType, typename OPFN, typename ReductionOp, size_t N>
struct TensorOpReduction<ElemType, OPFN, ReductionOp, N, -1>
{
static inline ElemType Loop(array<ElemType*, N> pointers, const OPFN& opfn,
static inline ElemType Loop(array<ElemType*, N> pointers, const OPFN& opfn, const ReductionOp& reductionOp,
const SmallVector<size_t>&, const array<SmallVector<ptrdiff_t>, N>&)
{
return opfn(pointers); // finally we are doing some work!!!
@ -6082,10 +6091,10 @@ struct TensorOpReduction<ElemType, OPFN, N, -1>
// -----------------------------------------------------------------------
// perform loop over regular index k and reducing index m for N operands (counting the output)
template <class ElemType, typename OPFN, size_t N, bool vectorizable, int m, int k>
template <class ElemType, typename OPFN, typename ReductionOp, size_t N, bool vectorizable, int m, int k>
struct TensorOpIteration
{
static inline void Loop(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, const OPFN& opfn,
static inline void Loop(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, const OPFN& opfn, const ReductionOp& reductionOp,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, N>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides)
{
@ -6096,7 +6105,7 @@ struct TensorOpIteration
for (size_t dim = regularOpDims[(size_t) k]; dim-- > 0;)
{
// need to descend into one loop deeper
TensorOpIteration<ElemType, OPFN, N, vectorizable, m, k - 1>::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
TensorOpIteration<ElemType, OPFN, ReductionOp, N, vectorizable, m, k - 1>::Loop(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
// advance the pointers
for (size_t i = 0; i < N; i++)
pointers[i] += strides[i];
@ -6106,10 +6115,10 @@ struct TensorOpIteration
// Special version for innermost loop with strides all being 1 and no further reduction. Compiler can use SSE.
// This is a very common case, e.g. adding vectors or computing the Sigmoid.
template <class ElemType, typename OPFN>
struct TensorOpIteration<ElemType, OPFN, 3, true /*vectorizable*/, -1 /*no reduction*/, 0 /*innermost loop*/>
template <class ElemType, typename OPFN, typename ReductionOp>
struct TensorOpIteration<ElemType, OPFN, ReductionOp, 3, true /*vectorizable*/, -1 /*no reduction*/, 0 /*innermost loop*/>
{
static inline void Loop(ElemType beta, array<ElemType*, 3> pointers, ElemType alpha, const OPFN& opfn,
static inline void Loop(ElemType beta, array<ElemType*, 3> pointers, ElemType alpha, const OPFN& opfn, const ReductionOp& reductionOp,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 3>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 3>& reducingStrides)
{
@ -6121,25 +6130,25 @@ struct TensorOpIteration<ElemType, OPFN, 3, true /*vectorizable*/, -1 /*no reduc
if (beta != 0)
#pragma omp parallel for
for (int k = 0; k < (int) K; k++)
TensorOpIteration<ElemType, OPFN, 3, true /*vectorizable*/, -1 /*no reduction*/, -1 /*scalar*/>::Loop(beta, array<ElemType*, 3>{pa + k, pb + k, pc + k}, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
TensorOpIteration<ElemType, OPFN, ReductionOp, 3, true /*vectorizable*/, -1 /*no reduction*/, -1 /*scalar*/>::Loop(beta, array<ElemType*, 3>{pa + k, pb + k, pc + k}, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
else if (alpha != 1)
#pragma omp parallel for
for (int k = 0; k < (int) K; k++)
TensorOpIteration<ElemType, OPFN, 3, true /*vectorizable*/, -1 /*no reduction*/, -1 /*scalar*/>::Loop(0, array<ElemType*, 3>{pa + k, pb + k, pc + k}, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
TensorOpIteration<ElemType, OPFN, ReductionOp, 3, true /*vectorizable*/, -1 /*no reduction*/, -1 /*scalar*/>::Loop(0, array<ElemType*, 3>{pa + k, pb + k, pc + k}, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
else
#pragma omp parallel for
for (int k = 0; k < (int) K; k++)
TensorOpIteration<ElemType, OPFN, 3, true /*vectorizable*/, -1 /*no reduction*/, -1 /*scalar*/>::Loop(0, array<ElemType*, 3>{pa + k, pb + k, pc + k}, 1, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
TensorOpIteration<ElemType, OPFN, ReductionOp, 3, true /*vectorizable*/, -1 /*no reduction*/, -1 /*scalar*/>::Loop(0, array<ElemType*, 3>{pa + k, pb + k, pc + k}, 1, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
// TODO: According to Amit, the VS compiler is not able to vectorize into lambdas. Solution: change the lambda to take an N, or to implement the loop inside (with 1 element by default).
// TODO: The signedness of k (required for omp) causes an extra sign-extend.
// TODO: OMP adds LOTS of overhead. Do we need a guard, a min size when to use it?
}
};
// and unary
template <class ElemType, typename OPFN>
struct TensorOpIteration<ElemType, OPFN, 2, true /*vectorizable*/, -1 /*no reduction*/, 0 /*innermost loop*/>
template <class ElemType, typename OPFN, typename ReductionOp>
struct TensorOpIteration<ElemType, OPFN, ReductionOp, 2, true /*vectorizable*/, -1 /*no reduction*/, 0 /*innermost loop*/>
{
static inline void Loop(ElemType beta, array<ElemType*, 2> pointers, ElemType alpha, const OPFN& opfn,
static inline void Loop(ElemType beta, array<ElemType*, 2> pointers, ElemType alpha, const OPFN& opfn, const ReductionOp& reductionOp,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides)
{
@ -6150,27 +6159,27 @@ struct TensorOpIteration<ElemType, OPFN, 2, true /*vectorizable*/, -1 /*no reduc
if (beta != 0)
#pragma omp parallel for
for (int k = 0; k < (int) K; k++)
TensorOpIteration<ElemType, OPFN, 2, true /*vectorizable*/, -1 /*no reduction*/, -1 /*scalar*/>::Loop(beta, array<ElemType*, 2>{pa + k, pb + k}, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
TensorOpIteration<ElemType, OPFN, ReductionOp, 2, true /*vectorizable*/, -1 /*no reduction*/, -1 /*scalar*/>::Loop(beta, array<ElemType*, 2>{pa + k, pb + k}, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
else if (alpha != 1)
#pragma omp parallel for
for (int k = 0; k < (int) K; k++)
TensorOpIteration<ElemType, OPFN, 2, true /*vectorizable*/, -1 /*no reduction*/, -1 /*scalar*/>::Loop(0, array<ElemType*, 2>{pa + k, pb + k}, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
TensorOpIteration<ElemType, OPFN, ReductionOp, 2, true /*vectorizable*/, -1 /*no reduction*/, -1 /*scalar*/>::Loop(0, array<ElemType*, 2>{pa + k, pb + k}, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
else
#pragma omp parallel for
for (int k = 0; k < (int) K; k++)
TensorOpIteration<ElemType, OPFN, 2, true /*vectorizable*/, -1 /*no reduction*/, -1 /*scalar*/>::Loop(0, array<ElemType*, 2>{pa + k, pb + k}, 1, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
TensorOpIteration<ElemType, OPFN, ReductionOp, 2, true /*vectorizable*/, -1 /*no reduction*/, -1 /*scalar*/>::Loop(0, array<ElemType*, 2>{pa + k, pb + k}, 1, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
}
};
template <class ElemType, typename OPFN, size_t N, bool vectorizable, int m>
struct TensorOpIteration<ElemType, OPFN, N, vectorizable, m, -1>
template <class ElemType, typename OPFN, typename ReductionOp, size_t N, bool vectorizable, int m>
struct TensorOpIteration<ElemType, OPFN, ReductionOp, N, vectorizable, m, -1>
{
static inline void Loop(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, const OPFN& opfn,
static inline void Loop(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, const OPFN& opfn, const ReductionOp& reductionOp,
const SmallVector<size_t>&, const array<SmallVector<ptrdiff_t>, N>&,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides)
{
// we are at element level for the result: perform the op (there may still be reduction)
ElemType val = TensorOpReduction<ElemType, OPFN, N, m>::Loop(pointers, opfn, reducingOpDims, reducingStrides);
ElemType val = TensorOpReduction<ElemType, OPFN, ReductionOp, N, m>::Loop(pointers, opfn, reductionOp, reducingOpDims, reducingStrides);
// scale
val *= alpha;
// combine with previous value in target matrix, then write it out
@ -6188,8 +6197,8 @@ struct TensorOpIteration<ElemType, OPFN, N, vectorizable, m, -1>
// -----------------------------------------------------------------------
// tensor operation with k+1 dimensions (-1 means scalar)
template <class ElemType, typename OPFN, size_t N, int k>
static void TensorOpWithRegularLoop(ElemType beta, const array<ElemType*, N>& pointers, ElemType alpha, const OPFN& opfn,
template <class ElemType, typename OPFN, typename ReductionOp, size_t N, int k>
static void TensorOpWithRegularLoop(ElemType beta, const array<ElemType*, N>& pointers, ElemType alpha, const OPFN& opfn, ReductionOp reductionOp,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, N>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides)
{
@ -6197,9 +6206,9 @@ static void TensorOpWithRegularLoop(ElemType beta, const array<ElemType*, N>& po
switch (dims)
{
case 2:
return TensorOpIteration<ElemType, OPFN, N, false /*vectorizable*/, 1, k>::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
return TensorOpIteration<ElemType, OPFN, ReductionOp, N, false /*vectorizable*/, 1, k>::Loop(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 1:
return TensorOpIteration<ElemType, OPFN, N, false /*vectorizable*/, 0, k>::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
return TensorOpIteration<ElemType, OPFN, ReductionOp, N, false /*vectorizable*/, 0, k>::Loop(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 0:
{
// if all leading dimensions are 1, we can let the compiler do some unrolling
@ -6207,9 +6216,9 @@ static void TensorOpWithRegularLoop(ElemType beta, const array<ElemType*, N>& po
for (size_t i = 0; i < N; i++)
leadingAllOne &= k >= 0 && regularStrides[i][0] == 1;
if (leadingAllOne) // special version that uses a hard-coded increment of 1 for all leading dimensions
return TensorOpIteration<ElemType, OPFN, N, true /*vectorizable*/, -1, k>::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
return TensorOpIteration<ElemType, OPFN, ReductionOp, N, true /*vectorizable*/, -1, k>::Loop(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
else
return TensorOpIteration<ElemType, OPFN, N, false /*vectorizable*/, -1, k>::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
return TensorOpIteration<ElemType, OPFN, ReductionOp, N, false /*vectorizable*/, -1, k>::Loop(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
}
default:
LogicError("TensorOp: %d non-flattened reduction dimensions are not supported.", (int) dims);
@ -6218,11 +6227,11 @@ static void TensorOpWithRegularLoop(ElemType beta, const array<ElemType*, N>& po
// tensor operation, generalized in number of arguments, operation already provided as a lambda
// This function now expands into different k.
template <class ElemType, typename OPFN, size_t N>
static void TensorOpWithFn(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, const OPFN& opfn,
const array<size_t, N>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, N>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides)
template <class ElemType, typename OPFN, typename ReductionOp, size_t N>
static void TensorOpWithFnAndReduction(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, const OPFN& opfn, const ReductionOp& reductionOp,
const array<size_t, N>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, N>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides)
{
for (size_t i = 0; i < N; i++) // N = a small constant, this will be unrolled
pointers[i] += offsets[i];
@ -6230,17 +6239,50 @@ static void TensorOpWithFn(ElemType beta, array<ElemType*, N> pointers, ElemType
switch (dims)
{
case 4:
return TensorOpWithRegularLoop<ElemType, OPFN, N, 3>(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
return TensorOpWithRegularLoop<ElemType, OPFN, ReductionOp, N, 3>(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 3:
return TensorOpWithRegularLoop<ElemType, OPFN, N, 2>(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
return TensorOpWithRegularLoop<ElemType, OPFN, ReductionOp, N, 2>(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 2:
return TensorOpWithRegularLoop<ElemType, OPFN, N, 1>(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
return TensorOpWithRegularLoop<ElemType, OPFN, ReductionOp, N, 1>(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 1:
return TensorOpWithRegularLoop<ElemType, OPFN, N, 0>(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
return TensorOpWithRegularLoop<ElemType, OPFN, ReductionOp, N, 0>(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 0:
return TensorOpWithRegularLoop<ElemType, OPFN, N, -1>(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
return TensorOpWithRegularLoop<ElemType, OPFN, ReductionOp, N, -1>(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
default:
LogicError("TensorOp: %d non-flattened input dimensions are not supported.", (int) dims);
LogicError("TensorOp: %d non-flattened input dimensions are not supported.", (int)dims);
}
}
// tensor operation, generalized in number of arguments, operation already provided as a lambda
// This function now expands into different reductionOps
template <class ElemType, typename OPFN, size_t N>
static void TensorOpWithFn(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, const OPFN& opfn, ElementWiseOperator reductionOp,
const array<size_t, N>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, N>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides)
{
// BUGBUG: Always using 'double' as the aggregator type, even for ElemType==float. Reason: otherwise some e2e tests would fail, as historically we
// used double as the aggregator for sum. But:
// * For min and max reductions this is meaningless.
// * It is not consistent with what we do on the GPU, where we aggregate on ElemType.
// * It costs performance.
// TODO: adapt e2e tests to run with an aggregator of type ElemType.
#define CaseTensorOpWithFnAndReduction(oper) \
case ElementWiseOperator::op##oper: \
return TensorOpWithFnAndReduction(beta, pointers, alpha, opfn, [](double a, double b) \
{ \
return Op##oper(a, b); \
}, \
offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides)
switch (reductionOp)
{
CaseTensorOpWithFnAndReduction(Sum);
CaseTensorOpWithFnAndReduction(LogSum);
CaseTensorOpWithFnAndReduction(Min);
CaseTensorOpWithFnAndReduction(Max);
default:
LogicError("Specified ElementWiseOperator op %d not suported as reduction operation.", (int)reductionOp);
}
}
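// Editorial illustration (not part of this commit): the pairwise reduction ops dispatched above are assumed to
// have roughly the following shape (the real definitions are CNTK's Op* functions and may differ in detail,
// e.g. in how log-sum handles -infinity):
//   OpSum(a, b)    -> a + b
//   OpMin(a, b)    -> min(a, b)
//   OpMax(a, b)    -> max(a, b)
//   OpLogSum(a, b) -> log(exp(a) + exp(b)), computed stably as max(a, b) + log1p(exp(-|a - b|))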
@ -6256,8 +6298,11 @@ void CPUMatrix<ElemType>::TensorOp(ElemType beta, const CPUMatrix<ElemType>& a,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides)
{
if (reductionOp != ElementWiseOperator::opSum) // TODO: enable the reduction ops
InvalidArgument("TensorOp: Unary reduction operations other than opSum not yet implemented.");
if (reductionOp != ElementWiseOperator::opSum &&
reductionOp != ElementWiseOperator::opLogSum &&
reductionOp != ElementWiseOperator::opMin &&
reductionOp != ElementWiseOperator::opMax)
InvalidArgument("TensorOp: Unary reduction operations other than opMax, opMin, opSum, and opLogSum are not implemented.");
// TODO: Change the lambda to take a pointer and a number of elements, so that we can pass it 1 or 4 elements, in order for it to SSE-vectorize.
#define CaseUnaryTensorOp(oper) \
@ -6266,7 +6311,7 @@ void CPUMatrix<ElemType>::TensorOp(ElemType beta, const CPUMatrix<ElemType>& a,
{ \
return Op##oper((*(pp[0]))); \
}, \
offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides)
reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides)
array<ElemType*, 2> pointers = {a.Data(), Data()};
switch (op)
@ -6294,7 +6339,7 @@ void CPUMatrix<ElemType>::TensorOp(ElemType beta, const CPUMatrix<ElemType>& a,
{ \
return Op##oper((*(pp[0])), (*(pp[1]))); \
}, \
offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides)
reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides)
array<ElemType*, 3> pointers = {a.Data(), b.Data(), Data()};
switch (op)
@ -6322,7 +6367,7 @@ void CPUMatrix<ElemType>::TensorOp(ElemType beta, const CPUMatrix<ElemType>& a,
{ \
return Op##oper((*(pp[0])), (*(pp[1])), (*(pp[2]))); \
}, \
offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides)
reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides)
array<ElemType*, 4> pointers = {a.Data(), b.Data(), c.Data(), Data()};
switch (op)
@ -6359,11 +6404,33 @@ template void CPUMatrix<char>::SetValue(CPUMatrix<char> const&);
template void CPUMatrix<char>::RequireSize(const size_t numRows, const size_t numCols, bool growOnly);
template void CPUMatrix<char>::Resize(const size_t numRows, const size_t numCols, bool growOnly);
template char* CPUMatrix<char>::CopyToArray(void) const;
template void CPUMatrix<char>::CopySection(size_t numRows, size_t numCols, char* dst, size_t colStride) const;
template void CPUMatrix<char>::Reshape(const size_t, const size_t);
// Support <short>
template CPUMatrix<short>::CPUMatrix(const size_t numRows, const size_t numCols);
template CPUMatrix<short>::CPUMatrix(const size_t numRows, const size_t numCols, short* pArray, const size_t matrixFlags);
template CPUMatrix<short>::CPUMatrix();
template CPUMatrix<short>::CPUMatrix(CPUMatrix<short> const&);
template CPUMatrix<short>::CPUMatrix(CPUMatrix<short>&&);
template size_t CPUMatrix<short>::LocateElement(size_t, size_t) const;
template CPUMatrix<short>::~CPUMatrix();
template CPUMatrix<short> CPUMatrix<short>::ColumnSlice(size_t startColumn, size_t numCols) const;
template CPUMatrix<short>& CPUMatrix<short>::operator=(CPUMatrix<short>&&);
template void CPUMatrix<short>::SetValue(const short);
template void CPUMatrix<short>::SetValue(const size_t numRows, const size_t numCols, short* pArray, size_t matrixFlags);
template void CPUMatrix<short>::SetValue(CPUMatrix<short> const&);
//template void CPUMatrix<short>::SetValue(GPUMatrix<short> const&);
//template void CPUMatrix<short>::SetValue(CPUSparseMatrix<short> const&);
//template void CPUMatrix<short>::SetValue(GPUSparseMatrix<short> const&);
template void CPUMatrix<short>::RequireSize(const size_t numRows, const size_t numCols, bool growOnly);
template void CPUMatrix<short>::Resize(const size_t numRows, const size_t numCols, bool growOnly);
template short* CPUMatrix<short>::CopyToArray(void) const;
template void CPUMatrix<short>::CopySection(size_t numRows, size_t numCols, short* dst, size_t colStride) const;
template void CPUMatrix<short>::Reshape(const size_t, const size_t);
template CPUMatrix<int>::CPUMatrix(const size_t, const size_t, int*, const size_t);
template CPUMatrix<int>::~CPUMatrix();
}}}

View file

@ -375,7 +375,7 @@ public:
void BatchNormalizationForward(const CPUMatrix<ElemType>& scale, const CPUMatrix<ElemType>& bias, double expAvgFactor, double blendFactor, CPUMatrix<ElemType>& runMean, CPUMatrix<ElemType>& runInvStdDev,
CPUMatrix<ElemType>& out, double epsilon, CPUMatrix<ElemType>& saveMean, CPUMatrix<ElemType>& saveInvStdDev) const;
void BatchNormalizationBackward(const CPUMatrix<ElemType>& in, CPUMatrix<ElemType>& grad, const CPUMatrix<ElemType>& scale, const CPUMatrix<ElemType>& saveMean, const CPUMatrix<ElemType>& saveInvStdDev,
void BatchNormalizationBackward(const CPUMatrix<ElemType>& in, CPUMatrix<ElemType>& grad, const CPUMatrix<ElemType>& scale, double blendFactor, const CPUMatrix<ElemType>& saveMean, const CPUMatrix<ElemType>& saveInvStdDev,
CPUMatrix<ElemType>& scaleGrad, CPUMatrix<ElemType>& biasGrad) const;
public:

View file

@ -781,6 +781,7 @@ void CPUSparseMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPU
}
}
// TODO: Implement CSR as a transposition of b, like we do for GPU.
if (rhs.GetFormat() != matrixFormatSparseCSC)
NOT_IMPLEMENTED;
@ -820,13 +821,42 @@ void CPUSparseMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPU
}
}
}
// the transposeA case is copy-paste from above with rows/cols of lhs swapped
else if (transposeA && !transposeB)
{
NOT_IMPLEMENTED;
for (size_t j = 0; j < rhs.GetNumCols(); j++)
{
size_t start = rhs.SecondaryIndexLocation()[j]; // ColLocation
size_t end = rhs.SecondaryIndexLocation()[j + 1];
for (size_t p = start; p < end; p++)
{
size_t i = rhs.MajorIndexLocation()[p]; // RowLocation
ElemType val = rhs.Buffer()[p];
for (size_t h = 0; h < lhs.GetNumCols(); h++)
{
c(h, j) += alpha * lhs(i, h) * val;
}
}
}
}
else
else if (transposeA && transposeB)
{
NOT_IMPLEMENTED;
for (size_t j = 0; j < rhs.GetNumCols(); j++)
{
size_t start = rhs.SecondaryIndexLocation()[j];
size_t end = rhs.SecondaryIndexLocation()[j + 1];
for (size_t p = start; p < end; p++)
{
size_t i = rhs.MajorIndexLocation()[p];
ElemType val = rhs.Buffer()[p];
for (size_t h = 0; h < lhs.GetNumCols(); h++)
{
c(h, i) += alpha * lhs(j, h) * val;
}
}
}
}
}
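// Editorial note (not part of this commit): both new branches above rely on the CSC layout of 'rhs':
// SecondaryIndexLocation() holds the column pointers (GetNumCols() + 1 entries), so the nonzeros of column j
// live at positions [SecondaryIndexLocation()[j], SecondaryIndexLocation()[j + 1]) of Buffer(), and
// MajorIndexLocation()[p] gives the row index of the nonzero stored at position p.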
@ -1475,6 +1505,29 @@ template CPUMatrix<char> CPUSparseMatrix<char>::CopyColumnSliceToDense(size_t st
template void CPUSparseMatrix<char>::AssignColumnSliceToDense(CPUMatrix<char>&, size_t startColumn, size_t numCols) const;
template CPUSparseMatrix<char>& CPUSparseMatrix<char>::operator=(const CPUSparseMatrix<char>& deepCopyFrom);
// Support <short>
template CPUSparseMatrix<short>::CPUSparseMatrix(const MatrixFormat format, const size_t numRows, const size_t numCols, const size_t size);
template CPUSparseMatrix<short>::CPUSparseMatrix(MatrixFormat);
template CPUSparseMatrix<short>::CPUSparseMatrix(CPUSparseMatrix<short> const&);
template CPUSparseMatrix<short>::CPUSparseMatrix(CPUSparseMatrix<short>&&);
template CPUSparseMatrix<short>& CPUSparseMatrix<short>::operator=(CPUSparseMatrix<short>&& moveFrom);
template void CPUSparseMatrix<short>::SetValue(size_t, size_t, short);
//template void CPUSparseMatrix<short>::SetValue(CPUMatrix<short> const&);
//template void CPUSparseMatrix<short>::SetValue(GPUMatrix<short> const&);
template void CPUSparseMatrix<short>::SetValue(CPUSparseMatrix<short> const&);
//template void CPUSparseMatrix<short>::SetValue(GPUSparseMatrix<short> const&);
template short* CPUSparseMatrix<short>::Data() const;
template short* CPUSparseMatrix<short>::Data();
template void CPUSparseMatrix<short>::Reset(void);
template void CPUSparseMatrix<short>::Resize(const size_t, const size_t, const size_t, const bool);
template void CPUSparseMatrix<short>::RequireSizeAndAllocate(const size_t, const size_t, const size_t, const bool, bool);
template void CPUSparseMatrix<short>::RequireSizeAndAllocate(const size_t, const size_t, const size_t, const MatrixFormat, const bool, bool);
template CPUSparseMatrix<short>::~CPUSparseMatrix();
template CPUSparseMatrix<short> CPUSparseMatrix<short>::ColumnSlice(size_t startColumn, size_t numCols) const;
template CPUMatrix<short> CPUSparseMatrix<short>::CopyColumnSliceToDense(size_t startColumn, size_t numCols) const;
template void CPUSparseMatrix<short>::AssignColumnSliceToDense(CPUMatrix<short>&, size_t startColumn, size_t numCols) const;
template CPUSparseMatrix<short>& CPUSparseMatrix<short>::operator=(const CPUSparseMatrix<short>& deepCopyFrom);
template CPUSparseMatrix<int>::CPUSparseMatrix(const MatrixFormat, const size_t, const size_t, const size_t);
template CPUSparseMatrix<int>::~CPUSparseMatrix();

View file

@ -110,11 +110,11 @@ __device__ __forceinline__ T Shuffle(T input, int srcLane)
{
#ifdef __CUDA_ARCH__
// shfl is supported only on Kepler+
static_assert(__CUDA_ARCH__ >= 300, "CNTK only supports only Kepler GPU architecture or newer");
static_assert(__CUDA_ARCH__ >= 300, "CNTK only supports the Kepler GPU architecture or newer.");
return cub::ShuffleIndex(input, srcLane);
#else
assert(false);
return input;
return input; // keep compiler happy
#endif
}
@ -163,8 +163,12 @@ void Call(size_t vectorSize, Targs... args)
// As a result, each block has 2 * blockDim.x (mean and inverse stddev) values to write at the end.
//
template <int BlockDimX, int BlockDimY, int U, typename ElemType>
__global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize, const ElemType* x, double expAvgFactor, ElemType* runMean, ElemType* runInvStdDev,
double epsilon, ElemType* xMean, ElemType* xInvStdDev)
__global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize,
const ElemType* x, // (in) input data
double expAvgFactor,
ElemType* runMean, ElemType* runInvStdDev, // (in/out) running mean/stddev, gets updated with current minibatch
double epsilon,
ElemType* xMean, ElemType* xInvStdDev) // (out) this minibatch's mean and inverse stddev
{
static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32).");
@ -182,9 +186,12 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize, con
return;
assert(irowSrcBase + U <= vectorSize);
// --- estimate this minibatch's mean/stddev
// first estimate mean over all data for this thread
int n = 0;
ElemType mean[U];
ElemType m2[U];
ElemType mean[U]; // this thread's part of the mean vector (stored as a normalized mean also during accumulation)
ElemType m2[U]; // likewise for the stddev: the running sum of squared deviations, converted to inverse stddev at the end
#pragma unroll
for (int k = 0; k < U; k++)
{
@ -207,12 +214,13 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize, con
ElemType d = curVal[k] - mean[k];
// REVIEW alexeyk: we enabled fast CUDA math in CNTK so division below will be approximate, is this a problem?
// Using precise math slows down the code by about 40%.
mean[k] += d / n;
mean[k] += d / n; // mean_n = [mean_{n-1} * (n-1) + curVal] / n = mean_{n-1} * n/n - mean_{n-1} / n + curVal / n
m2[k] += d * (curVal[k] - mean[k]);
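// (Editorial note: this pair of updates is Welford's online algorithm; after the loop, m2[k] / n is the
//  biased variance estimate for the samples this thread has seen so far.)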
}
psrc += vectorSize * BlockDimY;
}
// now reduce minibatch mean/stddev across threads
const int tid = threadIdx.y * BlockDimX + threadIdx.x;
const int laneId = tid & 0x1f;
// First, reduce within warp using shuffle.
@ -259,6 +267,8 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize, con
}
__syncthreads();
// --- final reduction and update of running mean/stddev
// Accumulate and write final results.
// REVIEW alexeyk: see if atomicAdd can be used instead, do perf comparison.
if (threadIdx.y == 0)
@ -283,7 +293,10 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize, con
size_t idxDstBase = (blockIdx.x * BlockDimX + threadIdx.x) * U;
// Store mean and running mean.
StoreValues<U>(mean, xMean + idxDstBase);
if (expAvgFactor == 1)
// at this point, minibatch mean has been saved into xMean[]
// accumulate running mean
if (expAvgFactor == 1) // 100% comes from current minibatch, nothing from history
StoreValues<U>(mean, runMean + idxDstBase);
else
{
@ -294,6 +307,8 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize, con
run[k] = expAvgFactor * mean[k] + (1.0 - expAvgFactor) * run[k];
StoreValues<U>(run, runMean + idxDstBase);
}
// at this point, runMean[] has been updated
// Store inv std dev and its running version.
#pragma unroll
for (int k = 0; k < U; k++)
@ -301,6 +316,8 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize, con
m2[k] = Operations::RSqrt(static_cast<ElemType>(m2[k] / batchSize + epsilon));
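// (Editorial note: this converts the accumulated sum of squared deviations into the inverse stddev,
//  1 / sqrt(variance + epsilon), which is what gets stored and also folded into the running estimate below.)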
}
StoreValues<U>(m2, xInvStdDev + idxDstBase);
// at this point, minibatch stddev has been saved into xInvStdDev[]
if (expAvgFactor == 1)
StoreValues<U>(m2, runInvStdDev + idxDstBase);
else
@ -312,6 +329,7 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize, con
run[k] = expAvgFactor * m2[k] + (1.0 - expAvgFactor) * run[k];
StoreValues<U>(run, runInvStdDev + idxDstBase);
}
// at this point, runInvStdDev[] has been updated
}
}
@ -467,8 +485,13 @@ template <int U>
struct ComputeBatchMeanAndInvStdDev
{
template <typename ElemType>
static void Call(size_t vectorSize, size_t batchSize, const ElemType* x, double expAvgFactor, ElemType* runMean, ElemType* runInvStdDev,
double epsilon, ElemType* xMean, ElemType* xInvStdDev, cudaStream_t stream)
static void Call(size_t vectorSize, size_t batchSize,
const ElemType* x, // (in) input data
double expAvgFactor,
ElemType* runMean, ElemType* runInvStdDev, // (in/out) running mean/stddev, gets updated with current minibatch
double epsilon,
ElemType* xMean, ElemType* xInvStdDev, // (out) actual interpolated mean/stddev that are used to normalize. Returned since needed in backprop.
cudaStream_t stream)
{
assert((vectorSize % U) == 0);
@ -594,8 +617,11 @@ template <int U>
struct NormalizeBatchTraining
{
template <typename ElemType>
static void Call(size_t vectorSize, size_t spatialSize, size_t batchSize, bool spatial, const ElemType* x, ElemType* y,
const ElemType* bnScale, const ElemType* bnBias, const ElemType* batchMean, const ElemType* batchInvStdDev, cudaStream_t stream)
static void Call(size_t vectorSize, size_t spatialSize, size_t batchSize, bool spatial,
const ElemType* x, ElemType* y, // (in, out) data to normalize -> normalized data
const ElemType* bnScale, const ElemType* bnBias, // (in) scale/bias to denormalize with
const ElemType* batchMean, const ElemType* batchInvStdDev, // (in) actual mean/stddev to normalize with
cudaStream_t stream)
{
assert((vectorSize % U) == 0);
@ -839,7 +865,7 @@ struct ComputeSpatialScaleAndBiasGradients
{
template <typename ElemType>
static void Call(size_t vectorSize, size_t spatialSize, size_t batchSize, const ElemType* x, const ElemType* dy,
ElemType* dScale, ElemType* dBias, const ElemType* saveMean, const ElemType* saveInvStdDev, cudaStream_t stream)
ElemType* dScale, ElemType* dBias, const ElemType* saveMean, const ElemType* saveInvStdDev, cudaStream_t stream)
{
assert((spatialSize % U) == 0);
assert((vectorSize % spatialSize) == 0);
@ -854,9 +880,10 @@ struct ComputeSpatialScaleAndBiasGradients
}
};
// mbStatsWeight is the weight with which current MB's stats were used (0 means not at all, locked model).
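// (callers pass mbStatsWeight = 1 - blendFactor; see GPUMatrix<ElemType>::BatchNormalizationBackward() in GPUMatrix.cu)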
template <int BlockDimX, int BlockDimY, bool Spatial, int U, typename ElemType>
__global__ void kBackpropagateBatchNormGradients(int vectorSize, int spatialSize, int batchSize, const ElemType* x, const ElemType* dy, ElemType* dx,
const ElemType* bnScale, const ElemType* dScale, const ElemType* dBias,
const ElemType* bnScale, ElemType mbStatsWeight, const ElemType* dScale, const ElemType* dBias,
const ElemType* saveMean, const ElemType* saveInvStdDev)
{
static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
@ -917,18 +944,29 @@ __global__ void kBackpropagateBatchNormGradients(int vectorSize, int spatialSize
LoadValues<U>(pdy, dyCur);
LoadValues<U>(pdx, dxCur);
// From the BN paper, dL/dxi is a sum of three terms: dL/dxi = t1 + t2 + t3
// After simplification, they become the following:
// 1. t1 = scale * dL/dyi * invStdDev
// 2. t2 = (-scale / m) * invStdDev * xHat * dL/dScale
// 3. t3 = (-scale / m) * invStdDev * dL/dBias (for this one note that Sum(xHat) == 0)
// The formulas for dBias and dScale happen to occur as subexpressions in this gradient as well.
// Leveraging this, this gradient can be simplified to:
// t1 = scale * dL/dyi * invStdDev
// t2 = mbStatsWeight * (-scale / m) * invStdDev * xHat * dL/dScale
// t3 = mbStatsWeight * (-scale / m) * invStdDev * dL/dBias (for this one note that Sum(xHat) == 0)
// with
// dBias = Reduce(dy)
// dScale = Reduce(dy * xHat)
// Simplifying this a bit more, we get the formula below.
ElemType val[U];
int m = Spatial ? batchSize * spatialSize : batchSize;
#pragma unroll
for (int k = 0; k < U; k++)
{
ElemType xNorm = (xCur[k] - mean[k]) * invStdDev[k];
val[k] = dxCur[k] + (scale[k] * invStdDev[k]) * (dyCur[k] - (xNorm * ds[k] + db[k]) / m);
ElemType xNorm = (xCur[k] - mean[k]) * invStdDev[k]; // xHat
// scale * invStdDev * (
// dL/dyi
// - mbStatsWeight * (xHat * dL/dScale + dL/dBias) / m
// )
val[k] = dxCur[k] // (adding to gradient)
+ (scale[k] * invStdDev[k]) * (
dyCur[k]
- mbStatsWeight * (xNorm * ds[k] + db[k]) / m);
}
StoreValues<U>(val, pdx);
}
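// For reference, a compact sketch of how the three terms above collapse (same notation as the comment):
//   dBias  = Sum_i dL/dyi
//   dScale = Sum_i xHat_i * dL/dyi
//   dL/dxi += scale * invStdDev * (dL/dyi - mbStatsWeight * (xHat_i * dScale + dBias) / m)
// For mbStatsWeight == 1 this is the textbook BN gradient; for mbStatsWeight == 0 (locked model) the statistics are
// treated as constants and only the first term survives.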
@ -939,25 +977,26 @@ struct BackpropagateBatchNormGradients
{
template <typename ElemType>
static void Call(size_t vectorSize, size_t spatialSize, size_t batchSize, bool spatial, const ElemType* x, const ElemType* dy, ElemType* dx,
const ElemType* bnScale, const ElemType* dScale, const ElemType* dBias, const ElemType* saveMean, const ElemType* saveInvStdDev, cudaStream_t stream)
const ElemType* bnScale, ElemType mbStatsWeight, const ElemType* dScale,
const ElemType* dBias, const ElemType* saveMean, const ElemType* saveInvStdDev, cudaStream_t stream)
{
assert((vectorSize % U) == 0);
const int BlockDimX = 32 / U;
const int BlockDimY = 4 * U;
auto bdim = dim3(BlockDimX, BlockDimY);
auto gdim = dim3(static_cast<unsigned int>(RoundUpToMultiple(vectorSize, BlockDimX * U)),
static_cast<unsigned int>(RoundUpToMultiple(batchSize, BlockDimY)));
static_cast<unsigned int>(RoundUpToMultiple(batchSize, BlockDimY)));
if (spatial)
{
kBackpropagateBatchNormGradients<BlockDimX, BlockDimY, true, U><<<gdim, bdim, 0, stream>>>(
static_cast<int>(vectorSize), static_cast<int>(spatialSize), static_cast<int>(batchSize), x, dy, dx, bnScale, dScale, dBias, saveMean, saveInvStdDev);
kBackpropagateBatchNormGradients<BlockDimX, BlockDimY, true/*spatial*/, U><<<gdim, bdim, 0, stream>>>(
static_cast<int>(vectorSize), static_cast<int>(spatialSize), static_cast<int>(batchSize), x, dy, dx, bnScale, mbStatsWeight, dScale, dBias, saveMean, saveInvStdDev);
}
else
{
kBackpropagateBatchNormGradients<BlockDimX, BlockDimY, false, U><<<gdim, bdim, 0, stream>>>(
static_cast<int>(vectorSize), static_cast<int>(spatialSize), static_cast<int>(batchSize), x, dy, dx, bnScale, dScale, dBias, saveMean, saveInvStdDev);
kBackpropagateBatchNormGradients<BlockDimX, BlockDimY, false/*not spatial*/, U><<<gdim, bdim, 0, stream>>>(
static_cast<int>(vectorSize), static_cast<int>(spatialSize), static_cast<int>(batchSize), x, dy, dx, bnScale, mbStatsWeight, dScale, dBias, saveMean, saveInvStdDev);
}
}
};
} } }
}}}

Просмотреть файл

@ -96,7 +96,8 @@ enum ElementWiseOperator
opCond /*a ? b : c*/,
opClip, /*clip a within interval b..c*/
opElementwiseProductWithLogSumDerivative,
opCopyIfEqual
opCopyIfEqual,
opElementwiseProductWithExpOfDiff, /* a * exp(b - c) */
// Note: not all that's implemented in CNTK ComputationNodes has an opcode yet.
};
@ -157,7 +158,8 @@ enum ElementWiseOperator
Macro(Cond); \
Macro(CopyIfEqual); \
Macro(Clip); \
Macro(ElementwiseProductWithLogSumDerivative);
Macro(ElementwiseProductWithLogSumDerivative); \
Macro(ElementwiseProductWithExpOfDiff);
// -----------------------------------------------------------------------
// various enums to describe

Просмотреть файл

@ -53,32 +53,37 @@ protected:
cudnnBatchNormMode_t mode = m_spatial ? CUDNN_BATCHNORM_SPATIAL : CUDNN_BATCHNORM_PER_ACTIVATION;
// cuDNN will fail with BAD_PARAM if epsilon < CUDNN_BN_MIN_EPSILON.
epsilon = max(epsilon, CUDNN_BN_MIN_EPSILON);
// expAvgFactor == 0 && blendFactor == 1 means we are in eval mode.
// expAvgFactor == 0 && blendFactor == 1 means we are in inference mode.
if (expAvgFactor == 0 && blendFactor == 1)
{
saveMean.Resize(0, 0); // (these are not produced in this case)
saveInvStdDev.Resize(0, 0);
CUDNN_CALL(cudnnBatchNormalizationForwardInference(*m_cudnn, mode, &C::One, &C::Zero, m_inOutCuDnnT, ptr(in), m_inOutCuDnnT, ptr(out),
m_scaleBiasCuDnnT, ptr(scale), ptr(bias), ptr(runMean), ptr(runInvStdDev), epsilon));
m_scaleBiasCuDnnT, ptr(scale), ptr(bias), ptr(runMean), ptr(runInvStdDev), epsilon));
}
else
{
saveMean.Resize(runMean);
saveInvStdDev.Resize(runMean);
CUDNN_CALL(cudnnBatchNormalizationForwardTraining(*m_cudnn, mode, &C::One, &C::Zero, m_inOutCuDnnT, ptr(in),
m_inOutCuDnnT, ptr(out), m_scaleBiasCuDnnT, ptr(scale), ptr(bias), expAvgFactor, ptr(runMean), ptr(runInvStdDev),
epsilon, ptr(saveMean), ptr(saveInvStdDev)));
m_inOutCuDnnT, ptr(out), m_scaleBiasCuDnnT, ptr(scale), ptr(bias), expAvgFactor, ptr(runMean), ptr(runInvStdDev),
epsilon, ptr(saveMean), ptr(saveInvStdDev)));
}
}
void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, const Mat& saveMean, const Mat& saveInvStdDev,
void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, const Mat& saveMean, const Mat& saveInvStdDev,
Mat& scaleGrad, Mat& biasGrad) override
{
UNUSED(blendFactor); // BUGBUG: It should be used.
m_inOutCuDnnT.UpdateBatchSize(srcGrad.GetNumCols());
cudnnBatchNormMode_t mode = m_spatial ? CUDNN_BATCHNORM_SPATIAL : CUDNN_BATCHNORM_PER_ACTIVATION;
// REVIEW alexeyk: remove once Philly is upgraded to prod version. Also change betaParamDiff to 1 and update CNTK BN engine.
#if CUDNN_MAJOR >= 5 || (CUDNN_MAJOR == 4 && CUDNN_PATCHLEVEL >= 7)
CUDNN_CALL(cudnnBatchNormalizationBackward(*m_cudnn, mode, &C::One, &C::One, &C::One, &C::Zero, m_inOutCuDnnT, ptr(in), m_inOutCuDnnT, ptr(srcGrad), m_inOutCuDnnT, ptr(grad),
m_scaleBiasCuDnnT, ptr(scale), ptr(scaleGrad), ptr(biasGrad), CUDNN_BN_MIN_EPSILON, ptr(saveMean), ptr(saveInvStdDev)));
m_scaleBiasCuDnnT, ptr(scale), ptr(scaleGrad), ptr(biasGrad), CUDNN_BN_MIN_EPSILON, ptr(saveMean), ptr(saveInvStdDev)));
#else
CUDNN_CALL(cudnnBatchNormalizationBackward(*m_cudnn, mode, &C::One, &C::One, m_inOutCuDnnT, ptr(in), m_inOutCuDnnT, ptr(srcGrad), m_inOutCuDnnT, ptr(grad),
m_scaleBiasCuDnnT, ptr(scale), ptr(scaleGrad), ptr(biasGrad), CUDNN_BN_MIN_EPSILON, ptr(saveMean), ptr(saveInvStdDev)));
CUDNN_CALL(cudnnBatchNormalizationBackward(*m_cudnn, mode, &C::One, &C::One, m_inOutCuDnnT, ptr(in), m_inOutCuDnnT, ptr(srcGrad), m_inOutCuDnnT, ptr(grad),
m_scaleBiasCuDnnT, ptr(scale), ptr(scaleGrad), ptr(biasGrad), CUDNN_BN_MIN_EPSILON, ptr(saveMean), ptr(saveInvStdDev)));
#endif
}

Просмотреть файл

@ -116,6 +116,44 @@ const char* CudaErrString<curandStatus>(curandStatus)
namespace Microsoft { namespace MSR { namespace CNTK {
/*static*/ bool SyncGuard::s_isSyncEnabled = false;
/*static*/ void SyncGuard::EnableSync()
{
s_isSyncEnabled = true;
}
SyncGuard::SyncGuard(bool forceSync /*= false*/)
: m_forceSync(forceSync)
{
m_done = nullptr;
if (m_forceSync || s_isSyncEnabled)
{
CUDA_CALL(cudaGetLastError());
CUDA_CALL(cudaEventCreate(&m_done));
}
}
SyncGuard::~SyncGuard()
{
if (m_forceSync || s_isSyncEnabled)
{
// The regular use of this destructor is to synchronize the GPU, but also
// to check for errors. So this destructor is where CUDA errors would be thrown.
// If this destructor runs during stack unwinding, then a different error has
// already happened that should be reported; so we only clean up the resource.
if (std::uncaught_exception())
cudaEventDestroy(m_done);
else
{
// failures in a prior launch might be reported here
CUDA_CALL(cudaEventRecord(m_done));
CUDA_CALL(cudaEventSynchronize(m_done));
CUDA_CALL(cudaEventDestroy(m_done));
}
}
}
template <typename AllocatedElemType>
AllocatedElemType* TracingGPUMemoryAllocator::Allocate(int deviceId, size_t numRows, size_t numCols)
{
@ -1911,7 +1949,8 @@ void GPUMatrix<ElemType>::AssignNoiseContrastiveEstimation(const GPUMatrix<ElemT
while (p / 2 > width)
p = p / 2;
_computeNceOutput<ElemType><<<GetNumElements() / 2, p>>>(
// note: kernel has hard-coded dimension of 512
_computeNceOutputMax512Threads<ElemType><<<GetNumElements() / 2, p>>>(
Data(),
sampleCount,
m_numRows / 2,
@ -1925,7 +1964,8 @@ void GPUMatrix<ElemType>::AssignNoiseContrastiveEstimation(const GPUMatrix<ElemT
while (p / 2 > GetNumElements() / 2)
p = p / 2;
// summing up objective must be done in one block
_assignNoiseContrastiveEstimation<ElemType><<<1, p>>>(
// note: kernel has hard-coded dimension of 512
_assignNoiseContrastiveEstimationMax512Threads<ElemType><<<1, p>>>(
Data(),
sampleCount,
m_numRows / 2,
@ -1970,7 +2010,8 @@ void GPUMatrix<ElemType>::AssignSoftmaxSum(const GPUMatrix<ElemType>& a, GPUMatr
while (p / 2 > width)
p = p / 2;
_assignSoftmaxSum<ElemType><<<1, p>>>(
// note: kernel has hard-coded dimension of 512
_assignSoftmaxSumMax512Threads<ElemType><<<1, p>>>(
my_a.Data(),
width,
Data(),
@ -2046,7 +2087,8 @@ GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignLogSoftmaxOf(const GPUMatrix<Ele
CUDA_LONG N = (CUDA_LONG) GetNumCols();
CUDA_LONG M = (CUDA_LONG) GetNumRows();
SyncGuard syncGuard;
_assignColumnwiseLogSoftmaxOf<<<N, 512, 0, t_stream>>>(a.Data(), Data(), N, M);
// note: kernel uses hard-coded thread dimension
_assignColumnwiseLogSoftmaxOf512Threads<<<N, 512, 0, t_stream>>>(a.Data(), Data(), N, M);
}
else
{
@ -2072,7 +2114,8 @@ GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignHardmaxOf(const GPUMatrix<ElemTy
CUDA_LONG N = (CUDA_LONG) GetNumCols();
CUDA_LONG M = (CUDA_LONG) GetNumRows();
SyncGuard syncGuard;
_assignColumnwiseHardmaxOf<<<N, 512, 0, t_stream>>>(a.Data(), Data(), N, M);
// note: kernel uses hard-coded thread dimension
_assignColumnwiseHardmaxOf512Threads<<<N, 512, 0, t_stream>>>(a.Data(), Data(), N, M);
}
else
{
@ -2224,7 +2267,8 @@ ElemType GPUMatrix<ElemType>::SumOfElements() const
ElemType h_sum;
// WARNING: THIS kernel is not the most efficient way!
_reductionSum<ElemType><<<1, 1024, 0, t_stream>>>(Data(), d_sum, (CUDA_LONG) GetNumElements());
// note: kernel has hard-coded dimension of 1024
_reductionSum1024Threads<ElemType><<<1, 1024, 0, t_stream>>>(Data(), d_sum, (CUDA_LONG)GetNumElements());
CUDA_CALL(cudaMemcpy(&h_sum, d_sum, sizeof(ElemType), cudaMemcpyDeviceToHost));
TracingGPUMemoryAllocator::Free<ElemType>(GetComputeDeviceId(), d_sum);
return h_sum;
@ -2241,7 +2285,8 @@ GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignSumOfElements(const GPUMatrix<El
PrepareDevice();
SyncGuard syncGuard;
// WARNING: THIS kernel is not the most efficient way!
_reductionSumAndAssign<ElemType><<<1, 1024>>>(Data(), a.Data(), (CUDA_LONG) a.GetNumElements(), (CUDA_LONG) GetNumElements());
// note: kernel has hard-coded dimension of 1024
_reductionSumAndAssign1024Threads<ElemType><<<1, 1024>>>(Data(), a.Data(), (CUDA_LONG)a.GetNumElements(), (CUDA_LONG)GetNumElements());
return (*this);
}
@ -2253,7 +2298,8 @@ DeviceBoundNumber<ElemType> GPUMatrix<ElemType>::Sum_AsDeviceBoundNum() const
ElemType* d_sum = TracingGPUMemoryAllocator::Allocate<ElemType>(GetComputeDeviceId(), 1);
// WARNING: THIS kernel is not the most efficient way!
_reductionSum<ElemType><<<1, 1024, 0, t_stream>>>(Data(), d_sum, (CUDA_LONG) GetNumElements());
// note: kernel has hard-coded dimension of 1024
_reductionSum1024Threads<ElemType><<<1, 1024, 0, t_stream>>>(Data(), d_sum, (CUDA_LONG)GetNumElements());
DeviceBoundNumber<ElemType> result;
result.ShallowCopyFrom(d_sum, GetComputeDeviceId());
return result;
@ -2555,7 +2601,8 @@ ElemType GPUMatrix<ElemType>::FrobeniusNorm() const
ElemType h_sum = 0;
// WARNING: THIS kernel is not the most efficient way!
_reductionSum2<ElemType><<<1, 1024, 0, t_stream>>>(Data(), d_sum, (CUDA_LONG) GetNumElements(), true);
// note: kernel has hard-coded dimension of 1024
_reductionSum21024Threads<ElemType><<<1, 1024, 0, t_stream>>>(Data(), d_sum, (CUDA_LONG)GetNumElements(), true);
CUDA_CALL(cudaMemcpy(&h_sum, d_sum, sizeof(ElemType), cudaMemcpyDeviceToHost));
TracingGPUMemoryAllocator::Free<ElemType>(GetComputeDeviceId(), d_sum);
@ -2572,7 +2619,8 @@ GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignFrobeniusNormOf(const GPUMatrix<
PrepareDevice();
// WARNING: THIS kernel is not the most efficient way!
_reductionSum2<ElemType><<<1, 1024, 0, t_stream>>>(a.Data(), Data(), (CUDA_LONG) a.GetNumElements(), true);
// note: kernel has hard-coded dimension of 1024
_reductionSum21024Threads<ElemType><<<1, 1024, 0, t_stream>>>(a.Data(), Data(), (CUDA_LONG)a.GetNumElements(), true);
return *this;
}
@ -2581,13 +2629,14 @@ template <class ElemType>
ElemType GPUMatrix<ElemType>::MatrixNormInf() const
{
if (IsEmpty())
LogicError("MatrixNorm1: Matrix is empty.");
LogicError("MatrixNormInf: Matrix is empty.");
ElemType* d_maxAbs = TracingGPUMemoryAllocator::Allocate<ElemType>(GetComputeDeviceId(), 1);
ElemType h_maxAbs = 0;
// WARNING: THIS kernel is not the most efficient way!
_reductionMatrixNormInf<ElemType><<<1, 1024, 0, t_stream>>>(Data(), d_maxAbs, (CUDA_LONG) GetNumElements());
// note: kernel has hard-coded dimension of 1024
_reductionMatrixNormInf1024Threads<ElemType><<<1, 1024, 0, t_stream>>>(Data(), d_maxAbs, (CUDA_LONG)GetNumElements());
CUDA_CALL(cudaMemcpy(&h_maxAbs, d_maxAbs, sizeof(ElemType), cudaMemcpyDeviceToHost));
TracingGPUMemoryAllocator::Free<ElemType>(GetComputeDeviceId(), d_maxAbs);
return h_maxAbs;
@ -2610,7 +2659,8 @@ ElemType GPUMatrix<ElemType>::MatrixNorm0() const
ElemType* d_nz = TracingGPUMemoryAllocator::Allocate<ElemType>(GetComputeDeviceId(), 1);
ElemType h_nz = 0;
// WARNING: THIS kernel is not the most efficient way!
_reductionMatrixNorm0<ElemType><<<1, 1024, 0, t_stream>>>(Data(), d_nz, (CUDA_LONG) GetNumElements());
// note: kernel has hard-coded dimension of 1024
_reductionMatrixNorm01024Threads<ElemType><<<1, 1024, 0, t_stream>>>(Data(), d_nz, (CUDA_LONG)GetNumElements());
CUDA_CALL(cudaMemcpy(&h_nz, d_nz, sizeof(ElemType), cudaMemcpyDeviceToHost));
TracingGPUMemoryAllocator::Free<ElemType>(GetComputeDeviceId(), d_nz);
return h_nz;
@ -2667,7 +2717,8 @@ void GPUMatrix<ElemType>::VectorMax(GPUMatrix<ElemType>& maxIndexes, GPUMatrix<E
maxIndexes.RequireSize(1, n);
int blocksPerGrid = n; // we'll have 1 block processing 1 column
_vectorMaxMinReduce<ElemType, true><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(us.Data(), maxIndexes.Data(), maxValues.Data(), m, n);
// note: kernel has hard-coded dimension of 512
_vectorMaxMinReduce512Threads<ElemType, true><<<blocksPerGrid, 512, 0, t_stream>>>(us.Data(), maxIndexes.Data(), maxValues.Data(), m, n);
/*int blocksPerGrid=(int)ceil(1.0*n/GridDim::maxThreadsPerBlock);
_vectorMax<ElemType><<<blocksPerGrid,GridDim::maxThreadsPerBlock,0,t_stream>>>(us.Data(),maxIndexes.Data(),maxValues.Data(),m,n,isColWise);*/
@ -2793,7 +2844,8 @@ void GPUMatrix<ElemType>::VectorMin(GPUMatrix<ElemType>& minIndexes, GPUMatrix<E
minIndexes.RequireSize(1, n);
int blocksPerGrid = n; // we'll have 1 block processing 1 column
_vectorMaxMinReduce<ElemType, false><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(us.Data(), minIndexes.Data(), minValues.Data(), m, n);
// note: kernel has hard-coded dimension of 512
_vectorMaxMinReduce512Threads<ElemType, false><<<blocksPerGrid, 512, 0, t_stream>>>(us.Data(), minIndexes.Data(), minValues.Data(), m, n);
/*
int blocksPerGrid=(int)ceil(1.0*n/GridDim::maxThreadsPerBlock);
@ -2823,8 +2875,9 @@ GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignNumOfDiff(const GPUMatrix<ElemTy
if (!searchInCol)
{
// int blocksPerGrid=(int)ceil(1.0*a.GetNumElements()/GridDim::maxThreadsPerBlock);
// _assignNumOfDiff<ElemType><<<blocksPerGrid,GridDim::maxThreadsPerBlock,0,t_stream>>>(a.Data(), b.Data(), Data(), a.GetNumElements());
_assignNumOfDiff<ElemType><<<1, 1024, 0, t_stream>>>(a.Data(), b.Data(), Data(), (CUDA_LONG) a.GetNumElements());
// _assignNumOfDiff1024Threads<ElemType><<<blocksPerGrid,GridDim::maxThreadsPerBlock,0,t_stream>>>(a.Data(), b.Data(), Data(), a.GetNumElements());
// note: kernel has hard-coded dimension of 1024
_assignNumOfDiff1024Threads<ElemType><<<1, 1024, 0, t_stream>>>(a.Data(), b.Data(), Data(), (CUDA_LONG)a.GetNumElements());
}
else
{
@ -3107,6 +3160,7 @@ void GPUMatrix<ElemType>::AveragePoolingBackward(const GPUMatrix<int>& mpRowCol,
Data(), (int)GetNumRows(), grad.Data(), (int)grad.GetNumRows());
}
// returns saveMean/saveInvStdDev which are the actual values used to perform the normalization, except for blendFactor 1, in which case they are unused and set to empty
template <class ElemType>
void GPUMatrix<ElemType>::BatchNormalizationForward(const GPUMatrix<ElemType>& scale, const GPUMatrix<ElemType>& bias, double expAvgFactor, double blendFactor,
GPUMatrix<ElemType>& runMean, GPUMatrix<ElemType>& runInvStdDev, GPUMatrix<ElemType>& out, double epsilon,
@ -3122,10 +3176,13 @@ void GPUMatrix<ElemType>::BatchNormalizationForward(const GPUMatrix<ElemType>& s
assert(0 < vectorSize && vectorSize <= std::numeric_limits<int>::max());
assert(0 < batchSize && batchSize <= std::numeric_limits<int>::max());
// --- compute data mean/stddev (into saveMean/saveInvStdDev) and update running mean/stddev
SyncGuard syncGuard;
// If expAvgFactor == 0 && blendFactor == 1 then we don't need to compute current minibatch statistics.
if (expAvgFactor > 0 || blendFactor < 1)
{
saveMean.RequireSize(runMean);
saveInvStdDev.RequireSize(runMean);
if (spatial)
{
Call<ComputeSpatialBatchMeanAndInvStdDev, ElemType>(spatialSize, vectorSize, spatialSize, batchSize, Data(),
@ -3139,35 +3196,50 @@ void GPUMatrix<ElemType>::BatchNormalizationForward(const GPUMatrix<ElemType>& s
saveMean.Data(), saveInvStdDev.Data(), GetStream());
}
}
else // not computing new statistics
{
saveMean.RequireSize(0, 0);
saveInvStdDev.RequireSize(0, 0);
}
// --- apply MAP estimates of mean/stddev (interpolation of data and running mean/stddev) to data
// When:
// blendFactor == 1 - use running mean/var instead of the current minibatch mean/var.
// blendFactor == 1 - use running mean/var instead of the current minibatch mean/var. Note: saveMean/saveInvStdDev are NOT produced.
// 0 < blendFactor < 1 - blend running mean/var with mean/var of the current minibatch: saveMean = (1 - blendFactor) * saveMean + blendFactor * runMean
// blendFactor == 0 - use mean/var of the current minibatch.
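// e.g. blendFactor = 0.25: saveMean <- 0.75 * minibatch mean + 0.25 * runMean (and likewise for saveInvStdDev)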
if (blendFactor < 1)
{
// non-zero blendFactor: interpolate minibatch mean/stddev in-place with running mean/stddev
if (blendFactor > 0)
{
// REVIEW alexeyk: can be rolled into NormalizeBatchTraining to save bandwidth.
// TODO: add a 'beta' parameter to ScaleAndAdd()
Scale((ElemType)(1 - blendFactor), saveMean);
ScaleAndAdd((ElemType)blendFactor, runMean, saveMean);
ScaleAndAdd((ElemType)blendFactor, /*in*/ runMean, /*in/out*/ saveMean);
Scale((ElemType)(1 - blendFactor), saveInvStdDev);
ScaleAndAdd((ElemType)blendFactor, runInvStdDev, saveInvStdDev);
}
Call<NormalizeBatchTraining, ElemType>(spatial ? spatialSize : vectorSize, vectorSize, spatialSize, batchSize,
spatial, Data(), out.Data(), scale.Data(), bias.Data(),
saveMean.Data(), saveInvStdDev.Data(), GetStream());
// normalize
Call<NormalizeBatchTraining, ElemType>(spatial ? spatialSize : vectorSize, vectorSize, spatialSize, batchSize, spatial,
Data(), out.Data(), // (in, out) data to be normalized -> normalized data
scale.Data(), bias.Data(), // (in) scale/bias to denormalize with
/*(in)*/saveMean.Data(), saveInvStdDev.Data(), // (in) actual mean/stddev to normalize with
GetStream());
}
else
else // blendFactor == 1: use running mean/stddev only
{
Call<NormalizeBatchTraining, ElemType>(spatial ? spatialSize : vectorSize, vectorSize, spatialSize, batchSize,
spatial, Data(), out.Data(), scale.Data(), bias.Data(),
Call<NormalizeBatchTraining, ElemType>(spatial ? spatialSize : vectorSize, vectorSize, spatialSize, batchSize, spatial,
Data(), out.Data(),
scale.Data(), bias.Data(),
runMean.Data(), runInvStdDev.Data(), GetStream());
// CNTK engine returns saveMean and saveInvStdDev empty, but cuDNN engine does not.
}
}
// saveMean/saveInvStdDev are the interpolated mean/stddev as used in ForwardProp().
// For blendFactor=1, they are not used and can be uninitialized or empty.
template <class ElemType>
void GPUMatrix<ElemType>::BatchNormalizationBackward(const GPUMatrix<ElemType>& in, GPUMatrix<ElemType>& grad, const GPUMatrix<ElemType>& scale,
void GPUMatrix<ElemType>::BatchNormalizationBackward(const GPUMatrix<ElemType>& in, GPUMatrix<ElemType>& grad, const GPUMatrix<ElemType>& scale, double blendFactor,
const GPUMatrix<ElemType>& saveMean, const GPUMatrix<ElemType>& saveInvStdDev,
GPUMatrix<ElemType>& scaleGrad, GPUMatrix<ElemType>& biasGrad) const
{
@ -3192,8 +3264,9 @@ void GPUMatrix<ElemType>::BatchNormalizationBackward(const GPUMatrix<ElemType>&
Call<ComputeScaleAndBiasGradients, ElemType>(vectorSize, vectorSize, batchSize, in.Data(), Data(), scaleGrad.Data(), biasGrad.Data(),
saveMean.Data(), saveInvStdDev.Data(), GetStream());
}
ElemType mbStatsWeight = (ElemType)(1 - blendFactor); // weight for contribution from actual MB stats (0 if none, e.g. locked BN node)
Call<BackpropagateBatchNormGradients, ElemType>(spatial ? spatialSize : vectorSize, vectorSize, spatialSize, batchSize, spatial,
in.Data(), Data(), grad.Data(), scale.Data(), scaleGrad.Data(), biasGrad.Data(), saveMean.Data(), saveInvStdDev.Data(), GetStream());
in.Data(), Data(), grad.Data(), scale.Data(), mbStatsWeight, scaleGrad.Data(), biasGrad.Data(), saveMean.Data(), saveInvStdDev.Data(), GetStream());
}
#pragma region Static BLAS Functions
@ -3990,7 +4063,8 @@ ElemType GPUMatrix<ElemType>::GetLearnRateForBlock_Helper(const GPUMatrix<ElemTy
}
// d_res[0] should now contain inner product of matrices
// Compute squared Frobenius norms (squared sums of elements)
_lrHelper<ElemType><<<1, 512, 0, t_stream>>>(Gradients.Data(), SmoothedGradients.Data(), (CUDA_LONG) Gradients.GetNumElements(), d_res);
// note: kernel has hard-coded dimension of 512
_lrHelper512Threads<ElemType><<<1, 512, 0, t_stream>>>(Gradients.Data(), SmoothedGradients.Data(), (CUDA_LONG)Gradients.GetNumElements(), d_res);
ElemType res;
CUDA_CALL(cudaMemcpy(&res, d_res, sizeof(ElemType), cudaMemcpyDeviceToHost));
TracingGPUMemoryAllocator::Free<ElemType>(Gradients.GetComputeDeviceId(), d_res);
@ -4214,16 +4288,21 @@ void GPUMatrix<ElemType>::RCRFBackwardCompute(
ElemType* d_zeta = TracingGPUMemoryAllocator::Allocate<ElemType>(alpha.GetComputeDeviceId(), iNumLab);
CUDA_LONG N = iNumLab;
int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock);
// TODO: change all three '512' to 'GridDim::maxThreadsPerBlock' (not doing this now since I cannot test it)
int blocksPerGrid = (int) ceil(1.0 * N / 512);
size_t szMemSize;
for (int t = iNumPos - 1; t >= 0; t--)
{
szMemSize = sizeof(ElemType) * iNumLab;
_rcrfBackwardComputeZeta<ElemType><<<blocksPerGrid, GridDim::maxThreadsPerBlock, szMemSize>>>(t, iNumPos, alpha.Data(), d_zeta, pair_scores.Data(), iNumLab, shift);
// This function assumes iNumLab <= 1024 and that shared memory == total (!) number of threads == iNumLab.
assert(iNumLab <= 1024);
_rcrfBackwardComputeZetaMax1024Labels<ElemType><<<blocksPerGrid, 512, szMemSize>>>(t, iNumPos, alpha.Data(), d_zeta, pair_scores.Data(), iNumLab, shift);
szMemSize = iNumLab * 3;
szMemSize *= sizeof(ElemType);
_rcrfBackwardCompute<ElemType><<<blocksPerGrid, GridDim::maxThreadsPerBlock, szMemSize>>>(t, iNumPos, alpha.Data(), beta.Data(),
d_zeta, pair_scores.Data(), iNumLab, shift);
// This function assumes iNumLab <= 1024 and that shared memory == total (!) number of threads == 3 * iNumLab.
assert(iNumLab <= 1024);
_rcrfBackwardComputeMax1024Labels<ElemType><<<blocksPerGrid, 512, szMemSize>>>(t, iNumPos, alpha.Data(), beta.Data(),
d_zeta, pair_scores.Data(), iNumLab, shift);
}
/*
error = cudaGetErrorString(cudaPeekAtLastError());
@ -4255,16 +4334,22 @@ void GPUMatrix<ElemType>::RCRFTransGrdCompute(const GPUMatrix<ElemType>& lbls,
ElemType* d_zeta = TracingGPUMemoryAllocator::Allocate<ElemType>(alpha.GetComputeDeviceId(), iNumLab);
CUDA_LONG N = iNumLab;
int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock);
// TODO: change all three '512' to 'GridDim::maxThreadsPerBlock' (not doing this now since I cannot test it)
int blocksPerGrid = (int)ceil(1.0 * N / 512);
size_t szMemSize;
for (int t = 0; t < iNumPos; t++)
{
szMemSize = sizeof(ElemType) * iNumLab;
_rcrfTransGrdComputeZeta<ElemType><<<blocksPerGrid, GridDim::maxThreadsPerBlock, szMemSize>>>(t - 1, iNumPos, alpha.Data(), d_zeta, pair_scores.Data(), iNumLab, startLbl, shift);
// This function assumes iNumLab <= 1024 and that shared memory == total (!) number of threads == iNumLab.
assert(iNumLab <= 1024);
// BUGBUG: This is launched with 512 threads per block, but allocates shared mem as if there is only one block. Likewise for all 4 of these functions.
_rcrfTransGrdComputeZetaMax1024Labels<ElemType><<<blocksPerGrid, 512, szMemSize>>>(t - 1, iNumPos, alpha.Data(), d_zeta, pair_scores.Data(), iNumLab, startLbl, shift);
szMemSize = iNumLab * 3;
szMemSize *= sizeof(ElemType);
_rcrfTransGrdCompute<ElemType><<<blocksPerGrid, GridDim::maxThreadsPerBlock, szMemSize>>>(t, startLbl, alpha.Data(), beta.Data(),
d_zeta, pair_scores.Data(), lbls.Data(), grd.Data(), iNumPos, iNumLab, shift);
// This function assumes iNumLab <= 1024 and that shared memory == total (!) number of threads == iNumLab.
assert(iNumLab <= 1024);
_rcrfTransGrdComputeMax1024Labels<ElemType><<<blocksPerGrid, 512, szMemSize>>>(t, startLbl, alpha.Data(), beta.Data(),
d_zeta, pair_scores.Data(), lbls.Data(), grd.Data(), iNumPos, iNumLab, shift);
}
TracingGPUMemoryAllocator::Free<ElemType>(alpha.GetComputeDeviceId(), d_zeta);
};
@ -4278,11 +4363,16 @@ void GPUMatrix<ElemType>::RCRFTransGrdCompute(const GPUMatrix<ElemType>& lbls,
template <class ElemType>
static shared_ptr<GPUMatrix<ElemType>> GetOnesVector(size_t N, DEVICEID_TYPE deviceId)
{
// using an array of shared_ptrs because those are thread-safe. The objects themselves are immutable.
// And using a plain array so this will never get freed, avoiding free-after-DLL-unload issues.
static shared_ptr<GPUMatrix<ElemType>> onesCache[32]; // cache of objects
if (deviceId >= _countof(onesCache))
LogicError("GetOnesVector: onesCache[] too small (%d entries), increase (you need %d) and recompile.", (int) _countof(onesCache), (int) deviceId + 1);
// using a dynamically allocated array so this will never get freed, avoiding free-after-DLL-unload issues.
// and using shared_ptrs since we don't want to leak more than CacheSize elements
// (with a plain static array we would have to manage the objects' lifetime ourselves, and its destructors would run for every element at DLL unload)
const int CacheSize = 32;
static shared_ptr<GPUMatrix<ElemType>> * onesCache = new shared_ptr<GPUMatrix<ElemType>>[CacheSize]; // cache of objects
if (deviceId >= CacheSize)
LogicError("GetOnesVector: onesCache[] too small (%d entries), increase (you need %d) and recompile.", CacheSize, (int)deviceId + 1);
auto p = onesCache[deviceId];
if (!p || p->GetNumRows() < N) // must (re-)allocate
{
@ -4300,8 +4390,11 @@ void GPUMatrix<ElemType>::TensorOp(ElemType beta, const GPUMatrix<ElemType>& a,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides)
{
if (reductionOp != ElementWiseOperator::opSum) // TODO: enable the reduction ops
InvalidArgument("TensorOp: Unary reduction operations other than opSum not yet implemented.");
if (reductionOp != ElementWiseOperator::opSum &&
reductionOp != ElementWiseOperator::opLogSum &&
reductionOp != ElementWiseOperator::opMin &&
reductionOp != ElementWiseOperator::opMax)
InvalidArgument("TensorOp: Unary reduction operations other than opMax, opMin, opSum, and opLogSum are not implemented.");
a.PrepareDevice();
if (a.GetComputeDeviceId() != GetComputeDeviceId())
@ -4322,10 +4415,11 @@ void GPUMatrix<ElemType>::TensorOp(ElemType beta, const GPUMatrix<ElemType>& a,
return LaunchUnaryTensorOp<ElemType>(beta, a.Data()+ offsets[0], Data()+ offsets[1], alpha, op, regularOpDims[0]);
}
// special case: reducing a matrix onto a column vector; can be done with SGEMM
// special case: sum-reducing a matrix onto a column vector; can be done with SGEMM
// Note: A minor risk is that with this, our own reduction function will rarely be used.
// That function was tested to give the same results with 'double', and nearly the same with 'float' (different summation order matters).
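// (conceptually: this = beta * this + alpha * a * ones, with the ones vector cached by GetOnesVector() above)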
else if (op == ElementWiseOperator::opCopy && // we are just adding to target without any further operation
reductionOp == ElementWiseOperator::opSum &&
#ifdef _DEBUG
sizeof(ElemType) == sizeof(float) && // in debug don't shortcut 'double' so we have some test of our own codepath
#endif
@ -4348,7 +4442,7 @@ void GPUMatrix<ElemType>::TensorOp(ElemType beta, const GPUMatrix<ElemType>& a,
// regular case
else
return TensorOpN<ElemType, 2>(beta, array<ElemType*, 2>{a.Data(), Data()}, alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
return TensorOpN<ElemType, 2>(beta, array<ElemType*, 2>{a.Data(), Data()}, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
}
// perform binary operation 'op' on a and b giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides
@ -4365,7 +4459,7 @@ void GPUMatrix<ElemType>::TensorOp(ElemType beta, const GPUMatrix<ElemType>& a,
if (a.GetComputeDeviceId() != GetComputeDeviceId() || b.GetComputeDeviceId() != GetComputeDeviceId())
InvalidArgument("All matrices must be on the same GPU");
return TensorOpN<ElemType, 3>(beta, array<ElemType*, 3>{a.Data(), b.Data(), Data()}, alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
return TensorOpN<ElemType, 3>(beta, array<ElemType*, 3>{a.Data(), b.Data(), Data()}, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
}
// perform ternary operation 'op' on a, and c giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides
@ -4381,7 +4475,7 @@ void GPUMatrix<ElemType>::TensorOp(ElemType beta, const GPUMatrix<ElemType>& a,
a.PrepareDevice();
if (a.GetComputeDeviceId() != GetComputeDeviceId() || b.GetComputeDeviceId() != GetComputeDeviceId() || c.GetComputeDeviceId() != GetComputeDeviceId())
InvalidArgument("All matrices must be on the same GPU");
return TensorOpN<ElemType, 4>(beta, array<ElemType*, 4>{a.Data(), b.Data(), c.Data(), Data()}, alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
return TensorOpN<ElemType, 4>(beta, array<ElemType*, 4>{a.Data(), b.Data(), c.Data(), Data()}, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
}
// =======================================================================
@ -4420,24 +4514,50 @@ template void GPUMatrix<char>::SetValue(const size_t numRows, const size_t numCo
template void GPUMatrix<char>::SetValue(GPUMatrix<char> const&);
//template void GPUMatrix<char>::SetValue(CPUSparseMatrix<char> const&);
//template void GPUMatrix<char>::SetValue(GPUSparseMatrix<char> const&);
template void GPUMatrix<char>::CopySection(size_t numRows, size_t numCols, char* dst, size_t colStride) const;
template void GPUMatrix<char>::Reshape(const size_t, const size_t);
template GPUMatrix<char>& GPUMatrix<char>::operator*=(char);
template DEVICEID_TYPE GPUMatrix<char>::PrepareDevice(DEVICEID_TYPE deviceId) const;
// Support <short>
template GPUMatrix<short>::GPUMatrix(const size_t numRows, const size_t numCols, int deviceId);
template GPUMatrix<short>::GPUMatrix(const size_t numRows, const size_t numCols, int deviceId, short* pArray, const size_t matrixFlags);
template GPUMatrix<short>::GPUMatrix(const GPUMatrix<short>&);
template GPUMatrix<short>::GPUMatrix(GPUMatrix<short>&&);
template short* GPUMatrix<short>::CopyToArray() const;
template void GPUMatrix<short>::ChangeDeviceTo(int);
template void GPUMatrix<short>::Resize(size_t, size_t, bool);
template void GPUMatrix<short>::RequireSize(size_t, size_t, bool);
template GPUMatrix<short>::~GPUMatrix();
template GPUMatrix<short> GPUMatrix<short>::ColumnSlice(size_t startColumn, size_t numCols) const;
template GPUMatrix<short>& GPUMatrix<short>::operator=(GPUMatrix<short>&&);
template GPUMatrix<short>::GPUMatrix(int);
template void GPUMatrix<short>::SetValue(const short);
template void GPUMatrix<short>::SetValue(const size_t numRows, const size_t numCols, int deviceId, short* pArray, size_t matrixFlags);
//template void GPUMatrix<short>::SetValue(CPUMatrix<short> const&);
template void GPUMatrix<short>::SetValue(GPUMatrix<short> const&);
//template void GPUMatrix<short>::SetValue(CPUSparseMatrix<short> const&);
//template void GPUMatrix<short>::SetValue(GPUSparseMatrix<short> const&);
template void GPUMatrix<short>::CopySection(size_t numRows, size_t numCols, short* dst, size_t colStride) const;
template void GPUMatrix<short>::Reshape(const size_t, const size_t);
template GPUMatrix<short>& GPUMatrix<short>::operator*=(short);
template DEVICEID_TYPE GPUMatrix<short>::PrepareDevice(DEVICEID_TYPE deviceId) const;
template GPUMatrix<int>::GPUMatrix(const size_t, const size_t, int, int*, const size_t);
template GPUMatrix<int>::~GPUMatrix();
template int* TracingGPUMemoryAllocator::Allocate<int>(int, size_t);
template size_t* TracingGPUMemoryAllocator::Allocate<size_t>(int, size_t);
template long* TracingGPUMemoryAllocator::Allocate<long>(int, size_t);
template short* TracingGPUMemoryAllocator::Allocate<short>(int, size_t);
template char* TracingGPUMemoryAllocator::Allocate<char>(int, size_t);
template float* TracingGPUMemoryAllocator::Allocate<float>(int, size_t);
template double* TracingGPUMemoryAllocator::Allocate<double>(int, size_t);
template void TracingGPUMemoryAllocator::Free<int>(int, int*, bool);
template void TracingGPUMemoryAllocator::Free<size_t>(int, size_t*, bool);
template void TracingGPUMemoryAllocator::Free<short>(int, short*, bool);
template void TracingGPUMemoryAllocator::Free<char>(int, char*, bool);
template void TracingGPUMemoryAllocator::Free<float>(int, float*, bool);
template void TracingGPUMemoryAllocator::Free<double>(int, double*, bool);

Просмотреть файл

@ -61,6 +61,27 @@ cudaStream_t MATH_API GetStream();
namespace Microsoft { namespace MSR { namespace CNTK {
// -----------------------------------------------------------------------
// SyncGuard -- synchronize around CUDA calls
// -----------------------------------------------------------------------
class SyncGuard
{
private:
static bool s_isSyncEnabled;
bool m_forceSync;
#ifndef CPUONLY
cudaEvent_t m_done;
#endif
public:
static MATH_API void EnableSync();
SyncGuard(bool forceSync = false);
~SyncGuard();
};
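// Typical usage at a kernel launch site (minimal sketch; kernel name and arguments are placeholders):
//   {
//       SyncGuard syncGuard;  // no-op unless sync was enabled via EnableSync() or forceSync == true
//       someKernel<<<grid, block, 0, t_stream>>>(args...);
//   }  // on scope exit the guard records an event, synchronizes on it, and surfaces any pending CUDA error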
// -----------------------------------------------------------------------
// DeviceBoundNumber -- This class represents a number which resides on a particular device. Use it to avoid unnecessary transfers between CPU and GPU
// -----------------------------------------------------------------------
@ -207,18 +228,14 @@ public:
// multiple views, RequireSize will first check to see if Resize is required. If it is not, then it short-circuits and is a noop. Otherwise, RequireSize
// will call Resize, which may fail if the matrix has multiple views.
void RequireSize(const size_t numRows, const size_t numCols, bool growOnly = true); // by default we only reallocate if need to grow
void RequireSize(const GPUMatrix<ElemType>& like, bool growOnly = true) { RequireSize(like.GetNumRows(), like.GetNumCols(), growOnly); }
// Resize first checks to ensure that the caller has the authority to call Resize (i.e., it checks to ensure the underlying data is owned by only this matrix), and then
// actually resizes the underlying matrix, doing any allocation as required.
void Resize(const size_t numRows, const size_t numCols, bool growOnly = true); // by default we only reallocate if need to grow
ElemType& operator()(const size_t /*row*/, const size_t /*col*/)
{
LogicError("GPUMatrix doesn't support this");
}
const ElemType& operator()(const size_t /*row*/, const size_t /*col*/) const
{
LogicError("GPUMatrix doesn't support this");
}
ElemType& operator()(const size_t /*row*/, const size_t /*col*/) { LogicError("GPUMatrix doesn't support operator(,) on the CPU."); }
const ElemType& operator()(const size_t /*row*/, const size_t /*col*/) const { LogicError("GPUMatrix doesn't support operator(,) on the CPU."); }
ElemType Get00Element() const;
void SetValue(const ElemType v);
@ -453,7 +470,8 @@ public:
void BatchNormalizationForward(const GPUMatrix<ElemType>& scale, const GPUMatrix<ElemType>& bias, double expAvgFactor, double blendFactor,
GPUMatrix<ElemType>& runMean, GPUMatrix<ElemType>& runInvStdDev, GPUMatrix<ElemType>& out, double epsilon,
GPUMatrix<ElemType>& saveMean, GPUMatrix<ElemType>& saveInvStdDev) const;
void BatchNormalizationBackward(const GPUMatrix<ElemType>& in, GPUMatrix<ElemType>& grad, const GPUMatrix<ElemType>& scale, const GPUMatrix<ElemType>& saveMean, const GPUMatrix<ElemType>& saveInvStdDev,
void BatchNormalizationBackward(const GPUMatrix<ElemType>& in, GPUMatrix<ElemType>& grad, const GPUMatrix<ElemType>& scale, double blendFactor,
const GPUMatrix<ElemType>& saveMean, const GPUMatrix<ElemType>& saveInvStdDev,
GPUMatrix<ElemType>& scaleGrad, GPUMatrix<ElemType>& biasGrad) const;
public:
@ -623,51 +641,4 @@ static void CudaCall(ERRTYPE retCode, const char* exprString, const char* libNam
#define CURAND_CALL(expr) (CudaCall((expr), #expr, "CURAND", CURAND_STATUS_SUCCESS))
#define CUDNN_CALL(expr) (CudaCall((expr), #expr, "cuDNN", CUDNN_STATUS_SUCCESS))
// -----------------------------------------------------------------------
// SyncGuard -- synchronize around CUDA calls
// -----------------------------------------------------------------------
class SyncGuard
{
static bool DoSync()
{
#ifdef NO_SYNC // this strange way of writing it allows modifying this variable at runtime in the debugger
static bool do_sync = false;
#else
static bool do_sync = true;
#endif
return do_sync;
}
cudaEvent_t m_done;
public:
SyncGuard()
{
m_done = nullptr;
if (DoSync())
{
CUDA_CALL(cudaGetLastError());
CUDA_CALL(cudaEventCreate(&m_done));
}
}
~SyncGuard()
{
if (DoSync())
{
// The regular use of this destructor is to synchronize the GPU, but also
// to check for errors. So this destructor is where CUDA errors would be thrown.
// If this destructor runs during stack unwinding, then a different error has
// already happened that should be reported; so we only clean up the resource.
if (std::uncaught_exception())
cudaEventDestroy(m_done);
else
{
// failures in a prior launch might be reported here
CUDA_CALL(cudaEventRecord(m_done));
CUDA_CALL(cudaEventSynchronize(m_done));
CUDA_CALL(cudaEventDestroy(m_done));
}
}
}
};
#endif // CPUONLY

Просмотреть файл

@ -95,8 +95,8 @@ static INT CeilDiv(INT a, INT2 b) // ceil(a/b)
struct GridDim
{
static const CUDA_LONG maxThreadsPerBlock = 512; // use this many threads per block
static const CUDA_LONG maxWarpsPerBlock = 16; // use this many warps per block. This means 512 threads for warpSize=32
static const CUDA_LONG maxThreadsPerBlock = 1024; // use this many threads per block
static const CUDA_LONG maxWarpsPerBlock = 32; // use this many warps per block. This means 1024 threads for warpSize=32
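// Note: kernels that depend on a specific thread count carry it in their name (e.g. _reductionSum1024Threads,
// _vectorMaxMinReduce512Threads) and are launched with that explicit count, independent of this default.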
// use these for launching
// GridDim grid(NN);
@ -127,7 +127,7 @@ struct GridDim
}
// put it back together
m_threadsPerBlock = warpsPerProc * warpSize; // =a multiple of 32 that is as close to 512 as makes sense given NN
m_threadsPerBlock = warpsPerProc * warpSize; // =a multiple of 32 that is as close to 1024 as makes sense given NN
m_blocksPerGrid = CeilDiv(N, m_threadsPerBlock);
if (m_blocksPerGrid == 1)
m_threadsPerBlock = N; // don't launch more than necessary --TODO: Does this make a difference at all?
@ -847,7 +847,7 @@ __global__ void _logSoftMaxColWise(
// each block processes one column. There must be 512 threads in a block
template <class ElemType>
__global__ void _assignColumnwiseLogSoftmaxOf(
__global__ void _assignColumnwiseLogSoftmaxOf512Threads(
const ElemType* a,
ElemType* us,
const CUDA_LONG m_numCols,
@ -1015,7 +1015,7 @@ __global__ void _logSoftMaxRowWise(
// each block processes one column. There must be 512 threads in a block
template <class ElemType>
__global__ void _assignColumnwiseHardmaxOf(
__global__ void _assignColumnwiseHardmaxOf512Threads(
const ElemType* a,
ElemType* us,
const CUDA_LONG m_numCols,
@ -2198,7 +2198,7 @@ __global__ void _addSignOf(
// This function processes one column per block; it needs 512 threads per block.
template <class ElemType, bool IsMax>
__global__ void _vectorMaxMinReduce(
__global__ void _vectorMaxMinReduce512Threads(
const ElemType* us,
ElemType* Indexes,
ElemType* Values,
@ -2585,7 +2585,7 @@ __global__ void _addElementToElement(
}
template <class ElemType>
__global__ void _assignNumOfDiff(
__global__ void _assignNumOfDiff1024Threads(
const ElemType* a,
const ElemType* b,
ElemType* c,
@ -2664,7 +2664,7 @@ __global__ void _assignNumOfDiff(
}
/*template<class ElemType>
__global__ void _assignNumOfDiff(
__global__ void _assignNumOfDiff1024Threads(
ElemType *a,
ElemType *b,
ElemType *c,
@ -3343,8 +3343,9 @@ __global__ void _computeGradientOfInput(
}
#endif
#if 0
template <class ElemType>
__global__ void computeNCEForwardProp(
__global__ void computeNCEForwardProp512Threads(
const ElemType* val,
const int* col,
int numRows,
@ -3406,9 +3407,10 @@ __global__ void computeNCEForwardProp(
res[i] = partials[0];
}
}
#endif
template <class ElemType>
__global__ void _computeNceOutput(
__global__ void _computeNceOutputMax512Threads(
const ElemType* col,
int numRows,
int sampleCount,
@ -3477,7 +3479,7 @@ __global__ void _computeNceOutput(
}
template <class ElemType>
__global__ void _assignSoftmaxSum(
__global__ void _assignSoftmaxSumMax512Threads(
const ElemType* softmax,
int sampleCount,
const ElemType* a,
@ -3489,7 +3491,7 @@ __global__ void _assignSoftmaxSum(
// col is an array contains index of the word samples
// a is a matrix in column major format contains output from hidden layer
// b is the weight matrix for output layer
// tmp is the buffer that stores NCE output calculated from _computeNceOutput
// tmp is the buffer that stores NCE output calculated from _computeNceOutputMax512Threads
// c is the matrix to store objective
__shared__ ElemType partials[512];
@ -3529,7 +3531,7 @@ __global__ void _assignSoftmaxSum(
}
template <class ElemType>
__global__ void _assignNoiseContrastiveEstimation(
__global__ void _assignNoiseContrastiveEstimationMax512Threads(
const ElemType* val,
int numRows,
int sampleCount,
@ -3545,7 +3547,7 @@ __global__ void _assignNoiseContrastiveEstimation(
// col is an array contains index of the word samples
// a is a matrix in column major format contains output from hidden layer
// b is the weight matrix for output layer
// tmp is the buffer that stores NCE output calculated from _computeNceOutput
// tmp is the buffer that stores NCE output calculated from _computeNceOutputMax512Threads
// c is the matrix to store objective
__shared__ ElemType partials[512];
@ -3863,7 +3865,7 @@ __global__ void _normalGradForSparseBlock(
//This function should be called with 1024 threads per block and 1 block
//THIS IS NOT THE MOST EFFICIENT IMPLEMENTATION!!!
template <class ElemType>
__global__ void _reductionSum(
__global__ void _reductionSum1024Threads(
const ElemType* data,
ElemType* sum,
CUDA_LONG N)
@ -3944,7 +3946,7 @@ __global__ void _reductionSum(
//This function should be called with 1024 threads per block and 1 block
//THIS IS NOT THE MOST EFFICIENT IMPLEMENTATION!!!
template <class ElemType>
__global__ void _reductionSumAndAssign(
__global__ void _reductionSumAndAssign1024Threads(
ElemType* toAssign,
const ElemType* data,
CUDA_LONG N, // length of data
@ -4028,7 +4030,7 @@ __global__ void _reductionSumAndAssign(
//This function should be called with 1024 threads per block and 1 block
//THIS IS NOT THE MOST EFFICIENT IMPLEMENTATION!!!
template <class ElemType>
__global__ void _reductionSum2(
__global__ void _reductionSum21024Threads(
const ElemType* data,
ElemType* sum,
CUDA_LONG N,
@ -4118,7 +4120,7 @@ __global__ void _reductionSum2(
//This function should be called with 1024 threads per block and 1 block
//THIS IS NOT THE MOST EFFICIENT IMPLEMENTATION!!!
template <class ElemType>
__global__ void _reductionMatrixNormInf(
__global__ void _reductionMatrixNormInf1024Threads(
const ElemType* data,
ElemType* maxAbs,
CUDA_LONG N)
@ -4206,7 +4208,7 @@ __global__ void _reductionMatrixNormInf(
//This function should be called with 1024 threads per block and 1 block
//THIS IS NOT THE MOST EFFICIENT IMPLEMENTATION!!!
template <class ElemType>
__global__ void _reductionMatrixNorm0(
__global__ void _reductionMatrixNorm01024Threads(
const ElemType* data,
ElemType* nz,
CUDA_LONG N)
@ -4306,7 +4308,7 @@ __global__ void _getSparseVectorRepresntationForCSCMatrix(
}
template <class ElemType>
__global__ void _lrHelper(
__global__ void _lrHelper512Threads(
const ElemType* data1,
const ElemType* data2,
const CUDA_LONG N,
@ -4408,7 +4410,7 @@ __global__ void _lrHelper(
/*
template<class ElemType>
__global__ void _lrHelper(
__global__ void _lrHelper512Threads(
ElemType* d_tmp)
{
if (sizeof(ElemType)==sizeof(float))
@ -4572,83 +4574,11 @@ __global__ void _minusOneAt(
c[id] = c[id] - 1.0;
}
// the kernel function for RCRF backward computation
// the kernel function for CRFLSTMNetwork backward computation
// assume a column slice of input and output
// This function assumes iNumLab <= 1024 and that shared memory == total (!) number of threads == 3 * iNumLab.
template <class ElemType>
__global__ void _rcrfBackwardCompute(
const size_t iNumPos,
const ElemType* galpha, // column slice at current time t
ElemType* gbeta, // column slices with [row, 2] at current time t for [
const ElemType* gpair_scores,
const size_t iNumLab, const int shift)
{
int id = blockDim.x * blockIdx.x + threadIdx.x;
extern __shared__ double sh_alpha_and_beta[]; // intersting, has to use [], instead of *
// need bye size = (iNumPos * iNumLab * 2 + iNumLab * iNumLab) * sizeof(ElemType)
ElemType* alpha = (ElemType*) (sh_alpha_and_beta);
ElemType* pair_scores = alpha + iNumPos * iNumLab;
ElemType* beta = alpha + iNumPos * iNumLab + iNumLab * iNumLab;
if (id < 0 || id >= iNumLab)
return;
// copy global memory to shared memory to save time
for (int t = iNumPos - 1; t >= 0; t--)
{
alpha[IDX2C(id, t, iNumLab)] = galpha[IDX2C(id, t, iNumLab)];
}
for (int j = 0; j < iNumLab; j++)
pair_scores[IDX2C(id, j, iNumLab)] = gpair_scores[IDX2C(id, j, iNumLab)];
__syncthreads();
for (int t = iNumPos - 1; t >= 0; t--)
{
ElemType fSum;
ElemType fTmp = LZERO;
if (t == iNumPos - 1)
{
fSum = LZERO;
for (int j = 0; j < iNumLab; j++)
{
fSum = logaddk(fSum, alpha[IDX2C(j, t, iNumLab)]);
}
fTmp = alpha[IDX2C(id, t, iNumLab)] - fSum;
}
else
{
for (int j = 0; j < iNumLab; j++)
{
fSum = LZERO;
for (int m = 0; m < iNumLab; m++)
{
fSum = logaddk(fSum, alpha[IDX2C(m, t, iNumLab)] + pair_scores[IDX2C(j, m, iNumLab)]);
}
fTmp = logaddk(fTmp, beta[IDX2C(j, t + 1, iNumLab)] + alpha[IDX2C(id, t, iNumLab)] + pair_scores[IDX2C(j, id, iNumLab)] - fSum);
}
}
beta[IDX2C(id, t, iNumLab)] = fTmp;
__syncthreads();
}
// copy from shared memory to global memory to pass values
for (int t = iNumPos - 1; t >= 0; t--)
{
gbeta[IDX2C(id, t, iNumLab)] = beta[IDX2C(id, t, iNumLab)];
}
// __syncthreads();
}
/// the kernel function for CRFLSTMNetwork backward computation
/// assume a column slice of input and output
template <class ElemType>
__global__ void _rcrfBackwardCompute(
__global__ void _rcrfBackwardComputeMax1024Labels(
const size_t t, // time position
const size_t iNumPos,
const ElemType* galpha, // column slice at current time t
@ -4659,13 +4589,13 @@ __global__ void _rcrfBackwardCompute(
{
int id = blockDim.x * blockIdx.x + threadIdx.x;
extern __shared__ double sh_alpha_and_beta[]; // intersting, has to use [], instead of *
// need bye size = (iNumPos * iNumLab * 2 + iNumLab * iNumLab) * sizeof(ElemType)
extern __shared__ double sh_alpha_and_beta[]; // [id] or [id + iNumLab] or [id + 2 * iNumLab]
// need byte size = (iNumPos * iNumLab * 2 + iNumLab * iNumLab) * sizeof(ElemType)
ElemType* alpha = (ElemType*) (sh_alpha_and_beta);
ElemType* beta_t1 = (ElemType*) (alpha + iNumLab);
ElemType* zeta = (ElemType*) (beta_t1 + iNumLab);
ElemType pair_scores[1024];
ElemType pair_scores[1024]; // [j=0..iNumLab-1]
if (id < 0 || id >= iNumLab)
return;
@ -4697,9 +4627,10 @@ __global__ void _rcrfBackwardCompute(
gbeta[IDX2C(id, t, iNumLab)] = fTmp;
}
/// $\zeta_t(j) = {\sum_k exp(\delta_{t-1}(k) + a_{kj}(t))}$.
// $\zeta_t(j) = {\sum_k exp(\delta_{t-1}(k) + a_{kj}(t))}$.
// This function assumes iNumLab <= 1024 and that shared memory == total (!) number of threads == iNumLab.
template <class ElemType>
__global__ void _rcrfBackwardComputeZeta(
__global__ void _rcrfBackwardComputeZetaMax1024Labels(
const size_t t, // time position
const size_t iNumPos,
const ElemType* galpha, // column slice at current time t
@ -4709,11 +4640,11 @@ __global__ void _rcrfBackwardComputeZeta(
{
int id = blockDim.x * blockIdx.x + threadIdx.x;
extern __shared__ double sh_alpha_and_beta[]; // intersting, has to use [], instead of *
// need bye size = (iNumPos * iNumLab * 2 + iNumLab * iNumLab) * sizeof(ElemType)
extern __shared__ double sh_alpha_and_beta[]; // [id]
// need byte size = (iNumPos * iNumLab * 2 + iNumLab * iNumLab) * sizeof(ElemType)
ElemType* alpha = (ElemType*) (sh_alpha_and_beta);
ElemType pair_scores[1024];
ElemType pair_scores[1024]; // [j=0..iNumLab-1]
if (id < 0 || id >= iNumLab)
return;
@ -4739,8 +4670,9 @@ __global__ void _rcrfBackwardComputeZeta(
}
/// $\zeta_t(j) = {\sum_k exp(\delta_{t-1}(k) + a_{kj}(t))}$.
// This function assumes iNumLab <= 1024 and that shared memory == total (!) number of threads == iNumLab.
template <class ElemType>
__global__ void _rcrfTransGrdComputeZeta(
__global__ void _rcrfTransGrdComputeZetaMax1024Labels(
const int t, // time position
const size_t iNumPos,
const ElemType* galpha, // column slice at current time t
@ -4752,11 +4684,11 @@ __global__ void _rcrfTransGrdComputeZeta(
{
int id = blockDim.x * blockIdx.x + threadIdx.x;
extern __shared__ double sh_alpha_and_beta[]; // intersting, has to use [], instead of *
// need bye size = (iNumPos * iNumLab * 2 + iNumLab * iNumLab) * sizeof(ElemType)
extern __shared__ double sh_alpha_and_beta[]; // [id]
// need byte size = (iNumPos * iNumLab * 2 + iNumLab * iNumLab) * sizeof(ElemType)
ElemType* alpha = (ElemType*) (sh_alpha_and_beta);
ElemType pair_scores[1024];
ElemType pair_scores[1024]; // [j=0..iNumLab-1]
if (id < 0 || id >= iNumLab)
return;
@ -4790,8 +4722,9 @@ __global__ void _rcrfTransGrdComputeZeta(
gzeta[id] = fSum;
}
// This function assumes iNumLab <= 1024 and that shared memory == total (!) number of threads == iNumLab.
template <class ElemType>
__global__ void _rcrfTransGrdCompute(
__global__ void _rcrfTransGrdComputeMax1024Labels(
int t,
const size_t start_lbl,
const ElemType* galpha,
@ -4806,13 +4739,13 @@ __global__ void _rcrfTransGrdCompute(
{
int id = blockDim.x * blockIdx.x + threadIdx.x;
extern __shared__ double sh_alpha_and_beta[]; // intersting, has to use [], instead of *
// need bye size = (iNumPos * iNumLab * 2 + iNumLab * iNumLab) * sizeof(ElemType)
extern __shared__ double sh_alpha_and_beta[]; // [id]
// need byte size = (iNumPos * iNumLab * 2 + iNumLab * iNumLab) * sizeof(ElemType)
ElemType* alpha = (ElemType*) (sh_alpha_and_beta);
ElemType* beta = (ElemType*) (alpha + iNumLab);
ElemType* zeta = (ElemType*) (beta + iNumLab);
ElemType pair_scores[1024];
ElemType pair_scores[1024]; // [j=0..iNumLab-1]
if (id < 0 || id >= iNumLab)
return;

Просмотреть файл

@ -2290,7 +2290,7 @@ ElemType GPUSparseMatrix<ElemType>::SumOfElements() const
ElemType* d_sum = TracingGPUMemoryAllocator::Allocate<ElemType>(GetComputeDeviceId(), 1);
ElemType h_sum;
// WARNING: THIS kernel is not the most efficient way!
_reductionSum<ElemType><<<1, 1024>>>(NzValues(), d_sum, (LONG64) GetNumNZElements());
_reductionSum1024Threads<ElemType><<<1, 1024>>>(NzValues(), d_sum, (LONG64) GetNumNZElements());
CUDA_CALL(cudaMemcpy(&h_sum, d_sum, sizeof(ElemType), cudaMemcpyDeviceToHost));
TracingGPUMemoryAllocator::Free<ElemType>(GetComputeDeviceId(), d_sum);
@ -2307,7 +2307,7 @@ ElemType GPUSparseMatrix<ElemType>::FrobeniusNorm() const
ElemType* d_sum = TracingGPUMemoryAllocator::Allocate<ElemType>(GetComputeDeviceId(), 1);
ElemType h_sum = 0;
// WARNING: THIS kernel is not the most efficient way!
_reductionSum2<ElemType><<<1, 1024>>>(NzValues(), d_sum, (int) GetNumNZElements());
_reductionSum21024Threads<ElemType><<<1, 1024>>>(NzValues(), d_sum, (int) GetNumNZElements());
CUDA_CALL(cudaMemcpy(&h_sum, d_sum, sizeof(ElemType), cudaMemcpyDeviceToHost));
TracingGPUMemoryAllocator::Free<ElemType>(GetComputeDeviceId(), d_sum);
@ -2326,7 +2326,7 @@ ElemType GPUSparseMatrix<ElemType>::MatrixNormInf() const
ElemType* d_maxAbs = TracingGPUMemoryAllocator::Allocate<ElemType>(GetComputeDeviceId(), 1);
ElemType h_maxAbs = 0;
// WARNING: THIS kernel is not the most efficient way!
_reductionMatrixNormInf<ElemType><<<1, 1024>>>(NzValues(), d_maxAbs, (int) GetNumNZElements());
_reductionMatrixNormInf1024Threads<ElemType><<<1, 1024>>>(NzValues(), d_maxAbs, (int) GetNumNZElements());
CUDA_CALL(cudaMemcpy(&h_maxAbs, d_maxAbs, sizeof(ElemType), cudaMemcpyDeviceToHost));
TracingGPUMemoryAllocator::Free<ElemType>(GetComputeDeviceId(), d_maxAbs);
@ -2689,7 +2689,6 @@ template void GPUSparseMatrix<char>::CopyToCPUSparseMatrix(CPUSparseMatrix<char>
template void GPUSparseMatrix<char>::ChangeDeviceTo(int);
template void GPUSparseMatrix<char>::Resize(const size_t, const size_t, const size_t, const bool);
template void GPUSparseMatrix<char>::RequireSizeAndAllocate(const size_t, const size_t, const size_t, const bool, const bool);
template void GPUSparseMatrix<int>::RequireSizeAndAllocate(const size_t, const size_t, const size_t, const bool, const bool);
template void GPUSparseMatrix<char>::Reset();
template GPUSPARSE_INDEX_TYPE GPUSparseMatrix<char>::SecondaryIndexValueAt(size_t) const;
template GPUSparseMatrix<char>::~GPUSparseMatrix();
@ -2699,8 +2698,32 @@ template GPUSparseMatrix<char>& GPUSparseMatrix<char>::operator=(GPUSparseMatrix
template void GPUSparseMatrix<char>::Reshape(const size_t, const size_t);
template void GPUSparseMatrix<char>::ScaleAndAdd(char, GPUSparseMatrix<char> const &, GPUMatrix<char> &);
// Support <short>
template GPUSparseMatrix<short>::GPUSparseMatrix(DEVICEID_TYPE, const MatrixFormat);
template GPUSparseMatrix<short>::GPUSparseMatrix(const size_t, const size_t, const size_t, DEVICEID_TYPE, const MatrixFormat);
template GPUSparseMatrix<short>::GPUSparseMatrix(GPUSparseMatrix<short> const&);
template GPUSparseMatrix<short>::GPUSparseMatrix(GPUSparseMatrix<short>&&);
template void GPUSparseMatrix<short>::SetValue(CPUSparseMatrix<short> const&);
template void GPUSparseMatrix<short>::SetValue(GPUSparseMatrix<short> const&);
template void GPUSparseMatrix<short>::SetValue(GPUMatrix<short> const&);
//template void GPUSparseMatrix<short>::SetValue(CPUMatrix<short> const&);
template void GPUSparseMatrix<short>::CopyToDenseMatrix(GPUMatrix<short>&) const;
template void GPUSparseMatrix<short>::CopyToCPUSparseMatrix(CPUSparseMatrix<short>&) const;
template void GPUSparseMatrix<short>::ChangeDeviceTo(int);
template void GPUSparseMatrix<short>::Resize(const size_t, const size_t, const size_t, const bool);
template void GPUSparseMatrix<short>::RequireSizeAndAllocate(const size_t, const size_t, const size_t, const bool, const bool);
template void GPUSparseMatrix<short>::Reset();
template GPUSPARSE_INDEX_TYPE GPUSparseMatrix<short>::SecondaryIndexValueAt(size_t) const;
template GPUSparseMatrix<short>::~GPUSparseMatrix();
template GPUSparseMatrix<short> GPUSparseMatrix<short>::ColumnSlice(size_t, size_t) const;
template GPUMatrix<short> GPUSparseMatrix<short>::CopyColumnSliceToDense(size_t, size_t) const;
template GPUSparseMatrix<short>& GPUSparseMatrix<short>::operator=(GPUSparseMatrix<short>&&);
template void GPUSparseMatrix<short>::Reshape(const size_t, const size_t);
template void GPUSparseMatrix<short>::ScaleAndAdd(short, GPUSparseMatrix<short> const &, GPUMatrix<short> &);
template GPUSparseMatrix<int>::GPUSparseMatrix(DEVICEID_TYPE, const MatrixFormat);
template GPUSparseMatrix<int>::~GPUSparseMatrix();
template void GPUSparseMatrix<int>::RequireSizeAndAllocate(const size_t, const size_t, const size_t, const bool, const bool);
template <class ElemType>
MATH_API File& operator>>(File& stream, GPUSparseMatrix<ElemType>& us)


@ -19,6 +19,7 @@
#include <cuda_runtime.h>
#include "cublas_v2.h"
#include <assert.h>
#include <limits.h>
#ifndef let
#define let const auto
@ -47,9 +48,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// - supports general strides
// - input broadcasting is supported by stride=0
// - the operation is denoted by an opCode
// - reduction is supported, including summation (dual to broadcasting when computing gradients)
// - reduction operation is given by an opCode. Only a few specific opCodes may be used for reduction.
// Note: reduction opCodes are not implemented yet, only summation is supported.
// - reduction is supported, including summation, min, max (dual to broadcasting when computing gradients)
// - reduction operation is given by an opCode: opSum, opMin, opMax and opLogSum.
//
// This library makes extensive use of templates and macros.
// Specifically, templates are used recursively to recurse over tensor dimensions.
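As a rough illustration of the stride-0 broadcasting and reduction idea described above: the same strided loop serves both a plain summation reduction and a broadcast input. A minimal host-side sketch (BroadcastSumSketch is a hypothetical helper for illustration, not part of the CNTK sources):

#include <cstddef>

// Host-side sketch only: an input traversed with stride 0 yields the same element at every
// step (broadcasting); with a non-zero stride the identical loop performs a strided reduction.
template <typename T>
void BroadcastSumSketch(const T* in, std::ptrdiff_t inStride, std::size_t reduceDim, T* out)
{
    T aggregate = 0; // neutral element of opSum
    for (std::size_t k = 0; k < reduceDim; k++)
        aggregate += in[(std::ptrdiff_t) k * inStride]; // inStride == 0 reads in[0] every time
    *out = aggregate;
}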
@ -261,6 +261,65 @@ struct TensorOps
}
};
//----------------------------------------------------------------------------
// For reductions we need the neutral elements of the corresponding binary ops
//----------------------------------------------------------------------------
template <typename ElemType> __device__ ElemType NeutralValue(ElementWiseOperator op)
{
return 0; // error, only the explicit specializations below should be used.
};
template<> __device__ float NeutralValue<float>(ElementWiseOperator op)
{
switch (op)
{
case ElementWiseOperator::opSum: return 0;
case ElementWiseOperator::opLogSum: return -INFINITY;
case ElementWiseOperator::opMin: return FLT_MAX;
case ElementWiseOperator::opMax: return -FLT_MAX; // lowest finite float (FLT_MIN is the smallest positive value, not a valid identity for max)
default: return 0; // error
}
};
template<> __device__ double NeutralValue<double>(ElementWiseOperator op)
{
switch (op)
{
case ElementWiseOperator::opSum: return 0;
case ElementWiseOperator::opLogSum: return -INFINITY;
case ElementWiseOperator::opMin: return DBL_MAX;
case ElementWiseOperator::opMax: return -DBL_MAX; // lowest finite double
default: return 0; // error
}
};
// ----------------------------------------------------------------------------
// Function to update an aggregate value for the specified reduction operation
// ----------------------------------------------------------------------------
template<typename ReductionType, class ElemType> __device__ void UpdateAggregate(ReductionType& aggregate, ElemType val, ElementWiseOperator reductionOp)
{
switch (reductionOp)
{
case ElementWiseOperator::opSum:
aggregate += val;
break;
case ElementWiseOperator::opLogSum:
aggregate = OpLogSum(aggregate, val);
break;
case ElementWiseOperator::opMin:
if (val < aggregate)
aggregate = val;
break;
case ElementWiseOperator::opMax:
if (val > aggregate)
aggregate = val;
break;
}
};
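Taken together, NeutralValue and UpdateAggregate let one generic loop implement every supported reduction. A sketch of how they compose (illustrative only, not part of the original source; both helpers are __device__ functions, so this would have to live in device code):

// Sequential analogue of the reduction loops below.
template <typename ReduceElemType, class ElemType>
__device__ ReduceElemType ReduceSketch(const ElemType* vals, CUDA_LONG n, ElementWiseOperator reductionOp)
{
    ReduceElemType aggregate = NeutralValue<ReduceElemType>(reductionOp); // identity of the op
    for (CUDA_LONG i = 0; i < n; i++)
        UpdateAggregate<ReduceElemType, ElemType>(aggregate, vals[i], reductionOp);
    return aggregate; // sum, log-sum, min, or max of vals[0..n-1]
}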
// -----------------------------------------------------------------------
// function to compute the value for a given output location (including reduction)
// -----------------------------------------------------------------------
@ -272,12 +331,12 @@ template <class ElemType, C_size_t N, C_int M, C_int m>
struct TensorOpReduce
{
// this version for m >= 0
static __device__ ElemType Compute(FixedArray<ElemType*, N> pointers, ElementWiseOperator op,
static __device__ ElemType Compute(FixedArray<ElemType*, N> pointers, ElementWiseOperator op, ElementWiseOperator reductionOp,
const FixedArray<C_unsigned_int, M>& reducingOpDims, const FixedMatrix<C_int, N, M>& reducingStrides)
{
// start with index 0
// We may use 'double' since we are memory-bound anyway.
ReduceElemType aggregate = TensorOpReduce<ElemType, N, M, m - 1>::Compute(pointers, op, reducingOpDims, reducingStrides);
ReduceElemType aggregate = TensorOpReduce<ElemType, N, M, m - 1>::Compute(pointers, op, reductionOp, reducingOpDims, reducingStrides);
// apply this index to the pointers
C_size_t dim = reducingOpDims[m];
for (C_size_t k = 1 /*done with k=0 already*/; k < dim; k++)
@ -285,8 +344,8 @@ struct TensorOpReduce
// bump the pointers
for (C_size_t i = 0; i < N - 1; i++) // N-1 because output is not used here
pointers[i] += reducingStrides(i, (C_size_t) m);
ElemType val = TensorOpReduce<ElemType, N, M, m - 1>::Compute(pointers, op, reducingOpDims, reducingStrides);
aggregate += val;
ElemType val = TensorOpReduce<ElemType, N, M, m - 1>::Compute(pointers, op, reductionOp, reducingOpDims, reducingStrides);
UpdateAggregate<ReduceElemType, ElemType>(aggregate, val, reductionOp);
}
return (ElemType) aggregate;
}
@ -299,7 +358,7 @@ struct TensorOpReduce<ElemType, N, M, /*m=*/-1>
{
// this version for m = -1
// the pointers are pointing to the right location(s) to take the operation over
static __device__ ElemType Compute(FixedArray<ElemType*, N> pointers, ElementWiseOperator op,
static __device__ ElemType Compute(FixedArray<ElemType*, N> pointers, ElementWiseOperator op, ElementWiseOperator reductionOp,
const FixedArray<C_unsigned_int, M>& /*reducingOpDims*/, const FixedMatrix<C_int, N, M>& /*reducingStrides*/)
{
return TensorOps<ElemType>::Compute(pointers, op); // finally computing something!
@ -354,7 +413,7 @@ template <class ElemType, C_size_t N, C_int M, C_int K, bool parallelReduce, C_i
struct TensorOpElement
{
// template-recursive version loops over indices
static __device__ void Compute(CUDA_LONG id, ElemType beta, FixedArray<ElemType*, N>& pointers, ElemType alpha, ElementWiseOperator op,
static __device__ void Compute(CUDA_LONG id, ElemType beta, FixedArray<ElemType*, N>& pointers, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const FixedArray<C_unsigned_int, K>& regularOpStrides, const FixedMatrix<C_int, N, K>& regularStrides,
const FixedArray<C_unsigned_int, M>& reducingOpDims, const FixedMatrix<C_int, N, M>& reducingStrides,
CUDA_LONG reductionBegin, CUDA_LONG reductionChunkSize)
@ -367,7 +426,7 @@ struct TensorOpElement
for (C_size_t i = 0; i < N; i++)
pointers[i] += index * regularStrides(i, (C_size_t) k); // now this dimension is taken care of
// process the previous index
TensorOpElement<ElemType, N, M, K, parallelReduce, k - 1>::Compute(id, beta, pointers, alpha, op, regularOpStrides, regularStrides, reducingOpDims, reducingStrides, reductionBegin, reductionChunkSize);
TensorOpElement<ElemType, N, M, K, parallelReduce, k - 1>::Compute(id, beta, pointers, alpha, op, reductionOp, regularOpStrides, regularStrides, reducingOpDims, reducingStrides, reductionBegin, reductionChunkSize);
}
};
@ -376,7 +435,7 @@ template <class ElemType, C_size_t N, C_int M, C_int K, bool parallelReduce>
struct TensorOpElement<ElemType, N, M, K, parallelReduce, /*k=*/0>
{
// template-recursive version loops over indices
static __device__ void Compute(CUDA_LONG id, ElemType beta, FixedArray<ElemType*, N>& pointers, ElemType alpha, ElementWiseOperator op,
static __device__ void Compute(CUDA_LONG id, ElemType beta, FixedArray<ElemType*, N>& pointers, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const FixedArray<C_unsigned_int, K>& regularOpStrides, const FixedMatrix<C_int, N, K>& regularStrides,
const FixedArray<C_unsigned_int, M>& reducingOpDims, const FixedMatrix<C_int, N, M>& reducingStrides,
CUDA_LONG reductionBegin, CUDA_LONG reductionChunkSize)
@ -387,7 +446,7 @@ struct TensorOpElement<ElemType, N, M, K, parallelReduce, /*k=*/0>
for (C_size_t i = 0; i < N; i++)
pointers[i] += index * regularStrides(i, 0); // now this dimension is taken care of
// process the previous index
TensorOpElement<ElemType, N, M, K, parallelReduce, -1>::Compute(/*id*/ 0, beta, pointers, alpha, op, regularOpStrides, regularStrides, reducingOpDims, reducingStrides, reductionBegin, reductionChunkSize);
TensorOpElement<ElemType, N, M, K, parallelReduce, -1>::Compute(/*id*/ 0, beta, pointers, alpha, op, reductionOp, regularOpStrides, regularStrides, reducingOpDims, reducingStrides, reductionBegin, reductionChunkSize);
}
};
@ -397,13 +456,13 @@ struct TensorOpElement<ElemType, N, M, K, /*parallelReduce=*/false, /*k=*/-1>
{
// template-recursion-terminating version computes the actual value for this output location
// now the output pointers point to the right element (input pointers may still iterate for reduction)
static __device__ void Compute(CUDA_LONG /*id*/, ElemType beta, FixedArray<ElemType*, N>& pointers, ElemType alpha, ElementWiseOperator op,
static __device__ void Compute(CUDA_LONG /*id*/, ElemType beta, FixedArray<ElemType*, N>& pointers, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const FixedArray<C_unsigned_int, K>& /*regularOpStrides*/, const FixedMatrix<C_int, N, K>& /*regularStrides*/,
const FixedArray<C_unsigned_int, M>& reducingOpDims, const FixedMatrix<C_int, N, M>& reducingStrides, CUDA_LONG /*reductionBegin*/, CUDA_LONG /*reductionChunkSize*/)
{
// compute the operation for this output coordinate
// This may still involve a reduction over inverse-broadcasting dimensions.
ElemType val = TensorOpReduce<ElemType, N, M, M - 1>::Compute(pointers, op, reducingOpDims, reducingStrides);
ElemType val = TensorOpReduce<ElemType, N, M, M - 1>::Compute(pointers, op, reductionOp, reducingOpDims, reducingStrides);
// scale
val *= alpha;
// combine with previous value in target matrix, then write it out
@ -423,7 +482,7 @@ struct TensorOpElement<ElemType, N, M, K, /*parallelReduce=*/true, /*k=*/-1>
{
// template-recursion-terminating version computes the actual value for this output location
// now the output pointers point to the right element (input pointers may still iterate for reduction)
static __device__ void Compute(CUDA_LONG /*id*/, ElemType beta, FixedArray<ElemType*, N>& pointers, ElemType alpha, ElementWiseOperator op,
static __device__ void Compute(CUDA_LONG /*id*/, ElemType beta, FixedArray<ElemType*, N>& pointers, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const FixedArray<C_unsigned_int, K>& /*regularOpStrides*/, const FixedMatrix<C_int, N, K>& /*regularStrides*/,
const FixedArray<C_unsigned_int, M>& reducingOpDims, const FixedMatrix<C_int, N, M>& reducingStrides, CUDA_LONG reductionBegin, CUDA_LONG reductionChunkSize)
{
@ -442,22 +501,24 @@ struct TensorOpElement<ElemType, N, M, K, /*parallelReduce=*/true, /*k=*/-1>
CUDA_LONG reductionEnd = min(reductionBegin + reductionChunkSize, reductionDim);
// compute the operation for this input coordinate
ReduceElemType sum = 0;
ReduceElemType aggregate = NeutralValue<ReduceElemType>(reductionOp);
for (CUDA_LONG redId = reductionBegin + tid; redId < reductionEnd; redId += tids)
{
auto val = TensorOpParallelReduce<ElemType, N, M, M - 1>::Compute(redId, pointers, op, reducingOpDims, reducingStrides);
sum += val;
UpdateAggregate<ReduceElemType, ElemType>(aggregate, val, reductionOp);
}
// reduce --cf https://docs.nvidia.com/cuda/samples/6_Advanced/reduction/doc/reduction.pdf
__shared__ ReduceElemType volatile accumulators[GridDim::maxThreadsPerBlock /*tids*/];
accumulators[tid] = sum;
accumulators[tid] = aggregate;
__syncthreads();
static_assert(GridDim::maxThreadsPerBlock <= 512, "GridDim::maxThreadsPerBlock too large, need to add manually unrolled steps");
for (CUDA_LONG i = 256; i; i >>= 1)
static_assert(GridDim::maxThreadsPerBlock <= 1024, "GridDim::maxThreadsPerBlock too large, need to add manually unrolled steps");
for (CUDA_LONG i = 512; i; i >>= 1)
{
if (tid < i && tid + i < tids)
accumulators[tid] += accumulators[tid + i];
UpdateAggregate<volatile ReduceElemType, volatile ReduceElemType>(accumulators[tid], accumulators[tid + i], reductionOp);
if (0 + i < tids)
__syncthreads(); // sync if condition true for at least one thread
// TODO: use volatile* and then we can skip the __syncthreads() for the last 32 values. See Amit's allreduce() function implementation in MatrixQuantizer_kernel.cu.
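The loop above is the standard shared-memory tree reduction: each pass halves the number of active threads until accumulators[0] holds the block-wide aggregate. For reference, a stand-alone plain-sum version of the same pattern (BlockSumSketch is illustrative only and assumes a power-of-two block size of at most 1024 threads):

__global__ void BlockSumSketch(const float* in, float* out, int n)
{
    __shared__ float acc[1024];
    int tid = threadIdx.x;
    float sum = 0;
    for (int i = tid; i < n; i += blockDim.x) // each thread accumulates a strided slice of the input
        sum += in[i];
    acc[tid] = sum;
    __syncthreads();
    for (int i = blockDim.x / 2; i > 0; i >>= 1) // tree reduction, halving the active threads each pass
    {
        if (tid < i)
            acc[tid] += acc[tid + i];
        __syncthreads();
    }
    if (tid == 0)
        out[blockIdx.x] = acc[0]; // one partial result per block
}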
@ -496,13 +557,13 @@ struct TensorOpElement<ElemType, N, M, K, /*parallelReduce=*/true, /*k=*/-1>
// launch tensor op with CUDA
template <class ElemType, C_size_t N, C_int M, C_int K>
__global__ void _launchTensorOp(ElemType beta, FixedArray<ElemType*, N> pointers, ElemType alpha, ElementWiseOperator op,
__global__ void _launchTensorOp(ElemType beta, FixedArray<ElemType*, N> pointers, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
FixedArray<C_unsigned_int, K> regularOpStrides, FixedMatrix<C_int, N, K> regularStrides, CUDA_LONG numElements,
FixedArray<C_unsigned_int, M> reducingOpDims, FixedMatrix<C_int, N, M> reducingStrides)
{
CUDA_LONG id = GridDim::GetLinearThreadId();
if (id < numElements) // note: there are no __syncthread() calls inside
TensorOpElement<ElemType, N, M, K, false, K - 1>::Compute(id, beta, pointers, alpha, op, regularOpStrides, regularStrides, reducingOpDims, reducingStrides, 0, 0);
TensorOpElement<ElemType, N, M, K, false, K - 1>::Compute(id, beta, pointers, alpha, op, reductionOp, regularOpStrides, regularStrides, reducingOpDims, reducingStrides, 0, 0);
}
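// Illustrative note (not from the original source): for a 1-D launch the linear thread id used
// above amounts to the usual CUDA flattening, roughly
//   blockIdx.x * blockDim.x + threadIdx.x
// so each thread owns exactly one output element; GridDim::GetLinearThreadId() is assumed to
// generalize this to whatever grid geometry GridDim chose.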
template <class ElemType, C_size_t N, C_int K>
@ -527,7 +588,7 @@ static void LaunchTensorOp(ElemType beta, array<ElemType*, N> pointerVector, Ele
CUDA_LONG NN = (CUDA_LONG) numElements; // linear space identifying each individual input element
SyncGuard syncGuard;
GridDim grid(NN);
_launchTensorOp<ElemType, N, /*M=*/0, K><<<grid.m_blocksPerGrid, grid.m_threadsPerBlock, 0, t_stream>>>(beta, pointers, alpha, op, regularOpStrides, regularStrides, grid.m_N, reducingOpDims, reducingStrides);
_launchTensorOp<ElemType, N, /*M=*/0, K> <<<grid.m_blocksPerGrid, grid.m_threadsPerBlock, 0, t_stream >>>(beta, pointers, alpha, op, (ElementWiseOperator)(-1) /* dummy reductionOp */, regularOpStrides, regularStrides, grid.m_N, reducingOpDims, reducingStrides);
}
// -----------------------------------------------------------------------
@ -535,7 +596,7 @@ static void LaunchTensorOp(ElemType beta, array<ElemType*, N> pointerVector, Ele
// -----------------------------------------------------------------------
template <class ElemType, C_size_t N, C_int M, C_int K>
__global__ void _launchTensorOpWithReduction(ElemType beta, FixedArray<ElemType*, N> pointers, ElemType alpha, ElementWiseOperator op,
__global__ void _launchTensorOpWithReduction(ElemType beta, FixedArray<ElemType*, N> pointers, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
FixedArray<C_unsigned_int, K> regularOpStrides, FixedMatrix<C_int, N, K> regularStrides, CUDA_LONG numElements,
FixedArray<C_unsigned_int, M> reducingOpDims, FixedMatrix<C_int, N, M> reducingStrides,
CUDA_LONG reductionBegin, CUDA_LONG reductionChunkSize)
@ -546,7 +607,7 @@ __global__ void _launchTensorOpWithReduction(ElemType beta, FixedArray<ElemType*
pointers[pointers.size() - 1] += numElements * reductionBlock; // the output tensor is dense (no gaps); and there is one copy for each reduction block (those get further reduced into one later)
#endif
if (id < numElements) // note: we have __syncthread() calls but only entire blocks in sync, so this is OK
TensorOpElement<ElemType, N, M, K, true, K - 1>::Compute(id, beta, pointers, alpha, op, regularOpStrides, regularStrides, reducingOpDims, reducingStrides, reductionBegin, reductionChunkSize);
TensorOpElement<ElemType, N, M, K, true, K - 1>::Compute(id, beta, pointers, alpha, op, reductionOp, regularOpStrides, regularStrides, reducingOpDims, reducingStrides, reductionBegin, reductionChunkSize);
}
// helper function to provide a reduction buffer
@ -582,7 +643,7 @@ static shared_ptr<ElemType> GetReductionBuffer(size_t N)
// All dimensions (N-ariness, number of input dimensions K and number of reduction dimensions M) are bound to template parameters now.
template <class ElemType, C_size_t N, C_int M, C_int K>
static void LaunchTensorOpWithReduction(ElemType beta, array<ElemType*, N> pointerVector, ElemType alpha, ElementWiseOperator op,
static void LaunchTensorOpWithReduction(ElemType beta, array<ElemType*, N> pointerVector, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, N>& regularStrideVectors,
const SmallVector<size_t>& reducingOpDimVector, const array<SmallVector<ptrdiff_t>, N>& reducingStrideVectors)
{
@ -601,7 +662,7 @@ static void LaunchTensorOpWithReduction(ElemType beta, array<ElemType*, N> point
FixedMatrix<C_int, N, M> reducingStrides(reducingStrideVectors);
// launch the kernel
CUDA_LONG NN = (CUDA_LONG) numElements; // linear space identifying each individual input element
CUDA_LONG NN = (CUDA_LONG) numElements; // linear space identifying each individual output element
SyncGuard syncGuard;
// do some optimization for reductions
@ -631,7 +692,7 @@ static void LaunchTensorOpWithReduction(ElemType beta, array<ElemType*, N> point
{
// we got enough elements to generate: do one element per thread, and reduction inside
_launchTensorOp<ElemType, N, M, K><<<grid.m_blocksPerGrid, grid.m_threadsPerBlock, 0, t_stream>>>(
beta, pointers, alpha, op,
beta, pointers, alpha, op, reductionOp,
regularOpStrides, regularStrides, grid.m_N,
reducingOpDims, reducingStrides);
}
@ -684,9 +745,9 @@ static void LaunchTensorOpWithReduction(ElemType beta, array<ElemType*, N> point
if (numReductionChunks == 1)
{
_launchTensorOpWithReduction<ElemType, N, M, K><<<dim3(numBlocksX, numBlocksY, numBlocksZ), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream>>>(
beta, pointers, alpha, op,
beta, pointers, alpha, op, reductionOp,
regularOpStrides, regularStrides, NN,
reducingOpDims, reducingStrides, 0, reductionChunkSize);
reducingOpDims, reducingStrides, /*reductionBegin*/ 0, reductionChunkSize);
}
// --- case (b)
// Reduction across blocks. This is the difficult one.
@ -721,7 +782,7 @@ static void LaunchTensorOpWithReduction(ElemType beta, array<ElemType*, N> point
ElemType beta1 = 0;
ElemType alpha1 = 1;
_launchTensorOpWithReduction<ElemType, N, M, K> << <dim3(numBlocksX, numBlocksY, numBlocksZ), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream >> >(
beta1, pointers1, alpha1, op,
beta1, pointers1, alpha1, op, reductionOp,
regularOpStrides, regularStrides1, NN,
reducingOpDims, reducingStrides, /*reductionBegin*/0, reductionChunkSize);
@ -738,14 +799,14 @@ static void LaunchTensorOpWithReduction(ElemType beta, array<ElemType*, N> point
const array<SmallVector<ptrdiff_t>, 2> reducingStrideVectors2{ SmallVector<ptrdiff_t>{ NN }, SmallVector<ptrdiff_t>{ 0 } };
const SmallVector<size_t> reducingOpDimVector2{ (size_t)numReductionChunks };
LaunchTensorOpWithReduction<ElemType, /*N=*/2, /*M=*/1, K>(
beta, pointerVector2, alpha, ElementWiseOperator::opCopy,
beta, pointerVector2, alpha, ElementWiseOperator::opCopy, reductionOp,
regularOpDims, regularStrideVectors2,
reducingOpDimVector2, reducingStrideVectors2);
// (note: ^^this will have a nested syncGuard, which is fine)
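// Illustrative note (not part of the original source): the two launches above form a two-stage
// reduction. Stage 1 writes one partial result per reduction chunk into the temporary buffer,
// laid out as tmp[outputIndex + NN * chunkIndex]; stage 2 reuses the same launcher with opCopy
// as the element op to fold the chunk axis, i.e. conceptually
//   out[i] = reduce(reductionOp, { tmp[i + NN * c] : c = 0..numReductionChunks-1 })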
#else
_launchTensorOp<ElemType, N, M, K><<<grid.m_blocksPerGrid, grid.m_threadsPerBlock, 0, t_stream>>>(
beta, pointers, alpha, op,
beta, pointers, alpha, op, reductionOp,
regularOpStrides, regularStrides, grid.m_N,
reducingOpDims, reducingStrides);
//for (size_t z = 0; z < numBlocksZ; z++)
@ -768,16 +829,16 @@ static void LaunchTensorOpWithReduction(ElemType beta, array<ElemType*, N> point
else if (beta == 1)
{
// no need to pre-scale; just add (common for gradients)
_launchTensorOpWithReduction<ElemType, N, M, K><<<dim3(numBlocksX, numBlocksY, numBlocksZ), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream>>>(beta, pointers, alpha, op, regularOpStrides, regularStrides, NN, reducingOpDims, reducingStrides, 0, reductionChunkSize);
_launchTensorOpWithReduction<ElemType, N, M, K><<<dim3(numBlocksX, numBlocksY, numBlocksZ), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream>>>(beta, pointers, alpha, op, reductionOp, regularOpStrides, regularStrides, NN, reducingOpDims, reducingStrides, 0, reductionChunkSize);
return;
}
else
{
// We need more than one chunk, we will use atomicAdd().
// First reset/pre-multiply input; then do the remaining chunks using atomicAdd().
_launchTensorOpWithReduction<ElemType, N, M, K><<<dim3(numBlocksX, numBlocksY, 1), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream>>>(beta, pointers, alpha, op, regularOpStrides, regularStrides, NN, reducingOpDims, reducingStrides, 0, reductionChunkSize);
_launchTensorOpWithReduction<ElemType, N, M, K><<<dim3(numBlocksX, numBlocksY, 1), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream>>>(beta, pointers, alpha, op, reductionOp, regularOpStrides, regularStrides, NN, reducingOpDims, reducingStrides, 0, reductionChunkSize);
// We will leave it like this for a while, but eventually need to revisit using temporary memory.
_launchTensorOpWithReduction<ElemType, N, M, K><<<dim3(numBlocksX, numBlocksY, numBlocksZ - 1), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream>>>(/*beta=*/1, pointers, alpha, op, regularOpStrides, regularStrides, NN, reducingOpDims, reducingStrides, reductionChunkSize, reductionChunkSize);
_launchTensorOpWithReduction<ElemType, N, M, K><<<dim3(numBlocksX, numBlocksY, numBlocksZ - 1), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream>>>(/*beta=*/1, pointers, alpha, op, reductionOp, regularOpStrides, regularStrides, NN, reducingOpDims, reducingStrides, reductionChunkSize, reductionChunkSize);
}
#endif
}
@ -856,7 +917,7 @@ void LaunchUnaryTensorOp(ElemType beta, const ElemType* pa, ElemType* pb, ElemTy
// tensor operation with k+1 dimensions (-1 means scalar)
template <class ElemType, C_size_t N, C_int K>
static void TensorOpWithRegularLoop(ElemType beta, const array<ElemType*, N>& pointers, ElemType alpha, ElementWiseOperator op,
static void TensorOpWithRegularLoop(ElemType beta, const array<ElemType*, N>& pointers, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, N>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides)
{
@ -864,9 +925,9 @@ static void TensorOpWithRegularLoop(ElemType beta, const array<ElemType*, N>& po
switch (dims)
{
case 2:
return LaunchTensorOpWithReduction<ElemType, N, 2, K>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
return LaunchTensorOpWithReduction<ElemType, N, 2, K>(beta, pointers, alpha, op, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 1:
return LaunchTensorOpWithReduction<ElemType, N, 1, K>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
return LaunchTensorOpWithReduction<ElemType, N, 1, K>(beta, pointers, alpha, op, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 0:
return LaunchTensorOp<ElemType, N, K>(beta, pointers, alpha, op, regularOpDims, regularStrides);
default:
@ -877,7 +938,7 @@ static void TensorOpWithRegularLoop(ElemType beta, const array<ElemType*, N>& po
// tensor operation, generalized in number of arguments
// This function now expands into different k. It also eliminates the offsets by adding them to the pointers.
template <class ElemType, C_size_t N>
void TensorOpN(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, ElementWiseOperator op,
void TensorOpN(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, N>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, N>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides)
@ -888,15 +949,15 @@ void TensorOpN(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, Elem
switch (dims)
{
case 4:
return TensorOpWithRegularLoop<ElemType, N, 4>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
return TensorOpWithRegularLoop<ElemType, N, 4>(beta, pointers, alpha, op, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 3:
return TensorOpWithRegularLoop<ElemType, N, 3>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
return TensorOpWithRegularLoop<ElemType, N, 3>(beta, pointers, alpha, op, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 2:
return TensorOpWithRegularLoop<ElemType, N, 2>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
return TensorOpWithRegularLoop<ElemType, N, 2>(beta, pointers, alpha, op, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 1:
return TensorOpWithRegularLoop<ElemType, N, 1>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
return TensorOpWithRegularLoop<ElemType, N, 1>(beta, pointers, alpha, op, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 0:
return TensorOpWithRegularLoop<ElemType, N, 0>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
return TensorOpWithRegularLoop<ElemType, N, 0>(beta, pointers, alpha, op, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
default:
LogicError("TensorOp: %d non-flattened input dimensions are not supported.", (C_int) dims);
}
@ -906,27 +967,27 @@ void TensorOpN(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, Elem
// explicit instantiations--these are being called from GPUMatrix.cu
//------------------------------------------------------------------------
template void TensorOpN<float, 2>(float beta, array<float*, 2> pointers, float alpha, ElementWiseOperator op,
template void TensorOpN<float, 2>(float beta, array<float*, 2> pointers, float alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides);
template void TensorOpN<float, 3>(float beta, array<float*, 3> pointers, float alpha, ElementWiseOperator op,
template void TensorOpN<float, 3>(float beta, array<float*, 3> pointers, float alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 3>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 3>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 3>& reducingStrides);
template void TensorOpN<float, 4>(float beta, array<float*, 4> pointers, float alpha, ElementWiseOperator op,
template void TensorOpN<float, 4>(float beta, array<float*, 4> pointers, float alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 4>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 4>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 4>& reducingStrides);
template void TensorOpN<double, 2>(double beta, array<double*, 2> pointers, double alpha, ElementWiseOperator op,
template void TensorOpN<double, 2>(double beta, array<double*, 2> pointers, double alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides);
template void TensorOpN<double, 3>(double beta, array<double*, 3> pointers, double alpha, ElementWiseOperator op,
template void TensorOpN<double, 3>(double beta, array<double*, 3> pointers, double alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 3>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 3>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 3>& reducingStrides);
template void TensorOpN<double, 4>(double beta, array<double*, 4> pointers, double alpha, ElementWiseOperator op,
template void TensorOpN<double, 4>(double beta, array<double*, 4> pointers, double alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 4>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 4>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 4>& reducingStrides);


@ -18,11 +18,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
#define C_unsigned_int CUDA_LONG
template <class ElemType, C_size_t N>
void TensorOpN(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, ElementWiseOperator op,
void TensorOpN(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, N>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, N>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides);
template <class ElemType>
void LaunchUnaryTensorOp(ElemType beta, const ElemType* pa, ElemType* pb, ElemType alpha, ElementWiseOperator op, size_t regularOpDim);
} } }
}}}


@ -175,6 +175,7 @@
<ClInclude Include="RNGHandle.h" />
<ClInclude Include="TensorOps.h" />
<ClInclude Include="TensorView.h" />
<ClInclude Include="Quantizers.h" />
<None Include="GPUWatcher.cu" />
<None Include="GPUWatcher.h">
<FileType>CppHeader</FileType>


@ -123,6 +123,7 @@
<ClInclude Include="BlockMultiplierPlatform.h">
<Filter>CPU</Filter>
</ClInclude>
<ClInclude Include="Quantizers.h" />
</ItemGroup>
<ItemGroup>
<None Include="GPUMatrix.h">


@ -1139,7 +1139,12 @@ template <>
/*static*/ char Matrix<char>::MakeNan(size_t)
{
return 0;
} // (needed for completeness)
} // (needed for completeness and to pass unit tests)
template <>
/*static*/ short Matrix<short>::MakeNan(size_t)
{
return 0;
} // (needed for completeness and to pass unit tests)
template <class ElemType>
void Matrix<ElemType>::MaskColumnsValue(const Matrix<char>& columnsMask, ElemType val)
@ -4289,7 +4294,8 @@ void Matrix<ElemType>::BatchNormalizationForward(const Matrix<ElemType>& scale,
}
template <class ElemType>
void Matrix<ElemType>::BatchNormalizationBackward(const Matrix<ElemType>& in, Matrix<ElemType>& grad, const Matrix<ElemType>& scale, const Matrix<ElemType>& saveMean, const Matrix<ElemType>& saveInvStdDev,
void Matrix<ElemType>::BatchNormalizationBackward(const Matrix<ElemType>& in, Matrix<ElemType>& grad, const Matrix<ElemType>& scale, double blendFactor,
const Matrix<ElemType>& saveMean, const Matrix<ElemType>& saveInvStdDev,
Matrix<ElemType>& scaleGrad, Matrix<ElemType>& biasGrad) const
{
DecideAndMoveToRightDevice(*this, grad);
@ -4297,10 +4303,10 @@ void Matrix<ElemType>::BatchNormalizationBackward(const Matrix<ElemType>& in, Ma
// REVIEW alexeyk: add sparse version.
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->BatchNormalizationBackward(*(in.m_CPUMatrix), *(grad.m_CPUMatrix), *(scale.m_CPUMatrix),
m_CPUMatrix->BatchNormalizationBackward(*(in.m_CPUMatrix), *(grad.m_CPUMatrix), *(scale.m_CPUMatrix), blendFactor,
*(saveMean.m_CPUMatrix), *(saveInvStdDev.m_CPUMatrix),
*(scaleGrad.m_CPUMatrix), *(biasGrad.m_CPUMatrix)),
m_GPUMatrix->BatchNormalizationBackward(*(in.m_GPUMatrix), *(grad.m_GPUMatrix), *(scale.m_GPUMatrix),
m_GPUMatrix->BatchNormalizationBackward(*(in.m_GPUMatrix), *(grad.m_GPUMatrix), *(scale.m_GPUMatrix), blendFactor,
*(saveMean.m_GPUMatrix), *(saveInvStdDev.m_GPUMatrix),
*(scaleGrad.m_GPUMatrix), *(biasGrad.m_GPUMatrix)),
NOT_IMPLEMENTED,
@ -5401,6 +5407,7 @@ void Matrix<ElemType>::TensorOp(ElemType beta, const Matrix<ElemType>& a, const
NOT_IMPLEMENTED);
}
//template class Matrix<short>;
template class Matrix<float>;
template class Matrix<double>;
@ -5430,6 +5437,31 @@ template void Matrix<char>::Resize(const size_t numRows, const size_t numCols, c
template void Matrix<char>::Reshape(const size_t, const size_t);
template char* Matrix<char>::CopyToArray(void) const;
// Matrix<short> methods
template Matrix<short>::Matrix(DEVICEID_TYPE);
template Matrix<short>::Matrix(Matrix<short>&&);
template Matrix<short>::Matrix(const size_t numRows, const size_t numCols, DEVICEID_TYPE deviceId, const MatrixType matrixType, const MatrixFormat matrixFormat);
template Matrix<short>::Matrix(const size_t numRows, const size_t numCols, short* pArray, DEVICEID_TYPE deviceId, const size_t matrixFlags, const size_t nnz);
template Matrix<short>::~Matrix();
template Matrix<short>& Matrix<short>::operator=(Matrix<short>&& moveFrom);
template short* Matrix<short>::Data() const;
template int Matrix<short>::GetDeviceId() const;
template size_t Matrix<short>::GetNumElements() const;
template Matrix<short> Matrix<short>::ColumnSlice(size_t startColumn, size_t numCols) const;
template void Matrix<short>::_transferToDevice(int id_to, bool isBeingMoved, bool emptyTransfer) const;
template void Matrix<short>::TransferToDeviceIfNotThere(int id_to, bool isBeingMoved, bool emptyTransfer, bool updatePreferredDevice) const;
template size_t Matrix<short>::GetNumRows() const;
template size_t Matrix<short>::GetNumCols() const;
template void Matrix<short>::SetValue(const short);
template void Matrix<short>::SetValue(size_t numRows, const size_t numCols, int deviceId, short* pArray, size_t matrixFlags);
//template void Matrix<short>::SetValue(const Matrix<short>&, MatrixFormat);
template void Matrix<short>::SetValue(const Matrix<short>&);
template void Matrix<short>::AssignValuesOf(const Matrix<short>&);
template bool Matrix<short>::IsEmpty() const;
template void Matrix<short>::Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve, bool growOnly);
template void Matrix<short>::Reshape(const size_t, const size_t);
template short* Matrix<short>::CopyToArray(void) const;
template Matrix<int>::Matrix(const size_t, const size_t, int*, DEVICEID_TYPE, const size_t, const size_t);
}}}


@ -503,7 +503,7 @@ public:
void BatchNormalizationForward(const Matrix<ElemType>& scale, const Matrix<ElemType>& bias, double expAvgFactor, double blendFactor,
Matrix<ElemType>& runMean, Matrix<ElemType>& runInvStdDev, Matrix<ElemType>& out, double epsilon,
Matrix<ElemType>& saveMean, Matrix<ElemType>& saveInvStdDev) const;
void BatchNormalizationBackward(const Matrix<ElemType>& in, Matrix<ElemType>& grad, const Matrix<ElemType>& scale, const Matrix<ElemType>& saveMean, const Matrix<ElemType>& saveInvStdDev,
void BatchNormalizationBackward(const Matrix<ElemType>& in, Matrix<ElemType>& grad, const Matrix<ElemType>& scale, double blendFactor, const Matrix<ElemType>& saveMean, const Matrix<ElemType>& saveInvStdDev,
Matrix<ElemType>& scaleGrad, Matrix<ElemType>& biasGrad) const;
public:


@ -708,6 +708,7 @@ void GPUSparseMatrix<ElemType>::ConvertBuffer(OutType* outBuffer, const InType*
#pragma endregion Helper Functions
template class MATH_API GPUSparseMatrix<short>;
template class MATH_API GPUSparseMatrix<char>;
template class MATH_API GPUSparseMatrix<float>;
template class MATH_API GPUSparseMatrix<double>;
@ -1832,7 +1833,7 @@ void GPUMatrix<ElemType>::BatchNormalizationForward(const GPUMatrix<ElemType>& s
}
template <class ElemType>
void GPUMatrix<ElemType>::BatchNormalizationBackward(const GPUMatrix<ElemType>& in, GPUMatrix<ElemType>& grad, const GPUMatrix<ElemType>& scale,
void GPUMatrix<ElemType>::BatchNormalizationBackward(const GPUMatrix<ElemType>& in, GPUMatrix<ElemType>& grad, const GPUMatrix<ElemType>& scale, double blendFactor,
const GPUMatrix<ElemType>& saveMean, const GPUMatrix<ElemType>& saveInvStdDev,
GPUMatrix<ElemType>& scaleGrad, GPUMatrix<ElemType>& biasGrad) const
{
@ -2216,6 +2217,7 @@ GPURNGHandle::GPURNGHandle(int deviceId, unsigned long seed)
#pragma endregion GPURNGHandle functions
template class GPUMatrix<short>;
template class GPUMatrix<char>;
template class GPUMatrix<float>;
template class GPUMatrix<double>;
@ -2276,6 +2278,9 @@ float CudaTimer::Elapsed()
return 0;
}
/*static*/ void SyncGuard::EnableSync()
{
}
} } }
// define a dummy GPUWatcher class too

Source/Math/Quantizers.h (new file, 106 lines)

@ -0,0 +1,106 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#include "Basics.h"
namespace Microsoft { namespace MSR { namespace CNTK {
// RawType - input type to the quantizer. Currently CNTK supports float or double as RawType.
// QuantizedType - output type of the quantizer
template <class RawType, class QuantizedType>
class QuantizerBase
{
public:
QuantizerBase()
{
rangeMax = std::numeric_limits<QuantizedType>::max();
}
virtual void Quantize(const ArrayRef<RawType>& input, ArrayRef<QuantizedType>& output) = 0;
virtual void Dequantize(const ArrayRef<QuantizedType>& input, ArrayRef<RawType>& output) = 0;
protected:
QuantizedType rangeMax;
};
// Symmetric quantizer.
// Quantization is achieved by
// 1. Finding the absolute max of values to be quantized.
// 2. Adjusting the absolute max with the extraBits parameter.
// 3. Scaling all values in the collection to fit within the symmetric range of the QuantizedType.
template <class RawType, class QuantizedType>
class SymmetricQuantizer : public QuantizerBase<RawType, QuantizedType>
{
RawType m_quantizeFactor;
RawType m_inverseQuantizerFactor;
RawType m_absMax;
public:
// elements - collection to be quantized
// extraBits decreases the quantization normalizer to prevent integer overflow during BLAS routines.
// Higher extraBits will decrease precision of quantization, but will make BLAS routines less prone to overflow.
// For quantization with shorts, recommended value of extraBits is 1-3.
// This constructor accepts a collection of RawType values to initialize the internal quantizer,
// which can then be applied to collections with a range similar to the one it was initialized with.
SymmetricQuantizer(const ArrayRef<RawType>& input, size_t extraBits)
{
m_absMax = FindAbsMax(input);
Initialize(m_absMax, extraBits);
}
// absoluteMax - the range of the quantizer (normally represents maximum absolute value of the values in the collection to be quantized).
// extraBits - see comment in another ctor
SymmetricQuantizer(RawType absoluteMax, size_t extraBits)
{
Initialize(absoluteMax, extraBits);
}
// Perform quantization of the input collection, put result into pre-allocated output collection
virtual void Quantize(const ArrayRef<RawType>& input, ArrayRef<QuantizedType>& output)
{
assert(input.size() == output.size());
for (size_t i = 0; i < input.size(); i++)
{
#ifdef _DEBUG
assert(abs(input[i]) <= m_absMax);
#endif
output[i] = (QuantizedType) round((input[i] * m_quantizeFactor));
}
}
// Accept quantized collection as input, put de-quantization result into pre-allocated output collection.
virtual void Dequantize(const ArrayRef<QuantizedType>& input, ArrayRef<RawType>& output)
{
assert(input.size() == output.size());
for (size_t i = 0; i < input.size(); i++)
{
output[i] = (RawType)(input[i] * m_inverseQuantizerFactor);
}
}
private:
// Find absolute maximum value
RawType FindAbsMax(const ArrayRef<RawType>& arrayRef)
{
RawType maxElem = *std::max_element(arrayRef.begin(), arrayRef.end());
RawType minElem = *std::min_element(arrayRef.begin(), arrayRef.end());
return std::max(maxElem, std::abs(minElem));
}
void Initialize(RawType absoluteMax, size_t extraBits)
{
RawType shiftedMax = absoluteMax * (1 << extraBits);
if (shiftedMax == 0)
{
LogicError("The absolute max element in the sequence to be quantized is 0.");
}
m_absMax = absoluteMax;
m_quantizeFactor = rangeMax / shiftedMax;
m_inverseQuantizerFactor = 1 / m_quantizeFactor;
}
};
}}}
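A worked example of the scaling performed by SymmetricQuantizer, assuming QuantizedType = short (rangeMax = 32767), an absolute max of 2.0, and extraBits = 1: shiftedMax = 4.0, m_quantizeFactor = 32767 / 4 = 8191.75, so 1.5f maps to round(1.5 * 8191.75) = 12288 and dequantizes back to 12288 / 8191.75 ≈ 1.50005. Below is a usage sketch under the assumption that ArrayRef<T> can wrap a raw pointer and a length; check the actual ArrayRef API before relying on this.

#include <vector>
#include "Quantizers.h"

using namespace Microsoft::MSR::CNTK;

// Sketch only; assumes ArrayRef<T> is constructible from (T*, size_t).
void QuantizerUsageSketch()
{
    std::vector<float> raw = { -1.0f, 0.25f, 1.5f, 2.0f };
    std::vector<short> quantized(raw.size());
    std::vector<float> restored(raw.size());
    ArrayRef<float> rawRef(raw.data(), raw.size());
    ArrayRef<short> quantizedRef(quantized.data(), quantized.size());
    ArrayRef<float> restoredRef(restored.data(), restored.size());

    SymmetricQuantizer<float, short> quantizer(rawRef, /*extraBits=*/1); // absMax = 2.0, factor = 32767 / 4
    quantizer.Quantize(rawRef, quantizedRef);        // quantized == { -8192, 2048, 12288, 16384 }
    quantizer.Dequantize(quantizedRef, restoredRef); // restored  ≈ { -1.00003, 0.25001, 1.50005, 2.00006 }
}

With extraBits = 1 the largest magnitude maps to roughly half of the short range (16384 of 32767), leaving headroom so that accumulations in int16 BLAS routines are less likely to overflow, at the cost of one bit of precision.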


@ -261,6 +261,8 @@ DefTernaryOp(Cond, a ? b : c);
DefTernaryOp(CopyIfEqual, a == b ? c : 0); // CopyIfEqual(a,b)(c) -- if a==b copy c, otherwise 0; used for gradient of clip, min, max, etc.
DefTernaryOp(Clip, c < a ? a : (c > b ? b : c)); // Clip(min,max)(data) => a=min, b=max, c=data
DefTernaryOp(ElementwiseProductWithLogSumDerivative, a * Sigmoid(c - b));
DefTernaryOp(ElementwiseProductWithExpOfDiff, a * exp_(b - c));
#pragma pop_macro("DefTernaryOp")
}}}


@ -14,6 +14,10 @@
#pragma warning(push)
#pragma warning(disable : 4251) // needs to have dll-interface to be used by clients of... caused by TensorView::m_shape which is only private. We use the same compiler everywhere.
namespace Microsoft { namespace MSR { namespace CNTK { namespace Test {
template <class ElemType> struct TensorTest;
}}}}
// This class is exported from the Math.dll.
namespace Microsoft { namespace MSR { namespace CNTK {
@ -149,6 +153,7 @@ private:
const Matrix<ElemType>& GetSOB() const { return *m_sob; }
Matrix<ElemType>& GetSOB() { return *m_sob; }
friend Test::TensorTest<ElemType>;
// -------------------------------------------------------------------
// sob members


@ -46,7 +46,7 @@ CNTKTextFormatReader::CNTKTextFormatReader(MemoryProviderPtr provider,
{
// Verbosity is a general config parameter, not specific to the text format reader.
int verbosity = config(L"verbosity", 0);
m_randomizer = make_shared<BlockRandomizer>(verbosity, window, m_deserializer);
m_randomizer = make_shared<BlockRandomizer>(verbosity, window, m_deserializer, true);
}
else
{


@ -100,7 +100,7 @@ CompositeDataReader::CompositeDataReader(const ConfigParameters& config, MemoryP
size_t randomizationWindow = config(L"randomizationWindow", requestDataSize);
// By default using STL random number generator.
bool useLegacyRandomization = config(L"useLegacyRandomization", false);
m_sequenceEnumerator = std::make_shared<BlockRandomizer>(verbosity, randomizationWindow, deserializer, BlockRandomizer::DecimationMode::chunk, useLegacyRandomization, multiThreadedDeserialization);
m_sequenceEnumerator = std::make_shared<BlockRandomizer>(verbosity, randomizationWindow, deserializer, true /* should Prefetch */, BlockRandomizer::DecimationMode::chunk, useLegacyRandomization, multiThreadedDeserialization);
}
else
{
@ -251,7 +251,7 @@ void CompositeDataReader::StartEpoch(const EpochConfiguration& cfg)
if (config.m_totalEpochSizeInSamples <= 0)
{
RuntimeError("Unsupported minibatch size '%d'.", (int)config.m_totalEpochSizeInSamples);
RuntimeError("Unsupported epoch size '%d'.", (int)config.m_totalEpochSizeInSamples);
}
m_sequenceEnumerator->StartEpoch(config);

Some files were not shown because too many files changed in this diff.