Merge branch 'master' into qiwye/multiverso

Conflicts:
	Source/ComputationNetworkLib/ComputationNetworkLib.vcxproj
	Source/SGDLib/SGDLib.vcxproj
	Source/SGDLib/SGDLib.vcxproj.filters
This commit is contained in:
Qiwei Ye 2016-01-07 16:57:09 +08:00
Parent 641c75a751 40ce1afa4b
Commit 6c2ee1aa51
63 changed files with 3457 additions and 2088 deletions

View file

@ -240,6 +240,7 @@ MATH_SRC =\
ifdef CUDA_PATH
MATH_SRC +=\
$(SOURCEDIR)/Math/GPUMatrix.cu \
$(SOURCEDIR)/Math/GPUTensor.cu \
$(SOURCEDIR)/Math/GPUSparseMatrix.cu \
$(SOURCEDIR)/Math/GPUWatcher.cu \
$(SOURCEDIR)/Math/MatrixQuantizerGPU.cu \

View file

@ -35,27 +35,32 @@ using namespace std;
;
wstring computationNodes = // TODO: use actual TypeName() here? would first need to make it a wide string; we should also extract those two methods into the base macro
L"LearnableParameter(rows, cols, needGradient = true, init = 'uniform'/*|fixedValue|gaussian|fromFile*/, initValueScale = 1, value = 0, initFromFilePath = '', initOnCPUOnly=true, randomSeed=-1, tag='') = new ComputationNode [ operation = 'LearnableParameter' /*plus the function args*/ ]\n"
L"LearnableParameter(rows, cols, needGradient = true, init = 'uniform'/*|fixedValue|gaussian|fromFile*/, initValueScale = 1, value = 0, initFromFilePath = '', initOnCPUOnly=true, randomSeed=-1, tag='') = new ComputationNode [ operation = 'LearnableParameter' ; shape = new TensorShape [ dims = (rows : cols) ] /*plus the function args*/ ]\n"
L"Parameter = LearnableParameter // deprecated \n"
L"ParameterTensor(dims, needGradient = true, init = 'uniform'/*|fixedValue|gaussian|fromFile*/, initValueScale = 1, value = 0, initFromFilePath = '', initOnCPUOnly=true, randomSeed=-1, tag='') = new ComputationNode [ operation = 'LearnableParameter' ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ]\n"
// ^^ already works; vv untested
L"Input(rows, cols, tag='feature') = new ComputationNode [ operation = 'InputValue' ; isImage = false /*plus the function args*/ ]\n" // note: naming a little inconsistent // TODO: re-test after flag change
L"SparseInput(rows, cols, tag='feature') = new ComputationNode [ operation = 'SparseInputValue' ; isImage = false /*plus the function args*/ ]\n"
L"ImageInput(imageWidth, imageHeight, imageChannels, numImages, tag='feature') = new ComputationNode [ operation = 'InputValue' ; isImage = true /*plus the function args*/ ]\n"
L"SparseImageInput(imageWidth, imageHeight, imageChannels, numImages, tag='feature') = new ComputationNode [ operation = 'SparseInputValue' ; isImage = true /*plus the function args*/ ]\n"
L"Input(dims, tag='feature') = new ComputationNode [ operation = 'InputValue' ; shape = new TensorShape [ /*dims*/ ] ; isImage = false /*plus the function args*/ ]\n" // note: naming a little inconsistent // TODO: re-test after flag change
L"SparseInput(dims, tag='feature') = new ComputationNode [ operation = 'SparseInputValue' ; shape = new TensorShape [ /*dims*/ ] ; isImage = false /*plus the function args*/ ]\n"
L"ImageInput(imageWidth, imageHeight, imageChannels, imageLayout='CHW', tag='feature') = new ComputationNode [ operation = 'InputValue' ; isImage = true /*plus the function args*/ ]\n"
L"SparseImageInput(imageWidth, imageHeight, imageChannels, imageLayout='CHW', tag='feature') = new ComputationNode [ operation = 'SparseInputValue' ; isImage = true /*plus the function args*/ ]\n"
L"Constant(val, rows = 1, cols = 1, tag='') = Parameter(rows, cols, needGradient = false, init = 'fixedValue', value = val) \n"
L"PastValue(rows, cols, input, timeStep = 1, defaultHiddenActivation = 0.1, tag='') = new ComputationNode [ operation = 'PastValue' ; inputs = input /*plus the function args*/ ]\n"
L"FutureValue(rows, cols, input, timeStep = 1, defaultHiddenActivation = 0.1, tag='') = new ComputationNode [ operation = 'FutureValue' ; inputs = input /*plus the function args*/ ]\n"
L"PastValue(dims, input, timeStep = 1, defaultHiddenActivation = 0.1, tag='') = new ComputationNode [ operation = 'PastValue' ; inputs = input ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ]\n"
L"FutureValue(dims, input, timeStep = 1, defaultHiddenActivation = 0.1, tag='') = new ComputationNode [ operation = 'FutureValue' ; inputs = input ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ]\n"
// TODO: ^^ DelayedValues no longer need to know their dimension. That is inferred in Validation.
L"RowSlice(startIndex, numRows, input, needGradient = false, tag='') = new ComputationNode [ operation = 'RowSlice' ; inputs = input /*plus the function args*/ ]\n"
L"RowRepeat(input, numRepeats, needGradient = false, tag='') = new ComputationNode [ operation = 'RowRepeat' ; inputs = input /*plus the function args*/ ]\n"
L"RowStack(inputs, tag='') = new ComputationNode [ operation = 'RowStack' /*plus the function args*/ ]\n"
L"Reshape(input, numRows, imageWidth = 0, imageHeight = 0, imageChannels = 0, tag='') = new ComputationNode [ operation = 'Reshape' ; inputs = input /*plus the function args*/ ]\n"
L"Reshape(input, numRows, imageWidth = 0, imageHeight = 0, imageChannels = 0, tag='') = new ComputationNode [ operation = 'DeprecatedReshape' ; inputs = input /*plus the function args*/ ]\n"
L"NewReshape(input, dims, beginDim=0, endDim=0, tag='') = new ComputationNode [ operation = 'Reshape' ; inputs = input ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ]\n"
L"ReshapeDimension(x, dim, tensorShape) = NewReshape(x, tensorShape, beginDim=dim, endDim=dim + 1) \n"
L"FlattenDimensions(x, dim, num) = NewReshape(x, 0, beginDim=dim, endDim=dim + num) \n"
L"SplitDimension(x, dim, N) = ReshapeDimension(x, dim, 0:N) \n"
L"Logistic(label, probability, tag='') = new ComputationNode [ operation = 'Logistic' ; inputs = (label : probability) /*plus the function args*/ ]\n"
L"WeightedLogistic(label, probability, instanceWeight, tag='') = new ComputationNode [ operation = 'Logistic' ; inputs = (label : probability : instanceWeight) /*plus the function args*/ ]\n"
L"ReconcileMBLayout(dataInput, layoutInput, tag='') = new ComputationNode [ operation = 'ReconcileMBLayout' ; inputs = (dataInput : layoutInput) /*plus the function args*/ ]\n"
L"Convolution(weightNode, inputValueNode, kernelWidth, kernelHeight, outputChannels, horizontalSubsample, verticalSubsample, zeroPadding = false, maxTempMemSizeInSamples = 0, tag='') = new ComputationNode [ operation = 'Convolution' ; inputs = (weightNode : inputValueNode) /*plus the function args*/ ]\n"
L"MaxPooling(input, windowWidth, windowHeight, horizontalSubsample, verticalSubsample, tag='') = new ComputationNode [ operation = 'MaxPooling' ; inputs = input /*plus the function args*/ ]\n"
L"AveragePooling(input, windowWidth, windowHeight, horizontalSubsample, verticalSubsample, tag='') = new ComputationNode [ operation = 'AveragePoolingNode' ; inputs = input /*plus the function args*/ ]\n"
L"Convolution(weightNode, inputValueNode, kernelWidth, kernelHeight, outputChannels, horizontalSubsample, verticalSubsample, zeroPadding = false, maxTempMemSizeInSamples = 0, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'Convolution' ; inputs = (weightNode : inputValueNode) /*plus the function args*/ ]\n"
L"MaxPooling(input, windowWidth, windowHeight, horizontalSubsample, verticalSubsample, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'MaxPooling' ; inputs = input /*plus the function args*/ ]\n"
L"AveragePooling(input, windowWidth, windowHeight, horizontalSubsample, verticalSubsample, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'AveragePooling' ; inputs = input /*plus the function args*/ ]\n"
// TODO: define DelayedValue, with negative delay for future; cannot do this yet, need to be able to say something like delay = -(^.delay)
// aliases
L"ColumnwiseCrossProduct = KhatriRaoProduct // deprecated \n" // TODO: should it be deprecated? It is described as easier to understand in the CNTKBook.

View file

@ -903,12 +903,12 @@ void DoTrain(const ConfigRecordType & config)
};
}
// legacy test mode for BrainScript. Will go away once we fully integrate with BS.
else if (config.Exists(L"ExperimentalNetworkBuilder"))
else if (config.Exists(L"BrainScriptNetworkBuilder") || config.Exists(L"ExperimentalNetworkBuilder"/*legacy*/))
{
// We interface with outer old CNTK config by taking the inner part, which we get as a string, as BrainScript.
// We prepend a few standard definitions, and also definition of deviceId and precision, which all objects will pull out again when they are being constructed.
// BUGBUG: We are not getting TextLocations right in this way! Do we need to inject location markers into the source? Moot once we fully switch to BS
wstring sourceCode = config(L"ExperimentalNetworkBuilder");
wstring sourceCode = config.Exists(L"BrainScriptNetworkBuilder") ? config(L"BrainScriptNetworkBuilder") : config(L"ExperimentalNetworkBuilder");
let expr = BS::ParseConfigDictFromString(standardFunctions + computationNodes + commonMacros
+ msra::strfun::wstrprintf(L"deviceId = %d ; precision = '%ls' ; network = new ComputationNetwork ", (int)deviceId, ElemTypeName<ElemType>()) // TODO: check if typeid needs postprocessing
+ sourceCode, vector<wstring>()); // source code has the form [ ... ]
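Given the check above, a plausible shape for the outer CNTK config that this code path consumes is sketched below; only the key name BrainScriptNetworkBuilder (and its legacy alias ExperimentalNetworkBuilder) comes from this change, the rest of the fragment is assumed:

    # hypothetical config fragment (assumed layout)
    train = [
        action = "train"
        BrainScriptNetworkBuilder = [
            # the bracketed inner part is read as a string and parsed as BrainScript;
            # DoTrain prepends the standard functions plus deviceId and precision definitions
            ...model definition in BrainScript...
        ]
    ]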

View file

@ -158,7 +158,7 @@
<ClInclude Include="..\Common\Include\Basics.h" />
<ClInclude Include="..\Common\Include\BestGpu.h" />
<ClInclude Include="..\Common\Include\DataReader.h" />
<ClInclude Include="..\Common\Include\DataTensor.h" />
<ClInclude Include="..\Common\Include\TensorShape.h" />
<ClInclude Include="..\Common\Include\DataWriter.h" />
<ClInclude Include="..\Common\Include\File.h" />
<ClInclude Include="..\Common\Include\fileutil.h" />

View file

@ -133,7 +133,7 @@
<ClInclude Include="..\Common\Include\Sequences.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="..\Common\Include\DataTensor.h">
<ClInclude Include="..\Common\Include\TensorShape.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="..\Common\Include\ProgressTracing.h">

View file

@ -154,6 +154,8 @@ bool CheckFunction(std::string& p_nodeType, bool* allowUndeterminedVariable)
ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(LearnableParameter), L"Parameter"))
ret = true;
else if (EqualInsensitive(nodeType, L"ImageParameter"))
ret = true;
//else if (EqualInsensitive(nodeType, OperationNameOf(SparseLearnableParameter), L"SparseParameter"))
// ret = true;
else if (EqualInsensitive(nodeType, L"Constant", L"Const"))

View file

@ -30,29 +30,29 @@ namespace Microsoft { namespace MSR { namespace CNTK {
case SIMPLENET:
net = BuildSimpleDNN(); break;
case SIMPLERNN:
net = BuildSimpleRNN(1); break;
net = BuildSimpleRNN(); break;
case LSTM:
net = BuildLSTMNetworkFromDescription(1); break;
net = BuildLSTMNetworkFromDescription(); break;
case CLASSLSTM:
net = BuildCLASSLSTMNetworkFromDescription(1); break;
net = BuildCLASSLSTMNetworkFromDescription(); break;
case NCELSTM:
net = BuildNCELSTMNetworkFromDescription(1); break;
net = BuildNCELSTMNetworkFromDescription(); break;
case CLASSLM:
net = BuildClassEntropyNetwork(1); break;
net = BuildClassEntropyNetwork(); break;
case LBLM:
net = BuildLogBilinearNetworkFromDescription(1); break;
net = BuildLogBilinearNetworkFromDescription(); break;
case NPLM:
net = BuildNeuralProbNetworkFromDescription(1); break;
net = BuildNeuralProbNetworkFromDescription(); break;
case CLSTM:
net = BuildConditionalLSTMNetworkFromDescription(1); break;
net = BuildConditionalLSTMNetworkFromDescription(); break;
case RCRF:
net = BuildSeqTrnLSTMNetworkFromDescription(1); break;
net = BuildSeqTrnLSTMNetworkFromDescription(); break;
case LSTMENCODER:
net = BuildLSTMEncoderNetworkFromDescription(1); break;
net = BuildLSTMEncoderNetworkFromDescription(); break;
case UNIDIRECTIONALLSTM:
net = BuildUnidirectionalLSTMNetworksFromDescription(1); break;
net = BuildUnidirectionalLSTMNetworksFromDescription(); break;
case BIDIRECTIONALLSTM:
net = BuildBiDirectionalLSTMNetworksFromDescription(1); break;
net = BuildBiDirectionalLSTMNetworksFromDescription(); break;
default:
LogicError("BuildNetworkFromDescription: invalid m_rnnType %d", (int)m_rnnType);
}
@ -75,11 +75,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
switch (m_rnnType)
{
case ALIGNMENTSIMILARITYGENERATOR:
net = BuildAlignmentDecoderNetworkFromDescription(encoderNet, 1);
net = BuildAlignmentDecoderNetworkFromDescription(encoderNet);
net->CompileNetwork();
return net;
case ALIGNMENTSIMILARITYGFORWARDDECODER:
net = BuildAlignmentForwardDecoderNetworkFromDescription(encoderNet, 1);
net = BuildAlignmentForwardDecoderNetworkFromDescription(encoderNet);
net->CompileNetwork();
return net;
}
@ -95,12 +95,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
unsigned long randomSeed = 1;
size_t mbSize = 3; //this is not the actual minibatch size. only used in the validataion process
size_t numHiddenLayers = m_layerSizes.size() - 2;
ComputationNodePtr input, w, b, output, label, prior, scaledLogLikelihood;
input = builder.Input(m_layerSizes[0], mbSize, L"features");
input = builder.CreateInputNode(L"features", m_layerSizes[0]);
m_net->FeatureNodes().push_back(input);
if (m_applyMeanVarNorm)
@ -114,9 +112,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (numHiddenLayers > 0)
{
w = builder.Parameter(m_layerSizes[1], m_layerSizes[0], L"W0");
w = builder.CreateLearnableParameter(L"W0", m_layerSizes[1], m_layerSizes[0]);
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
b = builder.Parameter(m_layerSizes[1], 1, L"B0");
b = builder.CreateLearnableParameter(L"B0", m_layerSizes[1], 1);
output = ApplyNonlinearFunction(builder.Plus(builder.Times(w, input, L"W0*features"), b, L"W0*features+B0"), 0, L"H1");
if (m_addDropoutNodes)
@ -133,9 +131,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
wstring nameOfPlus = nameOfTimes + L"+" + nameOfB;
wstring nameOfH = msra::strfun::wstrprintf(L"H%d", i + 1);
w = builder.Parameter(m_layerSizes[i + 1], m_layerSizes[i], nameOfW);
w = builder.CreateLearnableParameter(nameOfW, m_layerSizes[i + 1], m_layerSizes[i]);
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
b = builder.Parameter(m_layerSizes[i + 1], 1, nameOfB);
b = builder.CreateLearnableParameter(nameOfB, m_layerSizes[i + 1], 1);
output = ApplyNonlinearFunction(builder.Plus(builder.Times(w, input, nameOfTimes), b, nameOfPlus), i, nameOfH);
if (m_addDropoutNodes)
@ -151,13 +149,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
wstring nameOfTimes = nameOfW + L"*" + nameOfPrevH;
wstring nameOfPlus = nameOfTimes + L"+" + nameOfB;
w = builder.Parameter(m_layerSizes[numHiddenLayers + 1], m_layerSizes[numHiddenLayers], nameOfW);
w = builder.CreateLearnableParameter(nameOfW, m_layerSizes[numHiddenLayers + 1], m_layerSizes[numHiddenLayers]);
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
b = builder.Parameter(m_layerSizes[numHiddenLayers + 1], 1, nameOfB);
b = builder.CreateLearnableParameter(nameOfB, m_layerSizes[numHiddenLayers + 1], 1);
output = builder.Plus(builder.Times(w, input, nameOfTimes), b, nameOfPlus);
m_net->RenameNode(output, L"HLast");
label = builder.Input(m_layerSizes[numHiddenLayers + 1], mbSize, L"labels");
label = builder.CreateInputNode(L"labels", m_layerSizes[numHiddenLayers + 1]);
AddTrainAndEvalCriterionNodes(output, label);
@ -188,7 +186,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// Note: while ComputationNode and CompuationNetwork are (supposed to be) independent of ElemType, it is OK to keep this class dependent.
template<class ElemType>
ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildSimpleRNN(size_t mbSize)
ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildSimpleRNN()
{
ComputationNetworkBuilder<ElemType> builder(*m_net);
if (m_net->GetTotalNumberOfNodes() < 1) //not built yet
@ -201,7 +199,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
ComputationNodePtr input, w, b, u, pastValue, output, label, prior;
input = builder.CreateSparseInputNode(L"features", m_layerSizes[0], mbSize);
input = builder.CreateSparseInputNode(L"features", m_layerSizes[0]);
m_net->FeatureNodes().push_back(input);
if (m_applyMeanVarNorm)
@ -225,7 +223,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
w = builder.CreateLearnableParameter(L"W0", m_layerSizes[1], m_layerSizes[1]);
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[1], mbSize, 1);
pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[1], 1);
/// unless there is a good algorithm to detect loops, use this explicit setup
output = ApplyNonlinearFunction(
builder.Plus(
@ -255,7 +253,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
w = builder.CreateLearnableParameter(msra::strfun::wstrprintf (L"W%d", i), m_layerSizes[i+1], m_layerSizes[i+1]);
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, (size_t)m_layerSizes[i+1], mbSize, 1);
pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, (size_t)m_layerSizes[i+1], 1);
/// unless there is a good algorithm to detect loops, use this explicit setup
output = ApplyNonlinearFunction(
builder.Plus(
@ -279,7 +277,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
/*m_net->MatrixL2Reg(w , L"L1w");*/
label = builder.CreateInputNode(L"labels", m_layerSizes[numHiddenLayers+1], mbSize);
label = builder.CreateInputNode(L"labels", m_layerSizes[numHiddenLayers+1]);
AddTrainAndEvalCriterionNodes(input, label, w, L"criterion", L"eval");
output = builder.Times(w, input, L"outputs");
@ -294,7 +292,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
template<class ElemType>
ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildClassEntropyNetwork(size_t mbSize)
ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildClassEntropyNetwork()
{
ComputationNetworkBuilder<ElemType> builder(*m_net);
@ -312,7 +310,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (m_vocabSize != m_layerSizes[numHiddenLayers + 1])
RuntimeError("BuildClassEntropyNetwork : vocabulary size should be the same as the output layer size");
input = builder.CreateSparseInputNode(L"features", m_layerSizes[0], mbSize);
input = builder.CreateSparseInputNode(L"features", m_layerSizes[0]);
m_net->FeatureNodes().push_back(input);
if (m_applyMeanVarNorm)
@ -335,7 +333,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
w = builder.CreateLearnableParameter(L"W0", m_layerSizes[1], m_layerSizes[1]);
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[1], mbSize, 1);
pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[1], 1);
/// unless there is a good algorithm to detect loops, use this explicit setup
output = ApplyNonlinearFunction(
builder.Plus(
@ -364,7 +362,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
w = builder.CreateLearnableParameter(msra::strfun::wstrprintf (L"W%d", i), m_layerSizes[i+1], m_layerSizes[i+1]);
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, (size_t)m_layerSizes[i+1], mbSize, 1);
pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, (size_t)m_layerSizes[i+1], 1);
/// unless there is a good algorithm to detect loops, use this explicit setup
output = ApplyNonlinearFunction(
builder.Plus(
@ -391,7 +389,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
/// the label is a dense matrix. each element is the word index
label = builder.CreateInputNode(L"labels", 4, mbSize);
label = builder.CreateInputNode(L"labels", 4);
clsweight = builder.CreateLearnableParameter(L"WeightForClassPostProb", m_nbrCls, m_layerSizes[numHiddenLayers]);
m_net->InitLearnableParameters(clsweight, m_uniformInit, randomSeed++, m_initValueScale);
@ -412,7 +410,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
template<class ElemType>
ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildConditionalLSTMNetworkFromDescription(size_t mbSize)
ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildConditionalLSTMNetworkFromDescription()
{
ComputationNetworkBuilder<ElemType> builder(*m_net);
if (m_net->GetTotalNumberOfNodes() < 1) //not built yet
@ -428,7 +426,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
ComputationNodePtr clslogpostprob;
ComputationNodePtr clsweight;
input = builder.CreateSparseInputNode(L"features", m_layerSizes[0], mbSize);
input = builder.CreateSparseInputNode(L"features", m_layerSizes[0]);
m_net->FeatureNodes().push_back(input);
if (m_applyMeanVarNorm)
@ -461,13 +459,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (numHiddenLayers > 0)
{
// output = (ComputationNodePtr)BuildLSTMNodeComponent(randomSeed, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
/// previously used function. now uses LSTMNode which is correct and fast
input = output;
for (int i = 1 + offset; i < numHiddenLayers; i++)
{
// output = (ComputationNodePtr)BuildLSTMNodeComponent(randomSeed, i, m_layerSizes[i], m_layerSizes[i + 1], input);
output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, i, m_layerSizes[i], m_layerSizes[i + 1], input);
output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, i, m_layerSizes[i], m_layerSizes[i + 1], input);
if (m_addDropoutNodes)
input = builder.Dropout(output);
@ -477,7 +475,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
/// serve as a global bias term
gt = builder.CreateInputNode(L"binaryFeature", m_auxFeatDim, 1);
gt = builder.CreateInputNode(L"binaryFeature", m_auxFeatDim);
m_net->FeatureNodes().push_back(gt);
e = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"AuxTrans%d", 0),
m_layerSizes[numHiddenLayers], m_auxFeatDim);
@ -493,7 +491,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
/// the label is a dense matrix. each element is the word index
label = builder.CreateInputNode(L"labels", 4, mbSize);
label = builder.CreateInputNode(L"labels", 4);
clsweight = builder.CreateLearnableParameter(L"WeightForClassPostProb", m_nbrCls, m_layerSizes[numHiddenLayers]);
m_net->InitLearnableParameters(clsweight, m_uniformInit, randomSeed++, m_initValueScale);
@ -518,7 +516,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
the aligment node takes a variable length input and relates each element to a variable length output
*/
template<class ElemType>
ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildAlignmentForwardDecoderNetworkFromDescription(ComputationNetwork* encoderNet, size_t mbSize)
ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildAlignmentForwardDecoderNetworkFromDescription(ComputationNetwork* encoderNet)
{
ComputationNetworkBuilder<ElemType> builder(*m_net);
if (m_net->GetTotalNumberOfNodes() < 1) //not built yet
@ -535,7 +533,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
ComputationNodePtr clsweight;
ComputationNodePtr columnStride, rowStride;
input = builder.CreateSparseInputNode(L"features", m_layerSizes[0], mbSize);
input = builder.CreateSparseInputNode(L"features", m_layerSizes[0]);
m_net->FeatureNodes().push_back(input);
if (m_lookupTableOrder > 0)
@ -577,9 +575,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
w = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"W%d", i), m_layerSizes[i], m_layerSizes[i]);
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, (size_t)m_layerSizes[i], mbSize, 1);
pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, (size_t)m_layerSizes[i], 1);
// output = (ComputationNodePtr)BuildLSTMNodeComponent(randomSeed, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
// output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
// output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
/// alignment node to get weights from source to target
/// this aligment node computes weights of the current hidden state after special encoder ending symbol to all
@ -607,7 +605,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
for (; i < numHiddenLayers; i++)
{
//output = (ComputationNodePtr)BuildLSTMNodeComponent(randomSeed, i, m_layerSizes[i], m_layerSizes[i + 1], input);
output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, i, m_layerSizes[i], m_layerSizes[i + 1], input);
output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, i, m_layerSizes[i], m_layerSizes[i + 1], input);
if (m_addDropoutNodes)
input = builder.Dropout(output);
@ -625,7 +623,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
/// the label is a dense matrix. each element is the word index
label = builder.CreateInputNode(L"labels", 4, mbSize);
label = builder.CreateInputNode(L"labels", 4);
clsweight = builder.CreateLearnableParameter(L"WeightForClassPostProb", m_nbrCls, m_layerSizes[numHiddenLayers]);
m_net->InitLearnableParameters(clsweight, m_uniformInit, randomSeed++, m_initValueScale);
@ -645,7 +643,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
template<class ElemType>
ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildAlignmentDecoderNetworkFromDescription(ComputationNetwork* encoderNet, size_t mbSize)
ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildAlignmentDecoderNetworkFromDescription(ComputationNetwork* encoderNet)
{
ComputationNetworkBuilder<ElemType> builder(*m_net);
if (m_net->GetTotalNumberOfNodes() < 1) //not built yet
@ -662,7 +660,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
ComputationNodePtr clsweight;
ComputationNodePtr columnStride, rowStride;
input = builder.CreateSparseInputNode(L"features", m_layerSizes[0], mbSize);
input = builder.CreateSparseInputNode(L"features", m_layerSizes[0]);
m_net->FeatureNodes().push_back(input);
if (m_lookupTableOrder > 0)
@ -704,9 +702,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
w = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"W%d", i), m_layerSizes[i], m_layerSizes[i]);
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, (size_t)m_layerSizes[i], mbSize, 1);
pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, (size_t)m_layerSizes[i], 1);
// output = (ComputationNodePtr)BuildLSTMNodeComponent(randomSeed, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
// output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
// output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
/// alignment node to get weights from source to target
/// this aligment node computes weights of the current hidden state after special encoder ending symbol to all
@ -734,7 +732,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
for (; i < numHiddenLayers; i++)
{
//output = (ComputationNodePtr)BuildLSTMNodeComponent(randomSeed, i, m_layerSizes[i], m_layerSizes[i + 1], input);
output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, i, m_layerSizes[i], m_layerSizes[i + 1], input);
output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, i, m_layerSizes[i], m_layerSizes[i + 1], input);
if (m_addDropoutNodes)
input = builder.Dropout(output);
@ -752,7 +750,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
/// the label is a dense matrix. each element is the word index
label = builder.CreateInputNode(L"labels", 4, mbSize);
label = builder.CreateInputNode(L"labels", 4);
clsweight = builder.CreateLearnableParameter(L"WeightForClassPostProb", m_nbrCls, m_layerSizes[numHiddenLayers]);
m_net->InitLearnableParameters(clsweight, m_uniformInit, randomSeed++, m_initValueScale);
@ -775,7 +773,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
template<class ElemType>
ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildLogBilinearNetworkFromDescription(size_t mbSize)
ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildLogBilinearNetworkFromDescription()
{
ComputationNetworkBuilder<ElemType> builder(*m_net);
if (m_net->GetTotalNumberOfNodes() < 1) //not built yet
@ -793,8 +791,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
ComputationNodePtr ot=nullptr, it=nullptr, ft=nullptr, gt=nullptr, ct=nullptr, ht=nullptr;
ComputationNodePtr pastValueXI, pastValueXII, pastValueXIII, pastValueXIV;
// input = builder.CreateSparseInputNode(L"features", m_layerSizes[0], mbSize);
input = builder.CreateInputNode(L"features", m_layerSizes[0], mbSize);
// input = builder.CreateSparseInputNode(L"features", m_layerSizes[0]);
input = builder.CreateInputNode(L"features", m_layerSizes[0]);
featin = input;
m_net->FeatureNodes().push_back(input);
@ -827,7 +825,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
while (ik <= m_maOrder)
{
pastValueXI =
builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[0], mbSize, ik, msra::strfun::wstrprintf(L"pastValue%d", ik));
builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[0], ik, msra::strfun::wstrprintf(L"pastValue%d", ik));
pastValueXI->SetParameterUpdateRequired(false);
pastValueXI->AttachInputs(input);
//TODO: to figure out sparse matrix size
@ -855,7 +853,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
w = builder.CreateLearnableParameter(msra::strfun::wstrprintf (L"R%d", i+1), m_layerSizes[i+1], m_layerSizes[i+1]);
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[i+1], mbSize, 1);
pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[i+1], 1);
output = builder.Plus(builder.Times(w, pastValue), input);
pastValue->AttachInputs(output);
@ -875,7 +873,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
w = builder.CreateLearnableParameter(msra::strfun::wstrprintf (L"W%d", numHiddenLayers), m_layerSizes[numHiddenLayers+1], m_layerSizes[numHiddenLayers]);
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
label = builder.CreateInputNode(L"labels", m_layerSizes[numHiddenLayers+1], mbSize);
label = builder.CreateInputNode(L"labels", m_layerSizes[numHiddenLayers+1]);
AddTrainAndEvalCriterionNodes(input, label, w);
output = builder.Times(w, input, L"outputs");
@ -892,7 +890,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
template<class ElemType>
ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildNeuralProbNetworkFromDescription(size_t mbSize)
ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildNeuralProbNetworkFromDescription()
{
ComputationNetworkBuilder<ElemType> builder(*m_net);
if (m_net->GetTotalNumberOfNodes() < 1) //not built yet
@ -910,7 +908,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
ComputationNodePtr ot = nullptr, it = nullptr, ft = nullptr, gt = nullptr, ct = nullptr, ht = nullptr;
ComputationNodePtr pastValueXI, pastValueXII, pastValueXIII, pastValueXIV;
input = builder.CreateSparseInputNode(L"features", m_layerSizes[0], mbSize);
input = builder.CreateSparseInputNode(L"features", m_layerSizes[0]);
m_net->FeatureNodes().push_back(input);
if (m_applyMeanVarNorm)
@ -927,10 +925,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
bi = builder.CreateLearnableParameter(L"bi0", m_layerSizes[1], 1);
pastValueXI = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[0], mbSize, 1);
pastValueXII = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[0], mbSize, 2);
pastValueXIII = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[0], mbSize, 3);
pastValueXIV = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[0], mbSize, 4);
pastValueXI = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[0], 1);
pastValueXII = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[0], 2);
pastValueXIII = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[0], 3);
pastValueXIV = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[0], 4);
pastValueXI->AttachInputs(input);
pastValueXII->AttachInputs(input);
pastValueXIII->AttachInputs(input);
@ -996,7 +994,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
w = builder.CreateLearnableParameter(msra::strfun::wstrprintf (L"W%d", i), m_layerSizes[i+1], m_layerSizes[i+1]);
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
std::list<ComputationNodeBasePtr> recurrent_loop;
pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[i+1], mbSize, 1);
pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[i+1], 1);
output = SimpleNetworkBuilder<ElemType>::ApplyNonlinearFunction(builder.Plus(builder.Times(u, input), builder.Times(w, pastValue)), i);
pastValue->AttachInputs(output);
recur_idx++;
@ -1017,7 +1015,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
w = builder.CreateLearnableParameter(msra::strfun::wstrprintf (L"W%d", numHiddenLayers), m_layerSizes[numHiddenLayers+1], m_layerSizes[numHiddenLayers]);
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
// b = builder.CreateLearnableParameter(msra::strfun::wstrprintf (L"B%d", numHiddenLayers), m_layerSizes[numHiddenLayers+1], 1);
label = builder.CreateSparseInputNode(L"labels", m_layerSizes[numHiddenLayers+1], mbSize);
label = builder.CreateSparseInputNode(L"labels", m_layerSizes[numHiddenLayers+1]);
AddTrainAndEvalCriterionNodes(input, label, w);
output = builder.Times(w, input);
@ -1034,7 +1032,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
template<class ElemType>
shared_ptr<ComputationNode<ElemType>> /*ComputationNodePtr*/ SimpleNetworkBuilder<ElemType>::BuildDirectConnect(unsigned long &randomSeed, size_t /*mbSize*/, size_t iLayer, size_t inputDim, size_t outputDim, ComputationNodePtr input, ComputationNodePtr toNode)
shared_ptr<ComputationNode<ElemType>> /*ComputationNodePtr*/ SimpleNetworkBuilder<ElemType>::BuildDirectConnect(unsigned long &randomSeed, size_t iLayer, size_t inputDim, size_t outputDim, ComputationNodePtr input, ComputationNodePtr toNode)
{
ComputationNetworkBuilder<ElemType> builder(*m_net);
@ -1050,7 +1048,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
ComputationNodePtr scalar = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"SV%d", i), 1, 1);
scalar->Value().SetValue((ElemType)0.01);
#if 1// change once we no longer see a perf hit to #ifndef ENABLE_TENSORVIEW
#ifndef ENABLE_BROADCASTING_ELEMENTTIMES
ComputationNodePtr scaled = builder.Scale(scalar, directOutput, msra::strfun::wstrprintf(L"S%d", i));
#else
ComputationNodePtr scaled = builder.ElementTimes(scalar, directOutput, msra::strfun::wstrprintf(L"S%d", i));
@ -1065,7 +1063,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template<class ElemType>
shared_ptr<ComputationNode<ElemType>> /*ComputationNodePtr*/ SimpleNetworkBuilder<ElemType>::BuildLSTMComponent(unsigned long &randomSeed, size_t mbSize, size_t iLayer, size_t inputDim, size_t outputDim, ComputationNodePtr inputObs)
shared_ptr<ComputationNode<ElemType>> /*ComputationNodePtr*/ SimpleNetworkBuilder<ElemType>::BuildLSTMComponent(unsigned long &randomSeed, size_t iLayer, size_t inputDim, size_t outputDim, ComputationNodePtr inputObs)
{
ComputationNetworkBuilder<ElemType> builder(*m_net);
@ -1121,17 +1119,17 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t layer1 = outputDim;
pastValueHI = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, mbSize, 1);
pastValueHF = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, mbSize, 1);
pastValueHO = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, mbSize, 1);
pastValueHC = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, mbSize, 1);
pastValueCI = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, mbSize, 1);
pastValueCF = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, mbSize, 1);
pastValueCC = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, mbSize, 1);
pastValueHI = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, 1);
pastValueHF = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, 1);
pastValueHO = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, 1);
pastValueHC = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, 1);
pastValueCI = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, 1);
pastValueCF = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, 1);
pastValueCC = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, 1);
if(m_constInputGateValue)
{
//it = builder.CreateLearnableParameter(msra::strfun::wstrprintf (L"CONSTIT%d", iLayer), outputDim, mbSize);
//it = builder.CreateLearnableParameter(msra::strfun::wstrprintf (L"CONSTIT%d", iLayer), outputDim);
//it->SetParameterUpdateRequired(false);
//it->Value().SetValue(m_constInputGateValue);
it = nullptr;
@ -1241,7 +1239,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
template<class ElemType>
ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildSeqTrnLSTMNetworkFromDescription(size_t mbSize)
ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildSeqTrnLSTMNetworkFromDescription()
{
ComputationNetworkBuilder<ElemType> builder(*m_net);
if (m_net->GetTotalNumberOfNodes() < 1) //not built yet
@ -1261,7 +1259,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
ComputationNodePtr outputFromEachLayer[MAX_DEPTH] = { nullptr };
ComputationNodePtr trans;
input = builder.CreateInputNode(L"features", m_layerSizes[0], mbSize);
input = builder.CreateInputNode(L"features", m_layerSizes[0]);
m_net->FeatureNodes().push_back(input);
if (m_applyMeanVarNorm)
@ -1297,7 +1295,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
if (m_recurrentLayers.size() > 0 && m_recurrentLayers[recur_idx] == i+1)
{
output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, i, m_layerSizes[i] * (offset ? m_lookupTableOrder : 1), m_layerSizes[i + 1], input);
output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, i, m_layerSizes[i] * (offset ? m_lookupTableOrder : 1), m_layerSizes[i + 1], input);
input = output;
recur_idx++;
@ -1326,7 +1324,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
trans->Value().SetValue((ElemType)1.0 / m_layerSizes[numHiddenLayers + 1]);
// m_net->InitLearnableParameters(trans, m_uniformInit, randomSeed++, m_initValueScale);
trans->SetParameterUpdateRequired(true);
label = builder.CreateInputNode(L"labels", m_layerSizes[numHiddenLayers + 1], mbSize);
label = builder.CreateInputNode(L"labels", m_layerSizes[numHiddenLayers + 1]);
AddTrainAndEvalCriterionNodes(output, label, nullptr, L"CRFTrainCriterion", L"CRFEvalCriterion", nullptr, trans);
input = output;
@ -1340,7 +1338,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
template<class ElemType>
ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildCLASSLSTMNetworkFromDescription(size_t mbSize)
ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildCLASSLSTMNetworkFromDescription()
{
ComputationNetworkBuilder<ElemType> builder(*m_net);
if (m_net->GetTotalNumberOfNodes() < 1) //not built yet
@ -1356,7 +1354,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
ComputationNodePtr clslogpostprob;
ComputationNodePtr clsweight;
input = builder.CreateSparseInputNode(L"features", m_layerSizes[0], mbSize);
input = builder.CreateSparseInputNode(L"features", m_layerSizes[0]);
m_net->FeatureNodes().push_back(input);
if (m_applyMeanVarNorm)
@ -1389,13 +1387,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (numHiddenLayers > 0)
{
// output = (ComputationNodePtr)BuildLSTMNodeComponent(randomSeed, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
/// previously used function. now uses LSTMNode which is correct and fast
input = output;
for (int i = 1 + offset; i <numHiddenLayers; i++)
{
// output = (ComputationNodePtr)BuildLSTMNodeComponent(randomSeed, i, m_layerSizes[i], m_layerSizes[i + 1], input);
output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, i, m_layerSizes[i], m_layerSizes[i + 1], input);
output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, i, m_layerSizes[i], m_layerSizes[i + 1], input);
if (m_addDropoutNodes)
input = builder.Dropout(output);
@ -1411,7 +1409,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
/// the label is a dense matrix. each element is the word index
label = builder.CreateInputNode(L"labels", 4, mbSize);
label = builder.CreateInputNode(L"labels", 4);
clsweight = builder.CreateLearnableParameter(L"WeightForClassPostProb", m_nbrCls, m_layerSizes[numHiddenLayers]);
m_net->InitLearnableParameters(clsweight, m_uniformInit, randomSeed++, m_initValueScale);
@ -1482,7 +1480,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
#endif
template<class ElemType>
ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildLSTMNetworkFromDescription(size_t mbSize)
ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildLSTMNetworkFromDescription()
{
ComputationNetworkBuilder<ElemType> builder(*m_net);
if (m_net->GetTotalNumberOfNodes() < 1) //not built yet
@ -1502,9 +1500,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
ComputationNodePtr outputFromEachLayer[MAX_DEPTH] = { nullptr };
if (m_sparse_input)
input = builder.CreateSparseInputNode(L"features", m_layerSizes[0], mbSize);
input = builder.CreateSparseInputNode(L"features", m_layerSizes[0]);
else
input = builder.CreateInputNode(L"features", m_layerSizes[0], mbSize);
input = builder.CreateInputNode(L"features", m_layerSizes[0]);
m_net->FeatureNodes().push_back(input);
@ -1542,7 +1540,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
//output = (ComputationNodePtr)BuildLSTMNodeComponent(randomSeed, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
/// previously used function. now uses LSTMNode which is correct and fast
input = output;
outputFromEachLayer[offset + 1] = input;
@ -1553,7 +1551,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
//output = (ComputationNodePtr)BuildLSTMNodeComponent(randomSeed, i, m_layerSizes[i], m_layerSizes[i + 1], input);
output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, i, m_layerSizes[i], m_layerSizes[i + 1], input);
output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, i, m_layerSizes[i], m_layerSizes[i + 1], input);
// previously used function, now uses LSTMnode, which is fast and correct
recur_idx++;
@ -1580,7 +1578,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
#ifdef DEBUG_DECODER
w->Value().SetValue((ElemType)0.01);
#endif
label = builder.CreateInputNode(L"labels", m_layerSizes[numHiddenLayers + 1], mbSize);
label = builder.CreateInputNode(L"labels", m_layerSizes[numHiddenLayers + 1]);
AddTrainAndEvalCriterionNodes(input, label, w);
output = builder.Times(w, input, L"outputs");
@ -1615,7 +1613,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
K. Yao, G. Zweig, "Sequence-to-sequence neural net models for grapheme-to-phoneme conversion, submitted to Interspeech 2015
*/
template<class ElemType>
ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildLSTMEncoderNetworkFromDescription(size_t mbSize)
ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildLSTMEncoderNetworkFromDescription()
{
ComputationNetworkBuilder<ElemType> builder(*m_net);
@ -1631,9 +1629,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
ComputationNodePtr input, w, b, u, e, pastValue, output, label, prior;
if (m_sparse_input)
input = builder.CreateSparseInputNode(L"features", m_layerSizes[0], mbSize);
input = builder.CreateSparseInputNode(L"features", m_layerSizes[0]);
else
input = builder.CreateInputNode(L"features", m_layerSizes[0], mbSize);
input = builder.CreateInputNode(L"features", m_layerSizes[0]);
m_net->FeatureNodes().push_back(input);
@ -1669,14 +1667,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (numHiddenLayers > 0)
{
//output = (ComputationNodePtr)BuildLSTMNodeComponent(randomSeed, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
input = output;
i++;
for (; i<numHiddenLayers; i++)
{
//output = (ComputationNodePtr)BuildLSTMNodeComponent(randomSeed, i, m_layerSizes[i], m_layerSizes[i + 1], input);
output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, i, m_layerSizes[i], m_layerSizes[i + 1], input);
output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, i, m_layerSizes[i], m_layerSizes[i + 1], input);
if (m_addDropoutNodes)
input = builder.Dropout(output);
@ -1705,7 +1703,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
K. Yao, G. Zweig, "Sequence-to-sequence neural net models for grapheme-to-phoneme conversion" submitted to Interspeech 2015
*/
template<class ElemType>
ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildUnidirectionalLSTMNetworksFromDescription(size_t mbSize)
ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildUnidirectionalLSTMNetworksFromDescription()
{
ComputationNetworkBuilder<ElemType> builder(*m_net);
if (m_net->GetTotalNumberOfNodes() < 1) //not built yet
@ -1726,11 +1724,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
map<wstring, size_t> featDim;
assert(m_streamSizes.size() > 0);
inputbackward = builder.CreateInputNode(L"featurepastValueedTarget", m_streamSizes[0], mbSize);
inputbackward = builder.CreateInputNode(L"featurepastValueedTarget", m_streamSizes[0]);
m_net->FeatureNodes().push_back(inputbackward);
featDim[L"featurepastValueedTarget"] = m_streamSizes[0];
inputletter = builder.CreateInputNode(L"ltrForward", m_streamSizes[1], mbSize);
inputletter = builder.CreateInputNode(L"ltrForward", m_streamSizes[1]);
m_net->FeatureNodes().push_back(inputletter);
featDim[L"ltrForward"] = m_streamSizes[1];
@ -1777,7 +1775,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
switch (m_rnnType){
case UNIDIRECTIONALLSTM:
//output = (ComputationNodePtr)BuildLSTMNodeComponent(randomSeed, layerIdx, dims, m_layerSizes[layerIdx + 1], input);
output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, layerIdx, dims, m_layerSizes[layerIdx + 1], input);
output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, layerIdx, dims, m_layerSizes[layerIdx + 1], input);
break;
default:
LogicError("This is for unidorectional LSTM model. Check rnntype to see whether it is UNIDIRECTIONALLSTMWITHPASTPREDICTION or TRANSDUCER");
@ -1797,7 +1795,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
input = output;
/// here uses "labels", so only one label from multiple stream inputs are used.
label = builder.CreateInputNode(L"labels", m_layerSizes[numHiddenLayers + 1], mbSize);
label = builder.CreateInputNode(L"labels", m_layerSizes[numHiddenLayers + 1]);
AddTrainAndEvalCriterionNodes(input, label, w);
@ -1819,7 +1817,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
template<class ElemType>
shared_ptr<ComputationNode<ElemType>> /*ComputationNodePtr*/ SimpleNetworkBuilder<ElemType>::BuildLSTMComponentWithMultiInputs(ULONG &randomSeed, size_t mbSize, size_t iLayer, const vector<size_t>& inputDim, size_t outputDim, const vector<ComputationNodePtr>& inputObs, bool inputWeightSparse)
shared_ptr<ComputationNode<ElemType>> /*ComputationNodePtr*/ SimpleNetworkBuilder<ElemType>::BuildLSTMComponentWithMultiInputs(ULONG &randomSeed, size_t iLayer, const vector<size_t>& inputDim, size_t outputDim, const vector<ComputationNodePtr>& inputObs, bool inputWeightSparse)
{
ComputationNetworkBuilder<ElemType> builder(*m_net);
@ -1896,17 +1894,17 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t layer1 = outputDim;
pastValueHI = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, mbSize, 1);
pastValueHF = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, mbSize, 1);
pastValueHO = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, mbSize, 1);
pastValueHC = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, mbSize, 1);
pastValueCI = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, mbSize, 1);
pastValueCF = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, mbSize, 1);
pastValueCC = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, mbSize, 1);
pastValueHI = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, 1);
pastValueHF = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, 1);
pastValueHO = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, 1);
pastValueHC = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, 1);
pastValueCI = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, 1);
pastValueCF = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, 1);
pastValueCC = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, 1);
if (m_constInputGateValue)
{
//it = builder.CreateLearnableParameter(msra::strfun::wstrprintf (L"CONSTIT%d", iLayer), outputDim, mbSize);
//it = builder.CreateLearnableParameter(msra::strfun::wstrprintf (L"CONSTIT%d", iLayer), outputDim);
//it->SetParameterUpdateRequired(false);
//it->Value().SetValue(m_constInputGateValue);
it = nullptr;
@ -2026,7 +2024,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
K. Yao, G. Zweig, "Sequence-to-sequence neural net models for grapheme-to-phoneme conversion, submitted to Interspeech 2015
*/
template<class ElemType>
ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildBiDirectionalLSTMNetworksFromDescription(size_t mbSize)
ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildBiDirectionalLSTMNetworksFromDescription()
{
ComputationNetworkBuilder<ElemType> builder(*m_net);
if (m_net->GetTotalNumberOfNodes() < 1) //not built yet
@ -2049,10 +2047,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t ltrSrcIdx = 1;
/// create projections to use pastValue predictions
inputprediction = builder.CreateInputNode(L"featurepastValueedTarget", m_streamSizes[0], mbSize);
inputprediction = builder.CreateInputNode(L"featurepastValueedTarget", m_streamSizes[0]);
m_net->FeatureNodes().push_back(inputprediction);
inputletter = builder.CreateInputNode(L"ltrForward", m_streamSizes[1], mbSize);
inputletter = builder.CreateInputNode(L"ltrForward", m_streamSizes[1]);
m_net->FeatureNodes().push_back(inputletter);
featDim[L"ltrForward"] = m_streamSizes[1];
@ -2100,12 +2098,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
/// forward direction
//forwardOutput = (ComputationNodePtr)BuildLSTMNodeComponent(randomSeed, layerIdx + 100, streamdims[0] + streamdims[1], m_layerSizes[layerIdx + 1], forwardInput);
forwardOutput = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, layerIdx + 100, streamdims[0] + streamdims[1], m_layerSizes[layerIdx + 1], forwardInput);
forwardOutput = (ComputationNodePtr)BuildLSTMComponent(randomSeed, layerIdx + 100, streamdims[0] + streamdims[1], m_layerSizes[layerIdx + 1], forwardInput);
forwardInput = forwardOutput;
backwardInput = (ComputationNodePtr)builder.TimeReverse(ltrSource);
//backwardOutput = (ComputationNodePtr)BuildLSTMNodeComponent(randomSeed, layerIdx + 200, ltrDim, m_layerSizes[layerIdx + 1], backwardInput);
backwardOutput = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, layerIdx + 200, ltrDim, m_layerSizes[layerIdx + 1], backwardInput);
backwardOutput = (ComputationNodePtr)BuildLSTMComponent(randomSeed, layerIdx + 200, ltrDim, m_layerSizes[layerIdx + 1], backwardInput);
backwardInput = backwardOutput;
layerIdx++;
@ -2113,11 +2111,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
while (layerIdx < numHiddenLayers - 1)
{
//forwardOutput = (ComputationNodePtr)BuildLSTMNodeComponent(randomSeed, layerIdx + 100, m_layerSizes[layerIdx], m_layerSizes[layerIdx + 1], forwardInput);
forwardOutput = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, layerIdx + 100, m_layerSizes[layerIdx], m_layerSizes[layerIdx + 1], forwardInput);
forwardOutput = (ComputationNodePtr)BuildLSTMComponent(randomSeed, layerIdx + 100, m_layerSizes[layerIdx], m_layerSizes[layerIdx + 1], forwardInput);
forwardInput = forwardOutput;
//backwardOutput = (ComputationNodePtr)BuildLSTMNodeComponent(randomSeed, layerIdx + 200, m_layerSizes[layerIdx], m_layerSizes[layerIdx + 1], backwardInput);
backwardOutput = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, layerIdx + 200, m_layerSizes[layerIdx], m_layerSizes[layerIdx + 1], backwardInput);
backwardOutput = (ComputationNodePtr)BuildLSTMComponent(randomSeed, layerIdx + 200, m_layerSizes[layerIdx], m_layerSizes[layerIdx + 1], backwardInput);
backwardInput = backwardOutput;
layerIdx++;
@ -2137,7 +2135,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
forwardInput = (ComputationNodePtr)builder.Parallel(streams[0], streams[1], L"Parallel1");
// output = (ComputationNodePtr)BuildLSTMNodeComponent(randomSeed, layerIdx, streamdims[0] + streamdims[1], m_layerSizes[layerIdx + 1], forwardInput);
output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, layerIdx, streamdims[0] + streamdims[1], m_layerSizes[layerIdx + 1], forwardInput);
output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, layerIdx, streamdims[0] + streamdims[1], m_layerSizes[layerIdx + 1], forwardInput);
input = output;
layerIdx++;
@ -2150,7 +2148,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
input = output;
/// here uses "labels", so only one label from multiple stream inputs are used.
label = builder.CreateInputNode(L"labels", m_layerSizes[numHiddenLayers + 1], mbSize);
label = builder.CreateInputNode(L"labels", m_layerSizes[numHiddenLayers + 1]);
AddTrainAndEvalCriterionNodes(input, label);
@ -2174,7 +2172,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
template<class ElemType>
ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildNCELSTMNetworkFromDescription(size_t mbSize)
ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildNCELSTMNetworkFromDescription()
{
ComputationNetworkBuilder<ElemType> builder(*m_net);
if (m_net->GetTotalNumberOfNodes() < 1) //not built yet
@ -2190,7 +2188,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
ComputationNodePtr bias;
ComputationNodePtr outputFromEachLayer[MAX_DEPTH] = { nullptr };
input = builder.CreateSparseInputNode(L"features", m_layerSizes[0], mbSize);
input = builder.CreateSparseInputNode(L"features", m_layerSizes[0]);
m_net->FeatureNodes().push_back(input);
if (m_applyMeanVarNorm)
@ -2222,7 +2220,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
int offset = m_lookupTableOrder > 0 ? 1 : 0;
if (numHiddenLayers > 0)
{
output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
input = output;
outputFromEachLayer[offset + 1] = input;
@ -2230,7 +2228,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
if (m_recurrentLayers.size() > 0 && m_recurrentLayers[recur_idx] == i)
{
output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, i, m_layerSizes[i], m_layerSizes[i + 1], input);
output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, i, m_layerSizes[i], m_layerSizes[i + 1], input);
recur_idx++;
}
@ -2254,7 +2252,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
for (size_t i = offset; i < m_layerSizes.size(); i++)
{
/// add a direct connection from each layer's output to the layer before the output layer
output = BuildDirectConnect(randomSeed, mbSize, i, (i > 1) ? m_layerSizes[i] : ((offset == 0) ? m_layerSizes[i] : m_layerSizes[i] * m_lookupTableOrder), m_layerSizes[numHiddenLayers], outputFromEachLayer[i], input);
output = BuildDirectConnect(randomSeed, i, (i > 1) ? m_layerSizes[i] : ((offset == 0) ? m_layerSizes[i] : m_layerSizes[i] * m_lookupTableOrder), m_layerSizes[numHiddenLayers], outputFromEachLayer[i], input);
if (output != nullptr)
input = output;
}
@ -2266,7 +2264,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
/// the label is a dense matrix. each element is the word index
label = builder.CreateInputNode(L"labels", 2 * (this->nce_noises + 1), mbSize);
label = builder.CreateInputNode(L"labels", 2 * (this->nce_noises + 1));
bias = builder.CreateLearnableParameter(L"BiasVector", 1, m_layerSizes[m_layerSizes.size() - 1]);
bias->Value().SetValue((ElemType)-std::log(m_layerSizes[m_layerSizes.size() - 1]));
@ -2301,7 +2299,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
ComputationNodePtr input, w, b, output, label, prior, scaledLogLikelihood;
shared_ptr<PreComputedNode<ElemType>> pcNodePtr;
size_t mbSize = 3; // this is not the actual minibatch size; only used in the validation process
File fstream(dbnModelFileName, FileOptions::fileOptionsBinary | FileOptions::fileOptionsRead);
@ -2336,7 +2333,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Matrix<ElemType> A = ReadMatrixFromDbnFile(fstream, std::string("b"));
if (i == 0)
{
input = builder.Input(wts.GetNumCols(), mbSize, L"features");
input = builder.CreateInputNode(L"features", wts.GetNumCols());
m_net->FeatureNodes().push_back(input);
size_t frameDim = globalMean.GetNumRows();
@ -2381,10 +2378,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
wstring nameOfPlus = nameOfTimes + L"+" + nameOfB;
wstring nameOfH = msra::strfun::wstrprintf(L"H%d", i + 1);
w = builder.Parameter(wts.GetNumRows(), wts.GetNumCols(), nameOfW);
w = builder.CreateLearnableParameter(nameOfW, wts.GetNumRows(), wts.GetNumCols());
w->Value().SetValue(wts);
b = builder.Parameter(bias.GetNumRows(), 1, nameOfB);
b = builder.CreateLearnableParameter(nameOfB, bias.GetNumRows(), 1);
b->Value().SetValue(bias);
if (layerType == "perceptron")
@ -2412,7 +2409,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
RuntimeError("Error reading DBN file - did not find expected tag ENET\n");
//size_t outputLayerSize = m_layerSizes[m_layerSizes.size()-1];
label = builder.Input(m_outputLayerSize, mbSize, L"labels");
label = builder.CreateInputNode(L"labels", m_outputLayerSize);
if (layerType == "perceptron") // complete network
{
@ -2446,9 +2443,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
wstring nameOfPlus = nameOfTimes + L"+" + nameOfB;
wstring nameOfH = msra::strfun::wstrprintf(L"H%d", i + 1);
w = builder.Parameter(outputLayerSize, penultimateSize, nameOfW);
w = builder.CreateLearnableParameter(nameOfW, outputLayerSize, penultimateSize);
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
b = builder.Parameter(outputLayerSize, 1, nameOfB);
b = builder.CreateLearnableParameter(nameOfB, outputLayerSize, 1);
output = builder.Plus(builder.Times(w, input, nameOfTimes), b, nameOfPlus);
m_net->RenameNode(output, L"HLast");


@ -256,41 +256,41 @@ namespace Microsoft { namespace MSR { namespace CNTK {
ComputationNetworkPtr BuildSimpleDNN();
ComputationNetworkPtr BuildSimpleRNN(size_t mbSize = 1);
ComputationNetworkPtr BuildSimpleRNN();
ComputationNetworkPtr BuildClassEntropyNetwork(size_t mbSize = 1);
ComputationNetworkPtr BuildClassEntropyNetwork();
ComputationNodePtr BuildLSTMComponent(unsigned long &randomSeed, size_t mbSize, size_t iLayer, size_t inputDim, size_t outputDim, ComputationNodePtr input);
ComputationNodePtr BuildLSTMComponent(unsigned long &randomSeed, size_t iLayer, size_t inputDim, size_t outputDim, ComputationNodePtr input);
ComputationNodePtr BuildLSTMNodeComponent(ULONG &randomSeed, size_t iLayer, size_t inputDim, size_t outputDim, ComputationNodePtr input);
ComputationNodePtr BuildLSTMComponentWithMultiInputs(ULONG &randomSeed, size_t mbSize, size_t iLayer, const vector<size_t>& inputDim, size_t outputDim, const vector<ComputationNodePtr>& inputObs, bool inputWeightSparse = false);
ComputationNodePtr BuildLSTMComponentWithMultiInputs(ULONG &randomSeed, size_t iLayer, const vector<size_t>& inputDim, size_t outputDim, const vector<ComputationNodePtr>& inputObs, bool inputWeightSparse = false);
ComputationNodePtr BuildDirectConnect(unsigned long &randomSeed, size_t mbSize, size_t iLayer, size_t inputDim, size_t outputDim, ComputationNodePtr input, ComputationNodePtr toNode);
ComputationNodePtr BuildDirectConnect(unsigned long &randomSeed, size_t iLayer, size_t inputDim, size_t outputDim, ComputationNodePtr input, ComputationNodePtr toNode);
ComputationNetworkPtr BuildLogBilinearNetworkFromDescription(size_t mbSize = 1);
ComputationNetworkPtr BuildLogBilinearNetworkFromDescription();
ComputationNetworkPtr BuildNeuralProbNetworkFromDescription(size_t mbSize = 1);
ComputationNetworkPtr BuildNeuralProbNetworkFromDescription();
ComputationNetworkPtr BuildLSTMNetworkFromDescription(size_t mbSize = 1);
ComputationNetworkPtr BuildLSTMNetworkFromDescription();
ComputationNetworkPtr BuildSeqTrnLSTMNetworkFromDescription(size_t mbSize = 1);
ComputationNetworkPtr BuildSeqTrnLSTMNetworkFromDescription();
ComputationNetworkPtr BuildLSTMEncoderNetworkFromDescription(size_t mbSize = 1);
ComputationNetworkPtr BuildLSTMEncoderNetworkFromDescription();
ComputationNetworkPtr BuildUnidirectionalLSTMNetworksFromDescription(size_t mbSize = 1);
ComputationNetworkPtr BuildUnidirectionalLSTMNetworksFromDescription();
ComputationNetworkPtr BuildBiDirectionalLSTMNetworksFromDescription(size_t mbSize = 1);
ComputationNetworkPtr BuildBiDirectionalLSTMNetworksFromDescription();
ComputationNetworkPtr BuildCLASSLSTMNetworkFromDescription(size_t mbSize = 1);
ComputationNetworkPtr BuildCLASSLSTMNetworkFromDescription();
ComputationNetworkPtr BuildConditionalLSTMNetworkFromDescription(size_t mbSize = 1);
ComputationNetworkPtr BuildConditionalLSTMNetworkFromDescription();
ComputationNetworkPtr BuildNCELSTMNetworkFromDescription(size_t mbSize = 1);
ComputationNetworkPtr BuildNCELSTMNetworkFromDescription();
ComputationNetworkPtr BuildAlignmentForwardDecoderNetworkFromDescription(ComputationNetwork* encoderNet, size_t mbSize = 1);
ComputationNetworkPtr BuildAlignmentForwardDecoderNetworkFromDescription(ComputationNetwork* encoderNet);
ComputationNetworkPtr BuildAlignmentDecoderNetworkFromDescription(ComputationNetwork* encoderNet, size_t mbSize = 1);
ComputationNetworkPtr BuildAlignmentDecoderNetworkFromDescription(ComputationNetwork* encoderNet);
//layer is 0 based
ComputationNodePtr ApplyNonlinearFunction(ComputationNodePtr input, const size_t layer, const std::wstring nodeName = L"");


@ -15,9 +15,12 @@
#include "ConvolutionalNodes.h"
#include "NonlinearityNodes.h"
#include "ReshapingNodes.h"
#include "TensorShape.h"
namespace Microsoft { namespace MSR { namespace CNTK {
using namespace std;
template<class ElemType>
void SynchronousNodeEvaluator<ElemType>::Evaluate(NDLNode<ElemType>* node, const wstring& baseName, const NDLPass pass)
{
@ -58,48 +61,34 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
}
if (OperationNameOf(InputValue) == cnNodeType)
if (OperationNameOf(InputValue) == cnNodeType || OperationNameOf(SparseInputValue) == cnNodeType)
{
if (parameter.size() < 1 || parameter.size() > 2)
RuntimeError("%ls should have 1 or 2 parameters[rows, [cols=1]].", cnNodeType.c_str());
bool isSparse = (OperationNameOf(SparseInputValue) == cnNodeType);
if (parameter.size() < 1)
RuntimeError("%ls should have 1 or more parameters (tensor dimensions, e.g. [vecdim] or [rows, cols]).", cnNodeType.c_str());
if (pass == ndlPassInitial)
{
// evaluate only scalar parameters
vector<void*> params = EvaluateParameters(node, baseName, 0, parameter.size(), pass);
size_t rows = ((NDLNode<ElemType>*)params[0])->GetScalar();
size_t cols = params.size() > 1 ? ((NDLNode<ElemType>*)params[1])->GetScalar() : 1;
size_t i = 0;
auto tensorShape = ProcessTensorShapeParameters(node, params, i, /*isImage=*/false, cnNodeType);
// first look for this node already existing in the network
// BUGBUG: How does this set the dimensions then?
if (m_net->NodeNameExists(name))
nodePtr = dynamic_pointer_cast<ComputationNode<ElemType>>(m_net->GetNodeFromName(name));
else if (isSparse)
nodePtr = builder.CreateSparseInputNode(name, tensorShape);
else
nodePtr = builder.CreateInputNode(name, rows, cols);
nodePtr = builder.CreateInputNode (name, tensorShape);
}
}
else if (OperationNameOf(SparseInputValue) == cnNodeType)
else if (cnNodeType == L"ImageInput" || cnNodeType == L"SparseImageInput")
{
if (parameter.size() < 1 || parameter.size() > 2)
RuntimeError("%ls should have 1 or 2 parameters[rows, [cols=1]].", cnNodeType.c_str());
if (pass == ndlPassInitial)
{
// evaluate only scalar parameters
vector<void*> params = EvaluateParameters(node, baseName, 0, parameter.size(), pass);
size_t rows = ((NDLNode<ElemType>*)params[0])->GetScalar();
size_t cols = params.size() > 1 ? ((NDLNode<ElemType>*)params[1])->GetScalar() : 1;
// first look for this node already existing in the network
if (m_net->NodeNameExists(name))
nodePtr = dynamic_pointer_cast<ComputationNode<ElemType>>(m_net->GetNodeFromName(name));
else
nodePtr = builder.CreateSparseInputNode(name, rows, cols);
}
}
else if (cnNodeType == L"ImageInput")
{
if (parameter.size() < 3 || parameter.size() > 4)
RuntimeError("%ls should have 3 or 4 parameters[imageWidth, imageHeight, imageChannels, [numImages=1]].", cnNodeType.c_str());
bool isSparse = (cnNodeType == L"SparseImageInput");
if (parameter.size() < 3 || parameter.size() > 4) // we allow 4 for legacy (numImages, which was ignored)
RuntimeError("%ls should have 3 parameters[imageWidth, imageHeight, imageChannels].", cnNodeType.c_str());
if (pass == ndlPassInitial)
{
@ -108,44 +97,39 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t imageWidth = ((NDLNode<ElemType>*)params[0])->GetScalar();
size_t imageHeight = ((NDLNode<ElemType>*)params[1])->GetScalar();
size_t imageChannels = ((NDLNode<ElemType>*)params[2])->GetScalar();
size_t numImages = parameter.size() > 3 ? ((NDLNode<ElemType>*)params[3])->GetScalar() : 1;
ImageLayoutKind imageLayoutKind = ImageLayoutKindFrom(node->GetOptionalParameter("imageLayout", "HWC"));
nodePtr = builder.CreateInputNode(name, ImageLayoutWHC(imageWidth, imageHeight, imageChannels), numImages);
if (isSparse)
nodePtr = builder.CreateSparseInputNode(name, ImageDimensions::AsTensorShape(imageWidth, imageHeight, imageChannels, imageLayoutKind));
else
nodePtr = builder.CreateInputNode (name, ImageDimensions::AsTensorShape(imageWidth, imageHeight, imageChannels, imageLayoutKind));
}
}
else if (cnNodeType == L"SparseImageInput")
else if (OperationNameOf(LearnableParameter) == cnNodeType || cnNodeType == L"ImageParameter")
{
if (parameter.size() < 3 || parameter.size() > 4)
RuntimeError("%ls should have 3 or 4 parameters[imageWidth, imageHeight, imageChannels, [numImages=1]].", cnNodeType.c_str());
bool isImage = (cnNodeType == L"ImageParameter");
if (!isImage)
{
if (parameter.size() < 1)
RuntimeError("%ls should have 1 or more parameters (tensor dimensions, e.g. [vecdim] or [rows, cols]) plus other optional parameters (needGradient=[true|false], init=[uniform|gaussian|fixedvalue], initValueScale=[1|float], value=[0|float]).", cnNodeType.c_str());
}
else
{
if (parameter.size() < 3)
RuntimeError("%ls should have 3 parameters [imageWidth, imageHeight, imageChannels] plus other optional parameters (needGradient=[true|false], init=[uniform|gaussian|fixedvalue], initValueScale=[1|float], value=[0|float]).", cnNodeType.c_str());
}
if (pass == ndlPassInitial)
{
// evaluate only scalar parameters
vector<void*> params = EvaluateParameters(node, baseName, 0, parameter.size(), pass);
size_t imageWidth = ((NDLNode<ElemType>*)params[0])->GetScalar();
size_t imageHeight = ((NDLNode<ElemType>*)params[1])->GetScalar();
size_t imageChannels = ((NDLNode<ElemType>*)params[2])->GetScalar();
size_t numImages = parameter.size() > 3 ? ((NDLNode<ElemType>*)params[3])->GetScalar() : 1;
nodePtr = builder.CreateSparseInputNode(name, ImageLayoutWHC(imageWidth, imageHeight, imageChannels), numImages);
}
}
else if (OperationNameOf(LearnableParameter) == cnNodeType)
{
if (parameter.size() < 1 || parameter.size() > 2)
RuntimeError("%ls should have 1 or 2 parameters[rows, [cols=1]] plus other optional parameters (needGradient=[true|false], init=[uniform|gaussian|fixedvalue], initValueScale=[1|float], value=[0|float]).", cnNodeType.c_str());
if (pass == ndlPassInitial)
{
// evaluate only scalar parameters
vector<void*> params = EvaluateParameters(node, baseName, 0, parameter.size(), pass);
size_t rows = ((NDLNode<ElemType>*)params[0])->GetScalar();
size_t cols = params.size() > 1 ? ((NDLNode<ElemType>*)params[1])->GetScalar() : 1;
size_t i = 0;
auto tensorShape = ProcessTensorShapeParameters(node, params, i, isImage, cnNodeType);
if (isImage)
tensorShape.AppendInPlace(3, 1); // this goes into the column dimension
bool needGradient = node->GetOptionalParameter("needGradient", "true");
nodePtr = builder.CreateLearnableParameter(name, rows, cols);
nodePtr = builder.CreateLearnableParameter(name, tensorShape);
nodePtr->SetParameterUpdateRequired(needGradient);
}
else if (pass == ndlPassFinal)
@ -305,7 +289,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
nodePtr->SetParameterUpdateRequired(needGradient);
}
}
else if (cnNodeType == OperationNameOf(ReshapeNode))
else if (cnNodeType == L"Reshape"/*OperationNameOf(ReshapeNode)*/)
{
if (parameter.size() < 2 || parameter.size() > 5)
RuntimeError("Reshape should have two to five parameters. Usage: Reshape(origNodeName, numRows, [imageWidth=], [imageHeight=], [imageChannels=].");
@ -323,18 +307,19 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t img_channels = node->GetOptionalParameter("imageChannels", "0");
bool needGradient = node->GetOptionalParameter("needGradient", "false");
nodePtr = builder.Reshape(NULL, num_rows, ImageLayoutWHC(img_width, img_height, img_channels), name);
nodePtr = builder.DeprecatedReshape(NULL, num_rows, ImageDimensions::AsTensorShape(img_width, img_height, img_channels, ImageLayoutKind::HWC/*legacy*/), name); // BUGBUG: use a tensor descriptor instead
nodePtr->SetParameterUpdateRequired(needGradient);
}
}
else if (cnNodeType == OperationNameOf(PastValueNode) ||
cnNodeType == OperationNameOf(FutureValueNode))
{
if (parameter.size() <2 || parameter.size() >3)
RuntimeError("PastValue or FutureValue should have two to three fixed parameters. Usage: PastValue(rows, [cols], m, [timeStep=1, defaultPastValue=0.1]).");
if (parameter.size() < 2 || parameter.size() > 3) // we allow 3 for legacy (cols parameter which is now unused)
RuntimeError("PastValue or FutureValue should have two to three fixed parameters. Usage: PastValue(rows, input, [timeStep=1, defaultPastValue=0.1]).");
// TODO: allow a tensor descriptor. Or allow 0 (inference). Maybe already supported--check this.
nodeParamCount = 1;
nodeParamStart = parameter.size() > 2?2:1;
nodeParamCount = 1; // number of inputs
nodeParamStart = parameter.size() > 2?2:1; // index of input
if (pass == ndlPassInitial)
{
@ -342,24 +327,22 @@ namespace Microsoft { namespace MSR { namespace CNTK {
vector<void*> params = EvaluateParameters(node, baseName, 0, parameter.size(), pass);
size_t rows = ((NDLNode<ElemType>*)params[0])->GetScalar();
// if we have three parameters the second is columns
size_t cols = parameter.size() > 2 ? ((NDLNode<ElemType>*)params[1])->GetScalar() : 1;
// ignore legacy size_t cols = parameter.size() > 2 ? ((NDLNode<ElemType>*)params[1])->GetScalar() : 1;
bool needGradient = node->GetOptionalParameter("needGradient", "false");
//bool needGradient = node->GetOptionalParameter("needGradient", "false"); // TODO: what's this for?
float defaultHiddenActivity = node->GetOptionalParameter("defaultHiddenActivity", "0.1"); // TODO: parameter should be called 'defaultHiddenActivation'
//for backward compatibility we check timeStep first
// for backward compatibility we check 'timeStep' first
size_t timeStep = node->GetOptionalParameter("timeStep", "1");
if (timeStep == 1)
{
timeStep = node->GetOptionalParameter("delayTime", "1");
}
if (cnNodeType == OperationNameOf(PastValueNode))
nodePtr = builder.PastValue(NULL, defaultHiddenActivity, rows, cols, timeStep, name);
nodePtr = builder.PastValue(NULL, defaultHiddenActivity, rows, timeStep, name);
else
nodePtr = builder.FutureValue(NULL, defaultHiddenActivity, rows, cols, timeStep, name);
nodePtr = builder.FutureValue(NULL, defaultHiddenActivity, rows, timeStep, name);
nodePtr->SetParameterUpdateRequired(needGradient); // TODO: what's this for?
//nodePtr->SetParameterUpdateRequired(needGradient); // TODO: what's this for?
}
}
else if (cnNodeType == OperationNameOf(ConvolutionNode))
@ -383,16 +366,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t outputChannels = ((NDLNode<ElemType>*)params[id++])->GetScalar();
size_t horizontalSubsample = ((NDLNode<ElemType>*)params[id++])->GetScalar();
size_t verticalSubsample = ((NDLNode<ElemType>*)params[id++])->GetScalar();
assert (id == 5);
//optional
// optional
ImageLayoutKind imageLayoutKind = ImageLayoutKindFrom(node->GetOptionalParameter("imageLayout", "HWC"));
bool zeroPadding = node->GetOptionalParameter("zeroPadding", "false");
size_t maxTempMemSizeInSamples = node->GetOptionalParameter("maxTempMemSizeInSamples", "0");
nodePtr = builder.Convolution(NULL, NULL, kernelWidth, kernelHeight, outputChannels,
horizontalSubsample, verticalSubsample, zeroPadding, name, maxTempMemSizeInSamples);
horizontalSubsample, verticalSubsample, imageLayoutKind, zeroPadding, maxTempMemSizeInSamples, name);
}
}
else if (cnNodeType == OperationNameOf(MaxPoolingNode))
@ -415,11 +397,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t windowHeight = ((NDLNode<ElemType>*)params[id++])->GetScalar();
size_t horizontalSubsample = ((NDLNode<ElemType>*)params[id++])->GetScalar();
size_t verticalSubsample = ((NDLNode<ElemType>*)params[id++])->GetScalar();
assert (id == 4);
ImageLayoutKind imageLayoutKind = ImageLayoutKindFrom(node->GetOptionalParameter("imageLayout", "HWC"));
nodePtr = builder.MaxPooling(NULL, /*inputWidth,inputHeight, channels,*/windowWidth, windowHeight,
horizontalSubsample, verticalSubsample, name);
horizontalSubsample, verticalSubsample, imageLayoutKind, name);
}
}
else if (cnNodeType == OperationNameOf(AveragePoolingNode))
@ -442,11 +425,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t windowHeight = ((NDLNode<ElemType>*)params[id++])->GetScalar();
size_t horizontalSubsample = ((NDLNode<ElemType>*)params[id++])->GetScalar();
size_t verticalSubsample = ((NDLNode<ElemType>*)params[id++])->GetScalar();
assert(id == 4);
assert (id == 4);
ImageLayoutKind imageLayoutKind = ImageLayoutKindFrom(node->GetOptionalParameter("imageLayout", "HWC"));
nodePtr = builder.AveragePooling(NULL, /*inputWidth,inputHeight, channels,*/windowWidth, windowHeight,
horizontalSubsample, verticalSubsample, name);
horizontalSubsample, verticalSubsample, imageLayoutKind, name);
}
}
else if (cnNodeType == OperationNameOf(BatchNormalizationNode))
@ -543,6 +527,32 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
}
// ProcessTensorShapeParameters - assume positional parameters starting from position i are tensor dimensions--parse those.
// If isImage, the result must be a 3D tensor, which is interpreted as (W,H,C); the optional parameter 'imageLayout' says how.
template<class ElemType>
TensorShape SynchronousNodeEvaluator<ElemType>::ProcessTensorShapeParameters(const NDLNode<ElemType>* node, const vector<void*> & params, size_t & i, bool isImage, const wstring & cnNodeType/*for error messages only*/)
{
// gather dims
vector<size_t> dims;
dims.push_back(((NDLNode<ElemType>*)params[i])->GetScalar()); // first is mandatory
for (i++; i < params.size(); i++)
dims.push_back(((NDLNode<ElemType>*)params[i])->GetScalar());
// turn into tensor
TensorShape tensorShape(dims);
// if image then interpret as W, H, C with layout according to optional imageLayout parameter
if (isImage)
{
if (dims.size() != 3)
RuntimeError("%ls should have 3 parameters [width, height, numChannels].", cnNodeType.c_str());
ImageLayoutKind imageLayoutKind = ImageLayoutKindFrom(node->GetOptionalParameter("imageLayout", "HWC"));
tensorShape = ImageDimensions::AsTensorShape(tensorShape[0], tensorShape[1], tensorShape[2], imageLayoutKind);
}
return tensorShape;
}
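For illustration, a minimal standalone sketch (hypothetical helper, not part of the CNTK sources) of the same parsing idea: the already-evaluated positional scalars become tensor dimensions, and a 3-element image shape is reordered according to the layout string, mirroring ProcessTensorShapeParameters and ImageDimensions::AsTensorShape.

#include <cstddef>
#include <stdexcept>
#include <string>
#include <vector>

// Hypothetical sketch: dims are the already-evaluated positional scalar parameters.
static std::vector<size_t> MakeShape(const std::vector<size_t>& dims, bool isImage, const std::string& imageLayout = "HWC")
{
    if (dims.empty())
        throw std::invalid_argument("at least one dimension is required");
    if (!isImage)
        return dims;                                  // plain tensor: use dims as given
    if (dims.size() != 3)
        throw std::invalid_argument("image shapes need [width, height, numChannels]");
    size_t w = dims[0], h = dims[1], c = dims[2];
    if (imageLayout == "CHW") return { w, h, c };     // cudnn order
    else                      return { c, w, h };     // legacy HWC order
}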
template class SynchronousExecutionEngine<float>;
template class SynchronousExecutionEngine<double>;


@ -290,7 +290,7 @@ public:
{
fprintf(stderr, "'multiSeq' tag is defunct.\n");
}
else if (!_strnicmp(value.c_str(), "eval", 4)) // only compare the first 4 characters
else if (!_strnicmp(value.c_str(), "eval", 4)) // only compare the first 4 characters. Yikes!!
{
SetOutputNode(m_net->EvaluationNodes(), compNode);
}
@ -326,9 +326,10 @@ public:
return nullptr;
}
virtual ~SynchronousNodeEvaluator()
{
}
virtual ~SynchronousNodeEvaluator() { }
protected:
TensorShape ProcessTensorShapeParameters(const NDLNode<ElemType>* node, const vector<void*> & params, size_t & i, bool isImage, const wstring & cnNodeType/*for error messages only*/);
private:
ComputationNetworkPtr m_net;


@ -489,7 +489,7 @@ namespace Microsoft { namespace MSR { namespace ScriptableObjects {
std::vector<C> res;
res.reserve(GetSize(Fail));
for (const auto & val : values)
res.push_back(val);
res.push_back(val.ResolveValue()); // resolve upon access
return res;
}
};


@ -196,7 +196,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
LogicError("AddSequence: Sequence added to an MBLayout must overlap with minibatch.");
// remember it
#ifdef _DEBUG
#if 0//def _DEBUG
auto cap = m_sequences.capacity(); // Some sanity check for debugging a speed regression. This should only show up during the first minibatches, and growing only.
m_sequences.push_back(seqDesc);
if (cap != m_sequences.capacity())


@ -1,6 +1,6 @@
// DataTensor.h -- tensor descriptor that describes the inner structure of data vectors
// TensorShape.h -- tensor descriptor that describes the inner structure of data vectors
//
// <copyright file="Sequences.h" company="Microsoft">
// <copyright file="TensorShape.h" company="Microsoft">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//
@ -90,6 +90,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
T m_data[12];
size_t m_size;
#ifdef _DEBUG
void DebugWipe() { memset(m_data, 0, sizeof(m_data)); } // initialize to 0 to make it look prettier in the debugger
#else
void DebugWipe() { }
#endif
public:
size_t capacity() const { return _countof(m_data); }
size_t size() const { return m_size; }
@ -103,12 +108,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template<class ITER>
void assign(ITER beg, const ITER & end) { clear(); append(beg,end); }
void operator=(const SmallVector & other) { m_size = other.m_size; memcpy(m_data, other.m_data, other.m_size * sizeof(T)); }
SmallVector(const SmallVector & other) { *this = other; }
SmallVector(size_t sz, const T & val) { assign(sz, val); }
SmallVector(const SmallVector & other) { DebugWipe(); *this = other; }
SmallVector(size_t sz, const T & val) { DebugWipe(); assign(sz, val); }
SmallVector(size_t sz) : SmallVector(sz, 0) { }
SmallVector() : SmallVector(0) { }
SmallVector(const std::vector<T> & v) { assign(v.begin(), v.end()); }
SmallVector(const std::initializer_list<T> & l) { assign(l.begin(), l.end()); }
SmallVector(const std::vector<T> & v) { DebugWipe(); assign(v.begin(), v.end()); }
SmallVector(const std::initializer_list<T> & l) { DebugWipe(); assign(l.begin(), l.end()); }
bool operator==(const SmallVector & other) const { return size() == other.size() && !memcmp(data(), other.data(), other.m_size * sizeof(T)); }
bool operator!=(const SmallVector & other) const { return !operator==(other); } // duh
T operator[](size_t i) const { if (i >= size()) LogicError("SmallVector: index overflow"); return m_data[i]; }
@ -203,28 +208,28 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
}
void Load(File& fstream)
void Load(File& fstream, bool acceptLegacyFormat = false)
{
// format: uint32_t n, dim[0], dim[1], ..., dim[n-1]
// We are also able to read (but not write) an older format, which stores 3-dimensional tensors as size_t W, H, C
uint32_t n, dim;
fstream >> n >> dim;
if (dim) // heuristic to detect the old format. Old format stores a size_t, i.e. the second uint32_t is 0 (no dimensions are > 4G)
uint32_t rank, dim0;
fstream >> rank >> dim0;
if (!acceptLegacyFormat || dim0 != 0) // heuristic to detect the old format. Old format stores a size_t, i.e. the second uint32_t is 0 (no dimensions are > 4G)
{
m_dims.resize(n);
m_dims[0] = dim;
for (size_t i = 1; i < n; i++)
m_dims.resize(rank);
m_dims[0] = dim0;
for (size_t i = 1; i < rank; i++)
{
fstream >> dim;
m_dims[i] = dim;
fstream >> dim0;
m_dims[i] = dim0;
}
assert(n == m_dims.size());
assert(rank == m_dims.size());
}
else // detected the old size_t W, H, C format
{
m_dims.resize(3); // current format is hard-coded for 3, for back compat
m_dims[1] = n;
fstream >> m_dims[2] >> m_dims[0]; // currently stored in order W, H, C. TODO: general tensor format will be different
m_dims.resize(3);
m_dims[1] = rank;
fstream >> m_dims[2] >> m_dims[0]; // legacy file order is W, H, C (W was already read above); dims are stored as (C, W, H)
}
InitAsNoSlice();
}
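The detection heuristic above can be illustrated with a small standalone check (a hypothetical reader, assuming the header has already been consumed as two uint32_t words): a legacy file begins with a 64-bit size_t W below 4G, so its second 32-bit word reads back as zero, whereas the new rank/dim format starts with a rank followed by a non-zero first dimension for any real tensor.

#include <cstdint>
#include <cstdio>

// Hypothetical sketch of the format heuristic in Load(): word0/word1 are the first two uint32_t of the stream.
static bool LooksLikeLegacyHeader(uint32_t word0, uint32_t word1, bool acceptLegacyFormat)
{
    (void)word0;                               // ambiguous: rank (new format) or low half of size_t W (legacy)
    return acceptLegacyFormat && word1 == 0;   // a legacy size_t W < 4G serializes with a zero upper word
}

int main()
{
    std::printf("%d\n", LooksLikeLegacyHeader(3, 512, true));  // 0: new format (rank=3, dim[0]=512)
    std::printf("%d\n", LooksLikeLegacyHeader(640, 0, true));  // 1: legacy size_t W=640; H and C follow
    return 0;
}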
@ -243,13 +248,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
const SmallVector<size_t> & GetDims() const { return m_dims; } // get all, e.g. for logging or for constructing derived tensors with edited dimensions
const SmallVector<ptrdiff_t> & GetStrides() const { return m_strides; }
// interpretation as an image tensor
size_t GetNumChannels() const { if (m_dims.empty()) return 0; else return m_dims.size() > 0 ? m_dims[0] : 1; }
size_t GetWidth() const { if (m_dims.empty()) return 0; else return m_dims.size() > 1 ? m_dims[1] : 1; }
size_t GetHeight() const { if (m_dims.empty()) return 0; else return m_dims.size() > 2 ? m_dims[2] : 1; }
// heuristics used for pretty-printing
// TODO: This will go away.
bool IsInputAnImage() const { return GetRank() == 3 && (GetWidth() != 1 || GetNumChannels() != 1); }
// legacy helper function for RowSliceNode. Will go away.
bool IsVectorStoredAsImage() const { return GetRank() == 3 && m_dims[0] == 1 && m_dims[1] == 1; }
// indexing
@ -316,8 +315,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// m_dims = I 1 J K
// m_strides = 1 I I I*J
// dropping the second dimension
// m_dims = I % J K
// m_strides = 1 % I I*J
// m_dims = I J K
// m_strides = 1 I I*J
m_dims[j] = m_dims[k];
m_strides[j] = m_strides[k];
j++;
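A tiny standalone check (hypothetical values, not CNTK code) of the bookkeeping described in the comment above: dropping a singleton dimension keeps every element at the same flat offset, because the dropped stride is only ever multiplied by an index that is 0.

#include <cassert>

int main()
{
    // dims (I,1,J,K) with I=2, J=3, K=4: column-major strides (1, I, I, I*J) = (1, 2, 2, 6)
    const int before[4] = { 1, 2, 2, 6 };
    // after dropping the singleton dimension: dims (2,3,4), strides (1, 2, 6)
    const int after[3]  = { 1, 2, 6 };
    for (int i = 0; i < 2; i++)
        for (int j = 0; j < 3; j++)
            for (int k = 0; k < 4; k++)
                assert(i * before[0] + 0 * before[1] + j * before[2] + k * before[3]
                    == i * after[0]  + j * after[1]  + k * after[2]);
    return 0;
}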
@ -442,15 +441,61 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// TODO: Does the same trick work for 2D images?
};
// When constructing an image tensor with the usual W, H, C format, use the following function instead.
// This will sort the three parameters into the correct order.
// BUGBUG: at several places, a comment says "after multiplication the structure is lost" and the vector dimension
// is set as the image height. However, the image height is actually the wrong dimension since images are assumed transposed.
// This will get fixed once we get more complete arbitrary tensor support throughout, including better-defined inference rules.
static inline TensorShape ImageLayoutWHC(size_t width, size_t height, size_t channels)
// image layouts used in CNTK
// Nodes that do semantic interpretation of width, height, channel information must know which index they are in.
// Eventually this can go away once we switch completely to cudnn layout.
// The cudnn layout is actually our layout in order W,H,C.
enum ImageLayoutKind
{
return TensorShape(channels, width, height);
HWC, // legacy; default for NDL
CHW // cudnn; default for BrainScript
};
static inline std::string ToString(ImageLayoutKind imageLayoutKind)
{
if (imageLayoutKind == ImageLayoutKind::CHW) return "CHW";
else if (imageLayoutKind == ImageLayoutKind::HWC) return "HWC";
else LogicError("ImageLayout: Invalid ImageLayoutKind");
}
// TODO: we need a constructor from config; that will allow us to generalize
static inline ImageLayoutKind ImageLayoutKindFrom(const wstring & s)
{
if (s == L"CHW" || s == L"cudnn") return ImageLayoutKind::CHW;
else if (s == L"HWC" || s == L"legacy") return ImageLayoutKind::HWC;
else InvalidArgument("ImageLayoutKindFrom: Unknown ImageLayoutKind '%ls', must be 'CHW' (cudnn) or 'HWC' (CNTK legacy)", s.c_str());
}
// interpret TensorShape as an image descriptor
// considering that we support two ways of storing images
struct ImageDimensions
{
size_t m_width, m_height, m_numChannels;
// interpret TensorShape as image
ImageDimensions(const TensorShape & shape, ImageLayoutKind imageLayoutKind)
{
if (shape.GetRank() != 3)
InvalidArgument("Convolution operation currently only supports 1D or 2D convolution on 3D tensors.");
if (imageLayoutKind == ImageLayoutKind::CHW)
{
m_width = shape[0];
m_height = shape[1];
m_numChannels = shape[2];
}
else if (imageLayoutKind == ImageLayoutKind::HWC)
{
m_width = shape[1];
m_height = shape[2];
m_numChannels = shape[0];
}
else LogicError("WHC: Invalid ImageLayoutKind");
}
ImageDimensions(size_t width, size_t height, size_t numChannels) : m_width(width), m_height(height), m_numChannels(numChannels) {}
// interpret image as TensorShape
static TensorShape AsTensorShape(size_t width, size_t height, size_t numChannels, ImageLayoutKind imageLayoutKind/* = ImageLayoutKind::HWC*/)
{
if (imageLayoutKind == ImageLayoutKind::CHW) return TensorShape(width, height, numChannels);
else if (imageLayoutKind == ImageLayoutKind::HWC) return TensorShape(numChannels, width, height);
else LogicError("ImageLayout: Invalid ImageLayoutKind");
}
TensorShape AsTensorShape(ImageLayoutKind imageLayoutKind) { return AsTensorShape(m_width, m_height, m_numChannels, imageLayoutKind); }
};
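A brief usage sketch of the two declarations above (assuming TensorShape and ImageDimensions from this header are in scope; the function name and values are made up for illustration): the same 640x480 RGB image yields different dimension orders under the two layouts, and ImageDimensions recovers W, H, C from either.

void ImageLayoutExample()   // hypothetical helper, for illustration only
{
    TensorShape legacy = ImageDimensions::AsTensorShape(640, 480, 3, ImageLayoutKind::HWC); // dims = (3, 640, 480)
    TensorShape cudnn  = ImageDimensions::AsTensorShape(640, 480, 3, ImageLayoutKind::CHW); // dims = (640, 480, 3)
    ImageDimensions img(cudnn, ImageLayoutKind::CHW);  // img.m_width == 640, img.m_height == 480, img.m_numChannels == 3
    (void)legacy; (void)img;
}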
}}}


@ -251,7 +251,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Base::Load(fstream, modelVersion);
fstream >> m_hasComputed;
LoadValue(fstream);
}
// Note: This loses the sample layout, but that is recovered by Validate().
}
virtual void DumpNodeInfo(const bool printValues, File& fstream) const override
{


@ -654,8 +654,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
for (auto nodeIter = convolutionNodes.begin(); nodeIter != convolutionNodes.end(); nodeIter++)
{
auto node = dynamic_pointer_cast<ConvolutionNode<float>>(*nodeIter);
node->SetmMaxTempMemSizeInSamples(maxTempMemSizeInSamples);
auto nodef = dynamic_pointer_cast<ConvolutionNode<float>>(*nodeIter);
if (nodef)
nodef->SetmMaxTempMemSizeInSamples(maxTempMemSizeInSamples);
auto noded = dynamic_pointer_cast<ConvolutionNode<double>>(*nodeIter);
if (noded)
noded->SetmMaxTempMemSizeInSamples(maxTempMemSizeInSamples);
}
}
}

Просмотреть файл

@ -35,7 +35,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// please keep this table sorted
if (nodeType == OperationNameOf(CRFNode)) return New<CRFNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(ClassBasedCrossEntropyWithSoftmaxNode))return New<ClassBasedCrossEntropyWithSoftmaxNode<ElemType>>(forward<_Types>(_Args)...);
#if 0// change once we no longer see a perf hit to #ifdef ENABLE_TENSORVIEW
#ifdef ENABLE_BROADCASTING_ELEMENTTIMES
else if (nodeType == L"ColumnElementTimes") return New<ElementTimesNode<ElemType>>(forward<_Types>(_Args)...);
#else
else if (nodeType == OperationNameOf(ColumnElementTimesNode)) return New<ColumnElementTimesNode<ElemType>>(forward<_Types>(_Args)...);
@ -76,7 +76,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
else if (nodeType == OperationNameOf(ReconcileMBLayoutNode)) return New<ReconcileMBLayoutNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(RectifiedLinearNode)) return New<RectifiedLinearNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(ReshapeNode)) return New<ReshapeNode<ElemType>>(forward<_Types>(_Args)...);
#if 0// change once we no longer see a perf hit to #ifdef ENABLE_TENSORVIEW
#ifdef ENABLE_BROADCASTING_ELEMENTTIMES
else if (nodeType == L"RowElementTimes") return New<ElementTimesNode<ElemType>>(forward<_Types>(_Args)...);
#else
else if (nodeType == OperationNameOf(RowElementTimesNode)) return New<RowElementTimesNode<ElemType>>(forward<_Types>(_Args)...);
@ -85,7 +85,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
else if (nodeType == OperationNameOf(DiagonalNode)) return New<DiagonalNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(RowSliceNode)) return New<RowSliceNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(RowStackNode)) return New<RowStackNode<ElemType>>(forward<_Types>(_Args)...);
#if 0// change once we no longer see a perf hit to #ifdef ENABLE_TENSORVIEW
#ifdef ENABLE_BROADCASTING_ELEMENTTIMES
else if (nodeType == L"Scale") return New<ElementTimesNode<ElemType>>(forward<_Types>(_Args)...);
#else
else if (nodeType == OperationNameOf(ScaleNode)) return New<ScaleNode<ElemType>>(forward<_Types>(_Args)...);
@ -107,6 +107,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
else if (nodeType == L"Delay") return New<PastValueNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == L"PerDimMeanVarNormalizationNode") return New<PerDimMeanVarNormalizationNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == L"PerDimMeanVarNormalizationNode") return New<PerDimMeanVarNormalizationNode<ElemType>>(forward<_Types>(_Args)...);
#if 1
else if (nodeType == OperationNameOf(DeprecatedReshapeNode)) return New<DeprecatedReshapeNode<ElemType>>(forward<_Types>(_Args)...);
#endif
else InvalidArgument("Attempted to instantiate undefined operation %ls.", nodeType.c_str());
}
@ -116,14 +119,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
static shared_ptr<ComputationNode<ElemType>> CreateNode(const std::wstring & nodeType, _Types&&... _Args)
{
// check more types
if (nodeType == OperationNameOf(AveragePoolingNode)) return New<AveragePoolingNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(ConvolutionNode)) return New<ConvolutionNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(SparseInputValue)) return New<SparseInputValue<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(InputValue)) return New<InputValue<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(LearnableParameter)) return New<LearnableParameter<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(MaxPoolingNode)) return New<MaxPoolingNode<ElemType>>(forward<_Types>(_Args)...);
if (nodeType == OperationNameOf(AveragePoolingNode)) return New<AveragePoolingNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(BatchNormalizationNode)) return New<BatchNormalizationNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(ConvolutionNode)) return New<ConvolutionNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(SparseInputValue)) return New<SparseInputValue<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(InputValue)) return New<InputValue<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(LearnableParameter)) return New<LearnableParameter<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(MaxPoolingNode)) return New<MaxPoolingNode<ElemType>>(forward<_Types>(_Args)...);
//else if (nodeType == OperationNameOf(SparseLearnableParameter)) return New<SparseLearnableParameter<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(BatchNormalizationNode)) return New<BatchNormalizationNode<ElemType>>(forward<_Types>(_Args)...);
else return CreateStandardNode<ElemType>(nodeType, forward<_Types>(_Args)...);
}
@ -175,6 +178,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return net.AddNodeToNetWithElemType(New<LearnableParameter<ElemType>>(net.GetDeviceId(), paramName, rows, cols));
}
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateLearnableParameter(const std::wstring & paramName, const TensorShape & tensorShape)
{
return net.AddNodeToNetWithElemType(New<LearnableParameter<ElemType>>(net.GetDeviceId(), paramName, tensorShape));
}
#if 0 // not functional at present
//sparse matrix size is optionally specified
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateSparseLearnableParameter(const std::wstring & paramName, const size_t rows, const size_t cols, const size_t size)
@ -183,28 +191,24 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
#endif
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateInputNode(const std::wstring & inputName, const size_t rows, const size_t cols)
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateInputNode(const std::wstring & inputName, const size_t rows)
{
return net.AddNodeToNetWithElemType(New<InputValue<ElemType>>(net.GetDeviceId(), inputName, rows, cols));
return net.AddNodeToNetWithElemType(New<InputValue<ElemType>>(net.GetDeviceId(), inputName, rows));
}
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateSparseInputNode(const std::wstring & inputName, const size_t rows, const size_t cols)
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateSparseInputNode(const std::wstring & inputName, const size_t rows)
{
return net.AddNodeToNetWithElemType(New<SparseInputValue<ElemType>>(net.GetDeviceId(), inputName, rows, cols));
return net.AddNodeToNetWithElemType(New<SparseInputValue<ElemType>>(net.GetDeviceId(), inputName, rows));
}
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateInputNode(const std::wstring & inputName,
const TensorShape & imageLayout,
const size_t numImages)
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateInputNode(const std::wstring & inputName, const TensorShape & sampleLayout)
{
return net.AddNodeToNetWithElemType(New<InputValue<ElemType>>(net.GetDeviceId(), inputName, imageLayout, numImages));
return net.AddNodeToNetWithElemType(New<InputValue<ElemType>>(net.GetDeviceId(), inputName, sampleLayout));
}
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateSparseInputNode(const std::wstring & inputName,
const TensorShape & imageLayout,
const size_t numImages)
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateSparseInputNode(const std::wstring & inputName, const TensorShape & imageLayout)
{
return net.AddNodeToNetWithElemType(New<SparseInputValue<ElemType>>(net.GetDeviceId(), inputName, imageLayout, numImages));
return net.AddNodeToNetWithElemType(New<SparseInputValue<ElemType>>(net.GetDeviceId(), inputName, imageLayout));
}
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreatePairNetworkNode(const std::wstring & inputName, const size_t rows, const size_t cols)
@ -215,37 +219,26 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateConvolutionNode(const std::wstring & nodeName,
const size_t kernelWidth, const size_t kernelHeight, const size_t outputChannels,
const size_t horizontalSubsample, const size_t verticalSubsample,
const bool zeroPadding,
ImageLayoutKind imageLayoutKind, const bool zeroPadding,
const size_t maxTempMemSizeInSamples)
{
return net.AddNodeToNetWithElemType(New<ConvolutionNode<ElemType>>(net.GetDeviceId(), nodeName,
kernelWidth, kernelHeight,
outputChannels,
horizontalSubsample,
verticalSubsample, zeroPadding,
maxTempMemSizeInSamples));
kernelWidth, kernelHeight, outputChannels,
horizontalSubsample, verticalSubsample, imageLayoutKind,
zeroPadding,
maxTempMemSizeInSamples));
}
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateMaxPoolingNode(const std::wstring & nodeName,
const size_t windowWidth,
const size_t windowHeight,
const size_t horizontalSubsample,
const size_t verticalSubsample)
const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind)
{
return net.AddNodeToNetWithElemType(New<MaxPoolingNode<ElemType>>(net.GetDeviceId(), nodeName,
windowWidth, windowHeight,
horizontalSubsample,
verticalSubsample));
return net.AddNodeToNetWithElemType(New<MaxPoolingNode<ElemType>>(net.GetDeviceId(), nodeName, windowWidth, windowHeight, horizontalSubsample, verticalSubsample, imageLayoutKind));
}
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateAveragePoolingNode(const std::wstring & nodeName, const size_t windowWidth,
const size_t windowHeight, const size_t horizontalSubsample,
const size_t verticalSubsample)
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateAveragePoolingNode(const std::wstring & nodeName,
const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind)
{
return net.AddNodeToNetWithElemType(New<AveragePoolingNode<ElemType>>(net.GetDeviceId(), nodeName,
windowWidth, windowHeight,
horizontalSubsample,
verticalSubsample));
return net.AddNodeToNetWithElemType(New<AveragePoolingNode<ElemType>>(net.GetDeviceId(), nodeName, windowWidth, windowHeight, horizontalSubsample, verticalSubsample, imageLayoutKind));
}
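As a usage illustration of the imageLayoutKind-aware builder methods above (a hedged sketch; 'builder' is assumed to be an existing ComputationNetworkBuilder<float>, and the node names and sizes are made up):

// Hypothetical usage, for illustration only:
auto conv = builder.CreateConvolutionNode(L"conv1",
                                          /*kernelWidth=*/5, /*kernelHeight=*/5, /*outputChannels=*/32,
                                          /*horizontalSubsample=*/1, /*verticalSubsample=*/1,
                                          ImageLayoutKind::CHW, /*zeroPadding=*/true, /*maxTempMemSizeInSamples=*/0);
auto pool = builder.CreateMaxPoolingNode(L"pool1", /*windowWidth=*/2, /*windowHeight=*/2,
                                         /*horizontalSubsample=*/2, /*verticalSubsample=*/2, ImageLayoutKind::CHW);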
// this is the catch-all for all cases not covered as special cases above
@ -274,49 +267,30 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Convolution(const ComputationNodePtr weight,
const ComputationNodePtr inputValues,
const size_t kernelWidth,
const size_t kernelHeight,
const size_t outputChannels,
const size_t horizontalSubsample,
const size_t verticalSubsample,
const bool zeroPadding,
const std::wstring nodeName,
const size_t maxTempMemSizeInSamples)
const size_t kernelWidth, const size_t kernelHeight, const size_t outputChannels, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind, const bool zeroPadding, const size_t maxTempMemSizeInSamples,
const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<ConvolutionNode<ElemType>>(net.GetDeviceId(), nodeName,
kernelWidth, kernelHeight,
outputChannels,
horizontalSubsample,
verticalSubsample, zeroPadding,
kernelWidth, kernelHeight, outputChannels, horizontalSubsample, verticalSubsample, imageLayoutKind, zeroPadding,
maxTempMemSizeInSamples),
weight, inputValues);
}
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::MaxPooling(const ComputationNodePtr inputValues,
const size_t windowWidth,
const size_t windowHeight,
const size_t horizontalSubsample,
const size_t verticalSubsample,
const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind,
const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<MaxPoolingNode<ElemType>>(net.GetDeviceId(), nodeName,
windowWidth, windowHeight,
horizontalSubsample,
verticalSubsample),
windowWidth, windowHeight, horizontalSubsample, verticalSubsample, imageLayoutKind),
inputValues);
}
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::AveragePooling(const ComputationNodePtr inputValues,
const size_t windowWidth,
const size_t windowHeight,
const size_t horizontalSubsample,
const size_t verticalSubsample,
const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind,
const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<AveragePoolingNode<ElemType>>(net.GetDeviceId(), nodeName,
windowWidth, windowHeight,
horizontalSubsample,
verticalSubsample),
windowWidth, windowHeight, horizontalSubsample, verticalSubsample, imageLayoutKind),
inputValues);
}
@ -486,7 +460,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return net.AddNodeToNetAndAttachInputs(New<SumElementsNode<ElemType>>(net.GetDeviceId(), nodeName), a);
}
#if 1// change once we no longer see a perf hit to #ifndef ENABLE_TENSORVIEW
#ifndef ENABLE_BROADCASTING_ELEMENTTIMES
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Scale(const ComputationNodePtr scalar, const ComputationNodePtr matrix, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<ScaleNode<ElemType>>(net.GetDeviceId(), nodeName), scalar, matrix);
@ -513,7 +487,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return net.AddNodeToNetAndAttachInputs(New<ElementTimesNode<ElemType>>(net.GetDeviceId(), nodeName), a, b);
}
#if 1// change once we no longer see a perf hit to #ifndef ENABLE_TENSORVIEW
#ifndef ENABLE_BROADCASTING_ELEMENTTIMES
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::RowElementTimes(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<RowElementTimesNode<ElemType>>(net.GetDeviceId(), nodeName), a, b);
@ -561,12 +535,20 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Reshape(const ComputationNodePtr a,
const size_t numRows,
const TensorShape & imageLayout,
const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<ReshapeNode<ElemType>>(net.GetDeviceId(), nodeName, numRows, imageLayout), a);
return net.AddNodeToNetAndAttachInputs(New<ReshapeNode<ElemType>>(net.GetDeviceId(), nodeName, imageLayout), a);
}
#if 1
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::DeprecatedReshape(const ComputationNodePtr a,
const size_t numRows,
const TensorShape & imageLayout,
const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<DeprecatedReshapeNode<ElemType>>(net.GetDeviceId(), nodeName, numRows, imageLayout), a);
}
#endif
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::RowRepeat(const ComputationNodePtr a, const size_t num_repeat, const std::wstring nodeName)
{
@ -578,14 +560,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return net.AddNodeToNetAndAttachInputs(New<DiagonalNode<ElemType>>(net.GetDeviceId(), nodeName), a);
}
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::PastValue(const ComputationNodePtr a, const float initHiddenActivity, const size_t row_size, const size_t col_size, size_t timeStep, const std::wstring nodeName)
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::PastValue(const ComputationNodePtr a, const float initHiddenActivity, const size_t row_size, size_t timeStep, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<PastValueNode<ElemType>>(net.GetDeviceId(), nodeName, initHiddenActivity, row_size, col_size, timeStep), a);
return net.AddNodeToNetAndAttachInputs(New<PastValueNode<ElemType>>(net.GetDeviceId(), nodeName, initHiddenActivity, row_size, timeStep), a);
}
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::FutureValue(const ComputationNodePtr a, const float initHiddenActivity, const size_t row_size, const size_t col_size, size_t timeStep, const std::wstring nodeName)
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::FutureValue(const ComputationNodePtr a, const float initHiddenActivity, const size_t row_size, size_t timeStep, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<FutureValueNode<ElemType>>(net.GetDeviceId(), nodeName, initHiddenActivity, row_size, col_size, timeStep), a);
return net.AddNodeToNetAndAttachInputs(New<FutureValueNode<ElemType>>(net.GetDeviceId(), nodeName, initHiddenActivity, row_size, timeStep), a);
}
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Parallel(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName)


@ -9,6 +9,7 @@
#include "ComputationNetwork.h"
#include "TrainingCriterionNodes.h" // for NCEEvalMode
#include "ScriptableObjects.h"
#include "TensorShape.h"
#include <string>
namespace Microsoft { namespace MSR { namespace CNTK {
@ -39,47 +40,34 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// TODO: separate into nodes that have inputs and those that duplicate functions with input adding except just not adding inputs. Clear?
ComputationNodePtr CreateLearnableParameter(const std::wstring & paramName, const size_t rows, const size_t cols);
ComputationNodePtr CreateLearnableParameter(const std::wstring & paramName, const TensorShape & tensorShape);
//sparse matrix size is optionally specified
//ComputationNodePtr CreateSparseLearnableParameter(const std::wstring & paramName, const size_t rows, const size_t cols, const size_t size = 0);
ComputationNodePtr CreateInputNode(const std::wstring & inputName, const size_t rows, const size_t cols);
ComputationNodePtr CreateSparseInputNode(const std::wstring & inputName, const size_t rows, const size_t cols);
ComputationNodePtr CreateInputNode(const std::wstring & inputName, const TensorShape & imageLayout, const size_t numImages);
ComputationNodePtr CreateSparseInputNode(const std::wstring & inputName, const TensorShape & imageLayout, const size_t numImages);
ComputationNodePtr CreateInputNode(const std::wstring & inputName, const size_t rows);
ComputationNodePtr CreateSparseInputNode(const std::wstring & inputName, const size_t rows);
ComputationNodePtr CreateInputNode(const std::wstring & inputName, const TensorShape & sampleLayout);
ComputationNodePtr CreateSparseInputNode(const std::wstring & inputName, const TensorShape & sampleLayout);
ComputationNodePtr CreatePairNetworkNode(const std::wstring & inputName, const size_t rows, const size_t cols);
ComputationNodePtr CreateConvolutionNode(const std::wstring & nodeName, const size_t kernelWidth, const size_t kernelHeight, const size_t outputChannels, const size_t horizontalSubsample, const size_t verticalSubsample, const bool zeroPadding = false, const size_t maxTempMemSizeInSamples = 0);
ComputationNodePtr CreateMaxPoolingNode(const std::wstring & nodeName, const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample);
ComputationNodePtr CreateAveragePoolingNode(const std::wstring & nodeName, const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample);
ComputationNodePtr CreateConvolutionNode(const std::wstring & nodeName, const size_t kernelWidth, const size_t kernelHeight, const size_t outputChannels, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind, const bool zeroPadding = false, const size_t maxTempMemSizeInSamples = 0);
ComputationNodePtr CreateMaxPoolingNode(const std::wstring & nodeName, const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind);
ComputationNodePtr CreateAveragePoolingNode(const std::wstring & nodeName, const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind);
// this is the catch-all for all cases not covered as special cases above
// Unlike the specialized ones above, this one creates nodes by type given as a string.
ComputationNodePtr CreateComputationNode(const std::wstring & nodeType, const std::wstring & nodeName);
// TODO: These next three functions are wrappers around CreateXXXNode(). Remove these.
ComputationNodePtr Parameter(const size_t rows, size_t cols, const std::wstring nodeName = L"") { return CreateLearnableParameter(nodeName, rows, cols); } // TODO: remove
ComputationNodePtr Input(const size_t rows, const size_t cols, const std::wstring nodeName = L"") { return CreateInputNode(nodeName, rows, cols); } // TODO: remove
ComputationNodePtr Input(const TensorShape & imageLayout, const size_t numImages, const std::wstring nodeName = L"") { return CreateInputNode(nodeName, imageLayout, numImages); } // TODO: remove
// The following functions create nodes and link them to the network and their inputs.
// TODO: Do we need both this set and the one above that does not add inputs? Can they share more code?
ComputationNodePtr PairNetwork(const ComputationNodePtr & a, const std::wstring nodeName = L"");
ComputationNodePtr Convolution(const ComputationNodePtr weight,
const ComputationNodePtr inputValues,
const size_t kernelWidth,
const size_t kernelHeight,
const size_t outputChannels,
const size_t horizontalSubsample,
const size_t verticalSubsample,
const bool zeroPadding = false,
const std::wstring nodeName = L"",
const size_t maxTempMemSizeInSamples = 0);
const size_t kernelWidth, const size_t kernelHeight, const size_t outputChannels,
const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind,
const bool zeroPadding = false, const size_t maxTempMemSizeInSamples = 0,
const std::wstring nodeName = L"");
ComputationNodePtr MaxPooling(const ComputationNodePtr inputValues,
const size_t windowWidth,
const size_t windowHeight,
const size_t horizontalSubsample,
const size_t verticalSubsample,
const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind,
const std::wstring nodeName = L"");
ComputationNodePtr AveragePooling(const ComputationNodePtr inputValues,
const size_t windowWidth,
const size_t windowHeight,
const size_t horizontalSubsample,
const size_t verticalSubsample,
const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind,
const std::wstring nodeName = L"");
ComputationNodePtr ErrorPrediction(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");
ComputationNodePtr PerDimMeanVarNormalization(const ComputationNodePtr feature, const ComputationNodePtr mean, const ComputationNodePtr InvStdDev, const std::wstring nodeName = L"");
@ -111,14 +99,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
ComputationNodePtr Hardmax(const ComputationNodePtr a, const std::wstring nodeName = L"");
ComputationNodePtr LogSoftmax(const ComputationNodePtr a, const std::wstring nodeName = L"");
ComputationNodePtr Sum(const ComputationNodePtr a, const std::wstring nodeName = L"");
#if 1// change once we no longer see a perf hit to #ifndef ENABLE_TENSORVIEW
#ifndef ENABLE_BROADCASTING_ELEMENTTIMES
ComputationNodePtr Scale(const ComputationNodePtr scalar, const ComputationNodePtr matrix, const std::wstring nodeName = L"");
#endif
ComputationNodePtr Transpose(const ComputationNodePtr matrix, const std::wstring nodeName = L"");
ComputationNodePtr Times(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");
ComputationNodePtr TransposeTimes(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");
ComputationNodePtr ElementTimes(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");
#if 1// change once we no longer see a perf hit to #ifndef ENABLE_TENSORVIEW
#ifndef ENABLE_BROADCASTING_ELEMENTTIMES
ComputationNodePtr RowElementTimes(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");
ComputationNodePtr ColumnElementTimes(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");
#endif
@ -129,11 +117,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
ComputationNodePtr Plus(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");
ComputationNodePtr Minus(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");
ComputationNodePtr Dropout(const ComputationNodePtr a, const std::wstring nodeName = L"");
ComputationNodePtr Reshape(const ComputationNodePtr a, const size_t num_rows, const TensorShape & imageLayout, const std::wstring nodeName = L"");
ComputationNodePtr Reshape(const ComputationNodePtr a, const TensorShape & imageLayout, const std::wstring nodeName = L"");
#if 1 // legacy
ComputationNodePtr DeprecatedReshape(const ComputationNodePtr a, const size_t num_rows, const TensorShape & imageLayout, const std::wstring nodeName = L"");
#endif
ComputationNodePtr RowRepeat(const ComputationNodePtr a, const size_t num_repeat, const std::wstring nodeName = L"");
ComputationNodePtr Diagonal(const ComputationNodePtr a, const std::wstring nodeName = L"");
ComputationNodePtr PastValue(const ComputationNodePtr a, const float initHiddenActivity, const size_t row_size, const size_t col_size, size_t timeStep, const std::wstring nodeName = L"");
ComputationNodePtr FutureValue(const ComputationNodePtr a, const float initHiddenActivity, const size_t row_size, const size_t col_size, size_t timeStep, const std::wstring nodeName = L"");
ComputationNodePtr PastValue(const ComputationNodePtr a, const float initHiddenActivity, const size_t row_size, size_t timeStep, const std::wstring nodeName = L"");
ComputationNodePtr FutureValue(const ComputationNodePtr a, const float initHiddenActivity, const size_t row_size, size_t timeStep, const std::wstring nodeName = L"");
ComputationNodePtr Parallel(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");
ComputationNodePtr RowSlice(const ComputationNodePtr a, const size_t start_index, const size_t num_rows, const std::wstring nodeName = L"");
ComputationNodePtr RowStack(const std::vector<ComputationNodePtr> pinputs, const std::wstring nodeName = L"");

View file

@ -649,7 +649,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// We do call validate(final) as many times as needed, since stuff may have changed underneath.
node->PrintSelfBeforeValidation();
node->Validate(isFinalValidationPass/*final*/); // all nodes have been visited: do verification instead of just inference
fprintf(stderr, " -> [%lu, %s%lu]", node->GetNumRows(), node->HasMBLayout() ? "MBSize " : "", node->GetNumCols());
fprintf(stderr, " -> [%lu [%s], %s%lu]", node->GetNumRows(), string(node->GetSampleLayout()).c_str(), node->HasMBLayout() ? "MBSize " : "", node->GetNumCols());
node->m_visited = true;
// also take the opportunity to propagate m_needsGradient
auto needsGradient = node->m_needsGradient;

View file

@ -155,7 +155,7 @@
<ClInclude Include="..\Common\Include\Basics.h" />
<ClInclude Include="..\Common\Include\BestGpu.h" />
<ClInclude Include="..\Common\Include\Config.h" />
<ClInclude Include="..\Common\Include\DataTensor.h" />
<ClInclude Include="..\Common\Include\TensorShape.h" />
<ClInclude Include="..\Common\Include\File.h" />
<ClInclude Include="..\Common\Include\fileutil.h" />
<ClInclude Include="..\Common\Include\Platform.h" />

View file

@ -117,7 +117,7 @@
<ClInclude Include="EsotericNodes.h">
<Filter>Nodes</Filter>
</ClInclude>
<ClInclude Include="..\Common\Include\DataTensor.h">
<ClInclude Include="..\Common\Include\TensorShape.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="..\Common\Include\Config.h">

View file

@ -9,7 +9,7 @@
#include "ComputationNode.h"
#include "InputAndParamNodes.h"
#include "ComputationNetworkBuilder.h" // TODO: We should only pull in NewComputationNodeFromConfig(). Nodes should not know about network at large.
#include "DataTensor.h"
#include "TensorShape.h"
#ifndef let
#define let const auto
@ -72,6 +72,7 @@ namespace Microsoft {
size_t rows0 = Input(0)->GetNumRows(), cols0 = Input(0)->GetNumCols();
size_t rows1 = Input(1)->GetNumRows(), cols1 = Input(1)->GetNumCols();
#if 1//ndef ENABLE_TENSORVIEW
// TODO: This test will go away once we switch to full tensor lib.
if (isFinalValidationPass && !(
(rows0 == rows1 && (Input(0)->GetMBLayout() == Input(1)->GetMBLayout() || cols0 == cols1)) || // matching size (obvious case)
@ -81,6 +82,9 @@ namespace Microsoft {
{
LogicError("The Matrix dimensions in the %ls %ls operation do not match.", NodeName().c_str(), OperationName().c_str());
}
#else
rows0; rows1;
#endif
// result has tensor shape with dimensions being the max over both
let shape0 = GetInputSampleLayout(0);
@ -98,7 +102,7 @@ namespace Microsoft {
dims[k] = dim1; // then use dimension we broadcast to
else if (dim1 == 1) // if [1] is broadcasting
; // dims is already correct
else if (dim1 != dims[k]) // no broadcasting: they must match
else if (isFinalValidationPass && dim1 != dims[k]) // no broadcasting: they must match
InvalidArgument("%ls %ls operation: Input dimensions [%s] and [%s] are not compatible.",
NodeName().c_str(), OperationName().c_str(), string(shape0).c_str(), string(shape1).c_str());
}
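// Illustrative example (hypothetical dims): broadcasting a sample layout [13 x 1] against
// [13 x 42] keeps the per-dimension max and yields [13 x 42]; [13 x 7] against [13 x 42]
// fails the check above in the final validation pass because neither dimension is 1.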
@ -181,9 +185,6 @@ namespace Microsoft {
if (m_sampleLayout.GetDim(k) == 0 || m_sampleLayout.GetDim(k) == SIZE_MAX)
layoutPlausible = false;
}
// some code initializes it to (1,1,rowDim)
if (m_sampleLayout.GetRank() == 3 && m_sampleLayout.GetDim(0) == 1 && m_sampleLayout.GetDim(1) == 1)
layoutPlausible = false;
// check dimension
if (GetNumRows() != m_sampleLayout.GetNumElements())
layoutPlausible = false;
@ -204,6 +205,8 @@ namespace Microsoft {
for (size_t i = 0; i < GetNumInputs(); i++)
{
size_t rank = Input(i)->GetAndValidateSampleLayout().GetRank();
if (!HasMBLayout()) // no MBLayout: last dim is column dimension
rank++;
if (maxRank < rank)
maxRank = rank;
}
@ -215,8 +218,9 @@ namespace Microsoft {
TensorShape ComputationNodeBase::GetTensorShape(size_t rank, const FrameRange & fr) const
{
//GetAndValidateSampleLayout(); // no need to validate because rank comes from DetermineElementwiseTensorRank() which validates all
if (!HasMBLayout()) // no MBLayout: just return sample layout (if other participants have layout, tensor lib will broadcast)
return GetSampleLayout(); // .Pad(rank); // no need for padding
if (!HasMBLayout())
return GetSampleLayout().Append(GetSampleLayout().GetRank(), GetNumCols()); // last dim is column dimension
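// Example (hypothetical dims): without an MBLayout, a node with sample layout [13 x 42]
// and 512 columns is returned as the rank-3 tensor shape [13 x 42 x 512].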
// TODO: This is not nice! Instead, if there is no MBLayout, the sample layout should describe the whole matrix.
else if (fr.IsAllFrames())
{
// we have an MBLayout, and for refers to the entire MB
@ -301,6 +305,7 @@ namespace Microsoft { namespace MSR { namespace ScriptableObjects {
static TensorShape TensorShapeFromConfig(const IConfigRecord & config)
{
const auto & valp = config[L"dims"];
// TODO: Add code that if input is already a tensor shape it is also OK.
if (valp.Is<ConfigArray>())
return TensorShape(valp.AsRef<ConfigArray>().AsVector<size_t>([&](const wstring & msg){ valp.Fail(msg); }));
else

View file

@ -10,7 +10,7 @@
#include "TensorView.h"
#include "ScriptableObjects.h"
#include "Sequences.h"
#include "DataTensor.h"
#include "TensorShape.h"
#include "MatrixPool.h"
#include <unordered_set>
@ -26,7 +26,9 @@
#include <sstream>
#include <iostream>
// #define ENABLE_TENSORVIEW // flip this switch once the tensor lib is confirmed to be working
// remove the following two #defines once the tensor lib works
#define ENABLE_TENSORVIEW // if set then tensor lib is used instead of old Matrix implementations, wherever such an implementation exists
#define ENABLE_BROADCASTING_ELEMENTTIMES // if set then ScaleNode and Row/ColumnElementTimes are redirected to ElementTimes
#define DEFAULT_HIDDEN_ACTIVATION 0.1
@ -307,6 +309,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// - PairNetworkNode
// - LSTMNode
// set our dimensions (rows, cols, sample layout)
// TODO: Separate SetDims() into versions with and without MBLayout.
void SetDims(const TensorShape & sampleLayout, size_t cols)
{
m_sampleLayout = sampleLayout;
@ -501,9 +504,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
const char * mbSizeMark = child->m_pMBLayout ? "MBSize " : "";
if (child->m_sampleLayout.GetRank() == 3 && (child->m_sampleLayout.GetWidth() != 1 || child->m_sampleLayout.GetNumChannels() != 1)) // looks like an image: use WHC notation
fprintf(stderr, "%ls[%lu {W=%lu, H=%lu, C=%lu}, %s%lu]", child->NodeName().c_str(), child->GetNumRows(),
child->m_sampleLayout.GetWidth(), child->m_sampleLayout.GetHeight(), child->m_sampleLayout.GetNumChannels(), mbSizeMark, child->GetNumCols());
if (child->m_sampleLayout.GetRank() == 3 && (child->m_sampleLayout[1] != 1 || child->m_sampleLayout[0] != 1)) // looks like an image: use WHC notation
fprintf(stderr, "%ls[%lu [%s] {W=%lu, H=%lu, C=%lu}, %s%lu]", child->NodeName().c_str(), child->GetNumRows(), string(child->m_sampleLayout).c_str(),
child->m_sampleLayout[1], child->m_sampleLayout[2], child->m_sampleLayout[0], mbSizeMark, child->GetNumCols());
//BUGBUG: This ^^ will print based on the old legacy layout, and we have no way of knowing here whether that is correct.
else if (child->m_sampleLayout.GetRank() > 1) // tensor: output the tensor dimensions --TODO: there will be no numRows in the future, only the tensor
fprintf(stderr, "%ls[%lu [%s], %s%lu]", child->NodeName().c_str(), child->GetNumRows(), string(child->m_sampleLayout).c_str(), mbSizeMark, child->GetNumCols());
else
@ -538,14 +542,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return !g_shareNodeValueMatrices || m_outputNeededDuringBackprop;
}
// TODO: Remove this.
// used from:
// - Plus/Minus/ElementTimesNode --> replace by max dim over inputs. Make this standard behavior for all binary element-wise ops.
bool IsInputAnImage(const size_t index) const
{
return m_inputs[index]->m_sampleLayout.IsInputAnImage();
}
const size_t GetNumInputs() const { return m_inputs.size(); }
virtual void SetInput(const size_t childIndex, const ComputationNodeBasePtr& node) = 0;
@ -825,7 +821,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
fstream >> Value();
// above reads dimensions, so we must update our own m_numRows/m_numCols
SetDims(TensorShape(Value().GetNumRows()), Value().GetNumCols());
// BUGBUG: This looses the sample layout (tensor shape). It should be serialized as well.
// BUGBUG: This loses the sample layout (tensor shape). The caller must know this and fix it up if needed (currently needed for LearnableParameterNode).
}
// reader updated m_functionValue--update our internal state, i.e. m_numCols
@ -1403,7 +1399,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template<class C, class... _Types> inline shared_ptr<C> New(_Types&&... _Args)
{
return make_shared<C>(forward<_Types>(_Args)...);
//return ComputationNode<typename C::OurElemType>::template New<C>(forward<_Types>(_Args)...);
}
// =======================================================================
@ -1526,7 +1521,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
#define UsingComputationNodeMembers /*without OperationName; needed to support inconsistent pattern of InputValue--TODO: This comment it out of date. */ \
protected: \
typedef shared_ptr<ComputationNode<ElemType>> ComputationNodePtr; \
using Base::m_deviceId; using Base::SetDims; using Base::SetDims1; using Base::SetNumCols; using Base::GetNumRows; using Base::GetNumCols; using Base::UpdateFunctionValuesSize; using Base::LoadValue; \
using Base::m_deviceId; using Base::GetDeviceId; using Base::SetDims; using Base::SetDims1; using Base::SetNumCols; using Base::GetNumRows; using Base::GetNumCols; using Base::UpdateFunctionValuesSize; using Base::LoadValue; \
using Base::m_pMBLayout; using Base::GetNumTimeSteps; using Base::GetNumParallelSequences; \
using Base::MaskMissingColumnsToZero; using Base::MaskMissingValueColumnsToZero; using Base::MaskMissingGradientColumnsToZero; using Base::InvalidateMissingValueColumns; using Base::InvalidateMissingGradientColumns; \
using Base::DataFor; using Base::ValueFor; using Base::Gradient; using Base::GradientFor; \
@ -1540,12 +1535,12 @@ protected: \
using Base::GetNumInputs; using Base::ZeroGradientsOfInputs; using Base::VerifyDims; \
using Base::ConstOnes; \
using Base::DetermineElementwiseTensorRank; \
using Base::GetInputSampleLayout; using Base::InferMBLayoutFromInputsForStandardCase; \
using Base::GetSampleLayout; using Base::GetInputSampleLayout; using Base::InferMBLayoutFromInputsForStandardCase; \
using Base::CopyTo; using Base::CreateUniqNodeName; using Base::DetachInputs; using Base::GetInputsFromConfig; \
using Base::DumpNodeInfo; using Base::EnumerateNodes; \
using Base::HasMBLayout; using Base::GetMBLayout; using Base::LinkToMBLayout; \
using Base::Input; using Base::SetInput; \
using Base::IsInputAnImage; using Base::IsEqualTo; using Base::IsOutputOlderThanInputs; using Base::IsLeaf; using Base::SetParameterUpdateRequired; \
using Base::IsEqualTo; using Base::IsOutputOlderThanInputs; using Base::IsLeaf; using Base::SetParameterUpdateRequired; \
using Base::Load; \
using Base::PrintNodeValuesToFile; using Base::PrintSelfBeforeValidation; \
using Base::Save; using Base::UpdateFunctionMBSize; \
@ -1570,6 +1565,31 @@ protected: /* some boilerplate goes here */ \
// a few standard base classes for N-nary operations
// =======================================================================
// -----------------------------------------------------------------------
// UnaryElementWiseNode (operand)
//
// unary elementwise operations that are implemented with the tensor lib
//
// Derived classes only need to override ForwardProp() and BackpropTo().
// -----------------------------------------------------------------------
template<class ElemType>
class UnaryElementWiseNode : public ComputationNode<ElemType>, public NumInputs<1>
{
typedef ComputationNode<ElemType> Base; UsingComputationNodeMembers;
public:
UnaryElementWiseNode(DEVICEID_TYPE deviceId, const wstring & name) :
Base(deviceId, name)
{ }
virtual void /*ComputationNodeBase::*/Validate(bool isFinalValidationPass) override
{
ValidateUnaryMap(isFinalValidationPass);
}
};
#define UsingUnaryElementwiseNodeBaseMembers UsingComputationNodeMembersBoilerplate;
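// Illustrative sketch only, not part of this change set: a hypothetical 'SquareNode'
// (the name and the node itself are assumptions for demonstration) derived from
// UnaryElementWiseNode, using only tensor-view calls that appear elsewhere in this change
// (DetermineElementwiseTensorRank, ValueTensorFor, GradientTensorFor, Assign/AddElementwiseProductOf).
template<class ElemType>
class SquareNode : public UnaryElementWiseNode<ElemType>
{
typedef UnaryElementWiseNode<ElemType> Base; UsingUnaryElementwiseNodeBaseMembers;
static const std::wstring TypeName() { return L"Square"; }
public:
DeclareConstructorFromConfigWithNumInputs(SquareNode);
SquareNode(DEVICEID_TYPE deviceId, const wstring & name) :
Base(deviceId, name)
{ }
virtual void /*ComputationNode::*/ForwardProp(const FrameRange & fr) override
{
size_t rank = DetermineElementwiseTensorRank();
auto result = ValueTensorFor(rank, fr);
auto input = Input(0)->ValueTensorFor(rank, fr.AllowBroadcast());
result.AssignElementwiseProductOf(input, input); // y = x .* x
}
virtual void /*ComputationNode::*/BackpropTo(const size_t /*inputIndex*/, const FrameRange & fr) override
{
size_t rank = DetermineElementwiseTensorRank();
auto gradient = GradientTensorFor(rank, fr);
auto inputGradient = Input(0)->GradientTensorFor(rank, fr.AllowBroadcast());
auto inputValue = Input(0)->ValueTensorFor(rank, fr.AllowBroadcast());
inputGradient.AddElementwiseProductOf(gradient, inputValue); // dx += dy .* x
inputGradient.AddElementwiseProductOf(gradient, inputValue); // applied twice: dx += 2 * dy .* x
}
};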
// -----------------------------------------------------------------------
// BinaryElementWiseNode (operand1, operand2)
//
@ -1598,13 +1618,9 @@ protected: /* some boilerplate goes here */ \
#endif
}
virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override
{
// By default, the BinaryElementWiseNode does not require any of it's input's values for computing
// the gradients of its input nodes
UNREFERENCED_PARAMETER(childIndex);
return false;
}
// By default, the BinaryElementWiseNode does not require any of its inputs' values for computing
// the gradients of its input nodes.
virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; }
virtual void /*IComputationNode::*/BeginForwardProp() override // called before first iteration step of ForwardProp()
{

View file

@ -30,9 +30,32 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// ConvolutionNode (convolutionWeights, inputFeature)
// -----------------------------------------------------------------------
// convolutional network
// This follows "high performance convolutional neural networks for document processing" by Kumar Chellapilla, Sidde Puri, and Patrice Simard.
// Each sample is stored as a column-major matrix (height, width) of float[numChannels] (r00, g00, b00, r10, g10, b10, r01, g01, b01, r11, g11, b11).
// Convolutions (incl. pooling) support two different storage formats:
// BUGBUG: These are currently hard-selected depending on circumstances, without being reflected in TensorShape.
//
// * legacy mode (CPU and GPU without cudnn): Channels are tuples of scalars
//
// This follows "high performance convolutional neural networks for document processing" by Kumar Chellapilla, Sidde Puri, and Patrice Simard.
// Each sample is stored as a column-major matrix (height, width) of float[numChannels] (r00, g00, b00, r10, g10, b10, r01, g01, b01, r11, g11, b11).
//
// - input : [C x W x H x T] or ARRAY[1..T] OF ARRAY[1..H] OF ARRAY[1..W] OF ARRAY[1..C]
// - output : [C' x W' x H' x T] or ARRAY[1..T] OF ARRAY[1..H'] OF ARRAY[1..W'] OF ARRAY[1..C']
// - filter : [C' x W" x H" x C ] or ARRAY[1..C] OF ARRAY[1..H"] OF ARRAY[1..W"] OF ARRAY[1..C']
//
// * GPU with cudnn: Channels are planes
//
// - input : [W x H x C x T] or ARRAY[1..T] OF ARRAY[1..C] OF ARRAY[1..H] OF ARRAY[1..W]
// - output : [W' x H' x C' x T] or ARRAY[1..T] OF ARRAY[1..C'] OF ARRAY[1..H'] OF ARRAY[1..W']
// - filter : [W" x H" x C x C' ] or ARRAY[1..C'] OF ARRAY[1..C] OF ARRAY[1..H] OF ARRAY[1..W]
//
// where:
// - using ' for output and " for filter
// - T = samples (NVidia calls this N)
// - W, H = width, height (W', H' for output, W", H" for kernel)
// - C = input channels
// - 3 for color images, 1 for B&W images
// - for hidden layer: dimension of activation vector for each pixel
// - C' = output channels = dimension of activation vector for each pixel (also called N by NVidia, inconsistently)
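// Illustrative example (hypothetical numbers): a 32 x 24 RGB input (T excluded, since each
// sample is one column of the minibatch matrix) has sample layout
// - legacy (HWC, channels as tuples): [3 x 32 x 24] (C x W x H)
// - cudnn (CHW, channels as planes): [32 x 24 x 3] (W x H x C)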
template<class ElemType>
class ConvolutionNode : public ComputationNode<ElemType>, public NumInputs<2>
{
@ -44,22 +67,26 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_kernelWidth(SIZE_MAX), m_kernelHeight(SIZE_MAX),
// initialize to dummy values so we catch missing initialization
m_horizontalSubsample(SIZE_MAX), m_verticalSubsample(SIZE_MAX),
m_zeroPadding(false), m_maxTempMemSizeInSamples(SIZE_MAX)
m_zeroPadding(false), m_maxTempMemSizeInSamples(SIZE_MAX),
m_imageLayoutKind(ImageLayoutKind::HWC)
{
SetDims(ImageLayoutWHC(1, 1, 0), 0); // TODO: what is this magic #channels == 0? Can this even be initialized at this time, or only inferred?
SetDims(ImageDimensions::AsTensorShape(1, 1, 0, m_imageLayoutKind), 0);
}
ConvolutionNode(DEVICEID_TYPE deviceId, const wstring & name, const size_t kernelWidth, const size_t kernelHeight, const size_t outputChannels, const size_t horizontalSubsample, const size_t verticalSubsample, const bool zeroPadding = false, const size_t maxTempMemSizeInSamples = 0) :
ConvolutionNode(DEVICEID_TYPE deviceId, const wstring & name, const size_t kernelWidth, const size_t kernelHeight, const size_t outputChannels, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind,
const bool zeroPadding = false, const size_t maxTempMemSizeInSamples = 0) :
Base(deviceId, name),
m_outputChannels(outputChannels),
m_kernelWidth(kernelWidth), m_kernelHeight(kernelHeight),
m_horizontalSubsample(horizontalSubsample), m_verticalSubsample(verticalSubsample),
m_zeroPadding(zeroPadding), m_maxTempMemSizeInSamples(maxTempMemSizeInSamples)
m_zeroPadding(zeroPadding), m_maxTempMemSizeInSamples(maxTempMemSizeInSamples),
m_imageLayoutKind(imageLayoutKind)
{
SetDims(ImageLayoutWHC(1, 1, outputChannels), 0);
m_factory = ConvolutionEngineFactory<ElemType>::Create(deviceId);
SetDims(ImageDimensions::AsTensorShape(1, 1, m_outputChannels, m_imageLayoutKind), 0); // TODO: necessary?
m_factory = ConvolutionEngineFactory<ElemType>::Create(GetDeviceId(), ConvolutionEngineFactory<ElemType>::EngineType::Auto, m_imageLayoutKind);
}
ConvolutionNode(const ScriptableObjects::IConfigRecordPtr configp) :
ConvolutionNode(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"kernelWidth"), configp->Get(L"kernelHeight"), configp->Get(L"outputChannels"),
configp->Get(L"horizontalSubsample"), configp->Get(L"verticalSubsample"),
configp->Get(L"horizontalSubsample"), configp->Get(L"verticalSubsample"), ImageLayoutKindFrom(configp->Get(L"imageLayout")),
configp->Get(L"zeroPadding"), configp->Get(L"maxTempMemSizeInSamples"))
{
// weightNodeName, inputValueNodeName, kernelWidth, kernelHeight, outputChannels, horizontalSubsample, verticalSubsample, zeroPadding = false, maxTempMemSizeInSamples = 0
@ -70,18 +97,23 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
Base::Save(fstream);
fstream << m_kernelWidth << m_kernelHeight << m_horizontalSubsample << m_verticalSubsample;
fstream << m_sampleLayout.GetNumChannels();
uint32_t imageLayoutKind = (uint32_t)m_imageLayoutKind;
uint32_t outputChannels = (uint32_t)m_outputChannels;
fstream << outputChannels << imageLayoutKind;
fstream << m_zeroPadding << m_maxTempMemSizeInSamples;
}
void Load(File& fstream, size_t modelVersion) override
{
Base::Load(fstream, modelVersion);
fstream >> m_kernelWidth >> m_kernelHeight >> m_horizontalSubsample >> m_verticalSubsample;
size_t outputChannels;
fstream >> outputChannels;
SetDims(ImageLayoutWHC(1, 1, outputChannels), 0);
fstream >> m_kernelWidth >> m_kernelHeight >> m_horizontalSubsample >> m_verticalSubsample;
uint32_t imageLayoutKind, outputChannels;
fstream >> outputChannels >> imageLayoutKind;
m_imageLayoutKind = (ImageLayoutKind) imageLayoutKind;
m_outputChannels = outputChannels;
SetDims(ImageDimensions::AsTensorShape(1, 1, m_outputChannels, m_imageLayoutKind), 0); // TODO: needed?
fstream >> m_zeroPadding >> m_maxTempMemSizeInSamples;
m_factory = ConvolutionEngineFactory<ElemType>::Create(GetDeviceId(), ConvolutionEngineFactory<ElemType>::EngineType::Auto, m_imageLayoutKind);
}
void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override
@ -100,6 +132,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
node->m_maxTempMemSizeInSamples = m_maxTempMemSizeInSamples;
node->m_imageLayoutKind = m_imageLayoutKind;
*node->m_tempMatrix = *m_tempMatrix;
}
}
@ -139,7 +173,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Matrix<ElemType> sliceInput1Value = Input(1)->ValueFor(fr);
Matrix<ElemType> sliceOutputValue = ValueFor(fr);
// REVIEW alexeyk: setting batch size, can it be done elsewhere in a single place? TODO: Yes, in BeginForwardProp().
// update the tensor dimension w.r.t. number of samples
size_t batchSize = sliceInput1Value.GetNumCols();
m_inT->setN(batchSize);
m_outT->setN(batchSize);
@ -154,7 +188,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
#endif
}
// BUGBUG: Should not be here. Use PlusNode and m_sampleLayout.
// BUGBUG: Should not be here. Use PlusNode and m_sampleLayout. TODO: Bad naming: 'output' is actually an 'input'.
void AddBias(const Matrix<ElemType>& output, const Matrix<ElemType>& bias, Matrix<ElemType>& dst)
{
assert(m_convEng != nullptr);
@ -173,83 +207,80 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Base::Validate(isFinalValidationPass);
InferMBLayoutFromInputsForStandardCase();
// get input tensor shape
auto inputSampleLayout = GetInputSampleLayout(1);
// get input and output tensor shape and interpret as image dimensions
auto inDims = ImageDimensions(GetInputSampleLayout(1), m_imageLayoutKind);
if (inputSampleLayout.GetWidth() < m_kernelWidth || inputSampleLayout.GetHeight() < m_kernelHeight)
InvalidArgument("inputWidth must >= kernelWidth and inputHeight must >= kernelHeight.");
if (isFinalValidationPass && (inDims.m_width < m_kernelWidth || inDims.m_height < m_kernelHeight))
InvalidArgument("%ls %ls operation requires that input width be >= kernelWidth and input height >= kernelHeight.", NodeName().c_str(), OperationName().c_str());
// determine output tensor shape
// WATCH OUT: Number of channels is tucked away in m_sampleLayout and must be propagated.
TensorShape outputSampleLayout;
if (m_zeroPadding)
{
const int kernelWidthCenter = m_kernelWidth % 2;
const int kernelHeightCenter = m_kernelHeight % 2;
outputSampleLayout = ImageLayoutWHC(
(inputSampleLayout.GetWidth() - kernelWidthCenter) / m_horizontalSubsample + 1,
(inputSampleLayout.GetHeight() - kernelHeightCenter) / m_verticalSubsample + 1,
m_sampleLayout.GetNumChannels());
}
else
{
outputSampleLayout = ImageLayoutWHC(
(inputSampleLayout.GetWidth() - m_kernelWidth) / m_horizontalSubsample + 1,
(inputSampleLayout.GetHeight() - m_kernelHeight) / m_verticalSubsample + 1,
m_sampleLayout.GetNumChannels());
}
const int kernelWidthCenter = m_zeroPadding ? m_kernelWidth % 2 : m_kernelWidth;
const int kernelHeightCenter = m_zeroPadding ? m_kernelHeight % 2 : m_kernelHeight;
auto outDims = ImageDimensions(
(inDims.m_width - kernelWidthCenter) / m_horizontalSubsample + 1,
(inDims.m_height - kernelHeightCenter) / m_verticalSubsample + 1,
m_outputChannels);
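// Worked example (hypothetical numbers): a 32 x 32 input with a 5 x 5 kernel and subsample 1
// yields (32 - 5) / 1 + 1 = 28 x 28 without padding, and (32 - 5 % 2) / 1 + 1 = 32 x 32
// with zero padding (odd kernel sizes preserve the spatial dimensions at subsample 1).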
size_t weightCols = m_kernelWidth * m_kernelHeight * inputSampleLayout.GetNumChannels();
size_t weightCols = m_kernelWidth * m_kernelHeight * inDims.m_numChannels;
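// e.g. (hypothetical numbers): a 5 x 5 kernel over 3 input channels gives weightCols = 75,
// so with 16 output channels the weight matrix in Input(0) must be [16 x 75].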
// check/infer input [0] (weights)
if (Input(0)->Value().HasNoElements())
ValidateInferInputDims(0, outputSampleLayout.GetNumChannels(), weightCols);
ValidateInferInputDims(0, m_outputChannels, weightCols);
if (isFinalValidationPass && (Input(0)->GetNumCols() != weightCols || Input(0)->GetNumRows() != outputSampleLayout.GetNumChannels()))
LogicError("convolutionWeight matrix %ls should have dimension [%d, %d] which is [outputChannels, kernelWidth * kernelHeight * inputChannels]", Input(0)->NodeName().c_str(), (int)outputSampleLayout.GetNumChannels(), (int)weightCols);
if (isFinalValidationPass && (Input(0)->GetNumCols() != weightCols || Input(0)->GetNumRows() != m_outputChannels))
LogicError("convolutionWeight matrix %ls should have dimension [%d, %d] which is [outputChannels, kernelWidth * kernelHeight * inputChannels]", Input(0)->NodeName().c_str(), (int)m_outputChannels, (int)weightCols);
size_t inputDim = inputSampleLayout.GetWidth() * inputSampleLayout.GetHeight() * inputSampleLayout.GetNumChannels();
// check/infer input [1] (data)
size_t inputDim = inDims.m_width * inDims.m_height * inDims.m_numChannels;
if (Input(1)->GetNumRows() == 0)
ValidateInferInputDims(1, inputDim, Input(1)->GetNumCols());
if (isFinalValidationPass && Input(1)->GetNumRows() != inputDim)
LogicError("Each column of input to the convolution node %ls is a sample and should have dimension %d, which is inputWidth * inputHeight * inputChannels.", NodeName().c_str(), (int)inputDim);
LogicError("Each column of inDims to the convolution node %ls is a sample and should have dimension %d, which is inputWidth * inputHeight * inputChannels.", NodeName().c_str(), (int)inputDim);
// that's our dimension
SetDims(outputSampleLayout, Input(1)->GetNumCols());
SetDims(outDims.AsTensorShape(m_imageLayoutKind), Input(1)->GetNumCols());
// set up the various engines and descriptor objects
// REVIEW alexeyk: is there a better place to create engines?
if (m_factory == nullptr)
m_factory = ConvolutionEngineFactory<ElemType>::Create(m_deviceId);
if (m_convEng == nullptr)
m_convEng = m_factory->CreateConvEngine(m_deviceId, m_maxTempMemSizeInSamples);
if (m_inT == nullptr)
m_inT = m_factory->CreateTensor(inputSampleLayout.GetWidth(), inputSampleLayout.GetHeight(), inputSampleLayout.GetNumChannels(), 1);
if (m_filterT == nullptr)
m_filterT = m_factory->CreateFilter(m_kernelWidth, m_kernelHeight, inputSampleLayout.GetNumChannels(), m_sampleLayout.GetNumChannels());
if (m_outT == nullptr)
m_outT = m_factory->CreateTensor(m_sampleLayout.GetWidth(), m_sampleLayout.GetHeight(), m_sampleLayout.GetNumChannels(), 1);
if (m_convDesc == nullptr)
m_convDesc = m_factory->CreateConvDescriptor(*m_inT, *m_filterT, m_horizontalSubsample, m_verticalSubsample, m_zeroPadding);
// REVIEW alexeyk: create per-channel (shared) bias. Consider adding other types of biases.
if (m_biasT == nullptr)
m_biasT = m_factory->CreateTensor(1, 1, m_sampleLayout.GetNumChannels(), 1);
if (isFinalValidationPass)
{
// set up the various engines and descriptor objects
// REVIEW alexeyk: is there a better place to create engines?
assert(m_factory);
//if (m_factory == nullptr)
// m_factory = ConvolutionEngineFactory<ElemType>::Create(m_deviceId, ConvolutionEngineFactory<ElemType>::EngineType::Auto, m_imageLayoutKind);
// TODO: This seems to expose too much internal knowledge of the engine to the ConvolutionNode().
// Why not just pass everything to the engine creator and get one object that holds everything?
if (m_convEng == nullptr)
m_convEng = m_factory->CreateConvEngine(m_deviceId, m_maxTempMemSizeInSamples);
if (m_inT == nullptr)
m_inT = m_factory->CreateTensor(inDims.m_width, inDims.m_height, inDims.m_numChannels, 1);
if (m_filterT == nullptr)
m_filterT = m_factory->CreateFilter(m_kernelWidth, m_kernelHeight, inDims.m_numChannels, m_outputChannels);
if (m_outT == nullptr)
m_outT = m_factory->CreateTensor(outDims.m_width, outDims.m_height, outDims.m_numChannels, 1);
if (m_convDesc == nullptr)
m_convDesc = m_factory->CreateConvDescriptor(*m_inT, *m_filterT, m_horizontalSubsample, m_verticalSubsample, m_zeroPadding);
// REVIEW alexeyk: create per-channel bias (shared across all pixels). Consider adding other types of biases.
if (m_biasT == nullptr)
m_biasT = m_factory->CreateTensor(1, 1, outDims.m_numChannels, 1);
}
}
void DumpNodeInfo(const bool printValues, File& fstream) const override
{
Base::DumpNodeInfo(printValues, fstream);
auto inputSampleLayout = GetInputSampleLayout(1);
auto inDims = ImageDimensions(GetInputSampleLayout(1), m_imageLayoutKind);
auto outDims = ImageDimensions(m_sampleLayout, m_imageLayoutKind);
char str[4096];
sprintf(str, "Input[Width:%lu, Height:%lu, Channels:%lu] \n", inputSampleLayout.GetWidth(), inputSampleLayout.GetHeight(), inputSampleLayout.GetNumChannels());
sprintf(str, "Input[Width:%lu, Height:%lu, Channels:%lu] \n", inDims.m_width, inDims.m_height, inDims.m_numChannels);
fstream << string(str);
sprintf(str, "Kernel[Width:%lu, Height:%lu] SubSample[Horizontal:%lu, Vertical:%lu]\n", m_kernelWidth, m_kernelHeight, m_horizontalSubsample, m_verticalSubsample);
fstream << string(str);
sprintf(str, "Output[Width:%lu, Height:%lu, Channels:%lu] \n", m_sampleLayout.GetWidth(), m_sampleLayout.GetHeight(), m_sampleLayout.GetNumChannels());
sprintf(str, "Output[Width:%lu, Height:%lu, Channels:%lu] \n", outDims.m_width, outDims.m_height, outDims.m_numChannels);
fstream << string(str);
sprintf(str, "ZeroPadding=%ls maxTempMemSizeInSamples=%lu\n", m_zeroPadding? L"true" : L"false", m_maxTempMemSizeInSamples);
sprintf(str, "zeroPadding=%ls maxTempMemSizeInSamples=%lu\n", m_zeroPadding? L"true" : L"false", m_maxTempMemSizeInSamples);
fstream << string(str);
}
@ -273,6 +304,17 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
private:
size_t m_outputChannels;
size_t m_kernelWidth, m_kernelHeight;
size_t m_horizontalSubsample, m_verticalSubsample;
bool m_zeroPadding;
bool m_1DConvolutionOnGPUSparse;
shared_ptr<Matrix<ElemType>> m_tempMatrix;
size_t m_maxTempMemSizeInSamples; // can change during runtime
ImageLayoutKind m_imageLayoutKind; // how to interpret the tensor (which dimensions are X/Y and C)
std::unique_ptr<ConvolutionEngineFactory<ElemType>> m_factory;
std::unique_ptr<ConvolutionEngine<ElemType>> m_convEng;
@ -281,14 +323,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
std::unique_ptr<ConvolutionTensor4D> m_outT;
std::unique_ptr<ConvolutionDescriptor> m_convDesc;
std::unique_ptr<ConvolutionTensor4D> m_biasT;
size_t m_kernelWidth, m_kernelHeight;
size_t m_horizontalSubsample, m_verticalSubsample;
bool m_zeroPadding;
bool m_1DConvolutionOnGPUSparse;
shared_ptr<Matrix<ElemType>> m_tempMatrix;
size_t m_maxTempMemSizeInSamples; // can change during runtime
};
template class ConvolutionNode<float>;
@ -298,8 +332,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// PoolingNodeBase (input)
// -----------------------------------------------------------------------
// Max/Average Pooling: support multi channel
// Each sample is stored as a column-major matrix (height, width) of float[numChannels] (r00, g00, b00, r10, g10, b10, r01, g01, b01, r11, g11, b11).
template<class ElemType>
class PoolingNodeBase : public ComputationNode<ElemType>, public NumInputs<1>
{
@ -308,17 +340,19 @@ namespace Microsoft { namespace MSR { namespace CNTK {
PoolingNodeBase(DEVICEID_TYPE deviceId, const wstring & name) :
Base(deviceId, name),
m_windowWidth(SIZE_MAX), m_windowHeight(SIZE_MAX),
m_horizontalSubsample(SIZE_MAX), m_verticalSubsample(SIZE_MAX)
m_horizontalSubsample(SIZE_MAX), m_verticalSubsample(SIZE_MAX),
m_imageLayoutKind(ImageLayoutKind::HWC)
{ }
PoolingNodeBase(DEVICEID_TYPE deviceId, const wstring & name, const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample) :
PoolingNodeBase(DEVICEID_TYPE deviceId, const wstring & name, const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind) :
Base(deviceId, name),
m_windowWidth(windowWidth), m_windowHeight(windowHeight),
m_horizontalSubsample(horizontalSubsample), m_verticalSubsample(verticalSubsample)
m_horizontalSubsample(horizontalSubsample), m_verticalSubsample(verticalSubsample),
m_imageLayoutKind(imageLayoutKind)
{
m_factory = ConvolutionEngineFactory<ElemType>::Create(deviceId);
m_factory = ConvolutionEngineFactory<ElemType>::Create(GetDeviceId(), ConvolutionEngineFactory<ElemType>::EngineType::Auto, m_imageLayoutKind);
}
PoolingNodeBase(const ScriptableObjects::IConfigRecordPtr configp) :
PoolingNodeBase(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"windowWidth"), configp->Get(L"windowHeight"), configp->Get(L"horizontalSubsample"), configp->Get(L"verticalSubsample"))
PoolingNodeBase(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"windowWidth"), configp->Get(L"windowHeight"), configp->Get(L"horizontalSubsample"), configp->Get(L"verticalSubsample"), ImageLayoutKindFrom(configp->Get(L"imageLayout")))
{
// input, windowWidth, windowHeight, horizontalSubsample, verticalSubsample
AttachInputs(configp, this->GetExpectedNumInputs());
@ -327,13 +361,19 @@ namespace Microsoft { namespace MSR { namespace CNTK {
void Save(File& fstream) const override
{
Base::Save(fstream);
fstream << m_windowWidth << m_windowHeight << m_horizontalSubsample << m_verticalSubsample;
uint32_t imageLayoutKind = (uint32_t)m_imageLayoutKind;
uint32_t windowWidth = (uint32_t)m_windowWidth;
fstream << windowWidth << imageLayoutKind << m_windowHeight << m_horizontalSubsample << m_verticalSubsample;
}
void Load(File& fstream, size_t modelVersion) override
{
Base::Load(fstream, modelVersion);
fstream >> m_windowWidth >> m_windowHeight >> m_horizontalSubsample >> m_verticalSubsample;
uint32_t imageLayoutKind, windowWidth;
fstream >> windowWidth >> imageLayoutKind >> m_windowHeight >> m_horizontalSubsample >> m_verticalSubsample;
m_windowWidth = windowWidth;
m_imageLayoutKind = (ImageLayoutKind)imageLayoutKind;
m_factory = ConvolutionEngineFactory<ElemType>::Create(GetDeviceId(), ConvolutionEngineFactory<ElemType>::EngineType::Auto, m_imageLayoutKind);
}
void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override
@ -351,6 +391,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
node->m_inputSizePerSample = m_inputSizePerSample;
node->m_outputSizePerSample = m_outputSizePerSample;
node->m_imageLayoutKind = m_imageLayoutKind;
}
}
@ -388,20 +430,19 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Base::Validate(isFinalValidationPass);
InferMBLayoutFromInputsForStandardCase();
// get input tensor shape
auto inputSampleLayout = GetInputSampleLayout(0);
// get input tensor shape and interpret as image dimensions
auto inDims = ImageDimensions(GetInputSampleLayout(0), m_imageLayoutKind);
if (inputSampleLayout.GetWidth() < m_windowWidth || inputSampleLayout.GetHeight() < m_windowHeight)
if (isFinalValidationPass && (inDims.m_width < m_windowWidth || inDims.m_height < m_windowHeight))
InvalidArgument("PoolingNodeBase: inputWidth must >= windowWidth and inputHeight must >= windowHeight.");
// determine output tensor shape
auto outputSampleLayout = ImageLayoutWHC(
(inputSampleLayout.GetWidth() - m_windowWidth) / m_horizontalSubsample + 1,
(inputSampleLayout.GetHeight() - m_windowHeight) / m_verticalSubsample + 1,
inputSampleLayout.GetNumChannels());
auto outDims = ImageDimensions(
(inDims.m_width - m_windowWidth) / m_horizontalSubsample + 1,
(inDims.m_height - m_windowHeight) / m_verticalSubsample + 1,
inDims.m_numChannels);
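// Worked example (hypothetical numbers): a 24 x 24 input with a 2 x 2 window and subsample 2
// yields (24 - 2) / 2 + 1 = 12, i.e. a 12 x 12 output with an unchanged number of channels.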
m_inputSizePerSample = inputSampleLayout.GetWidth() * inputSampleLayout.GetHeight() * inputSampleLayout.GetNumChannels();
//m_outputSizePerSample = outputSampleLayout.GetWidth() * outputSampleLayout.GetHeight() * outputSampleLayout.GetNumChannels();
m_inputSizePerSample = inDims.m_width * inDims.m_height * inDims.m_numChannels;
if (Input(0)->GetNumRows() == 0)
ValidateInferInputDims(0, m_inputSizePerSample, Input(0)->GetNumCols()); // TODO: We should infer a tensor dimension for the input instead.
@ -409,18 +450,22 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (isFinalValidationPass && Input(0)->GetNumRows() != m_inputSizePerSample) // TODO: Can be removed once tensor shape and numRows are perfectly in sync.
LogicError("each column of input to the MaxPooling node %ls is a sample and should have dimension %d, which is inputWidth * inputHeight * inputChannels", NodeName().c_str(), (int)m_inputSizePerSample);
SetDims(outputSampleLayout, Input(0)->GetNumCols());
SetDims(outDims.AsTensorShape(m_imageLayoutKind), Input(0)->GetNumCols());
// set up various engines and descriptor objects
// REVIEW alexeyk: is there a better place to create engines?
if (m_factory == nullptr)
m_factory = ConvolutionEngineFactory<ElemType>::Create(m_deviceId);
if (m_poolEng == nullptr)
m_poolEng = m_factory->CreatePoolEngine(m_deviceId);
if (m_inT == nullptr)
m_inT = m_factory->CreateTensor(inputSampleLayout.GetWidth(), inputSampleLayout.GetHeight(), inputSampleLayout.GetNumChannels(), 1);
if (m_outT == nullptr)
m_outT = m_factory->CreateTensor(m_sampleLayout.GetWidth(), m_sampleLayout.GetHeight(), m_sampleLayout.GetNumChannels(), 1);
if (isFinalValidationPass)
{
// set up various engines and descriptor objects
// REVIEW alexeyk: is there a better place to create engines?
assert(m_factory);
//if (m_factory == nullptr)
// m_factory = ConvolutionEngineFactory<ElemType>::Create(m_deviceId, ConvolutionEngineFactory<ElemType>::EngineType::Auto, m_imageLayoutKind);
if (m_poolEng == nullptr)
m_poolEng = m_factory->CreatePoolEngine(m_deviceId);
if (m_inT == nullptr)
m_inT = m_factory->CreateTensor(inDims.m_width, inDims.m_height, inDims.m_numChannels, 1);
if (m_outT == nullptr)
m_outT = m_factory->CreateTensor(outDims.m_width, outDims.m_height, outDims.m_numChannels, 1);
}
}
void DumpNodeInfo(const bool printValues, File& fstream) const override
@ -430,27 +475,29 @@ namespace Microsoft { namespace MSR { namespace CNTK {
auto inputSampleLayout = GetInputSampleLayout(0);
char str[4096];
sprintf(str, "Input[Width:%lu, Height:%lu, Channels:%lu] \n", inputSampleLayout.GetWidth(), inputSampleLayout.GetHeight(), inputSampleLayout.GetNumChannels());
sprintf(str, "Input[Width:%lu, Height:%lu, Channels:%lu] \n", inputSampleLayout[1], inputSampleLayout[2], inputSampleLayout[0]);
fstream << string(str);
sprintf(str, "PoolingWindow[Width:%lu, Height:%lu] SubSampling[Horizontal:%lu, Vertical:%lu]\n", m_windowWidth, m_windowHeight, m_horizontalSubsample, m_verticalSubsample);
fstream << string(str);
sprintf(str, "Output[Width:%lu, Height:%lu, Channels:%lu] \n", m_sampleLayout.GetWidth(), m_sampleLayout.GetHeight(), m_sampleLayout.GetNumChannels());
sprintf(str, "Output[Width:%lu, Height:%lu, Channels:%lu] \n", m_sampleLayout[1], m_sampleLayout[2], m_sampleLayout[0]);
fstream << string(str);
sprintf(str, "TotalSizePerSample[Input:%lu, Output:%lu] \n", m_inputSizePerSample, m_outputSizePerSample);
fstream << string(str);
}
protected:
size_t m_windowWidth, m_windowHeight;
size_t m_horizontalSubsample, m_verticalSubsample;
size_t m_inputSizePerSample, m_outputSizePerSample;
ImageLayoutKind m_imageLayoutKind; // how to interpret the tensor (which dimensions are X/Y and C)
std::unique_ptr<ConvolutionEngineFactory<ElemType>> m_factory;
std::unique_ptr<PoolingEngine<ElemType>> m_poolEng;
std::unique_ptr<ConvolutionTensor4D> m_inT;
std::unique_ptr<ConvolutionTensor4D> m_outT;
std::unique_ptr<PoolingDescriptor> m_poolDesc;
size_t m_windowWidth, m_windowHeight;
size_t m_horizontalSubsample, m_verticalSubsample;
size_t m_inputSizePerSample, m_outputSizePerSample;
};
// add this at the start of each derived class, to get access to the members of ComputationNode
@ -471,8 +518,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
static const std::wstring TypeName() { return L"MaxPooling"; }
public:
MaxPoolingNode(DEVICEID_TYPE deviceId, const wstring & name) : Base(deviceId, name) { }
MaxPoolingNode(DEVICEID_TYPE deviceId, const wstring & name, const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample) :
Base(deviceId, name, windowWidth, windowHeight, horizontalSubsample, verticalSubsample)
MaxPoolingNode(DEVICEID_TYPE deviceId, const wstring & name, const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind) :
Base(deviceId, name, windowWidth, windowHeight, horizontalSubsample, verticalSubsample, imageLayoutKind)
{ }
MaxPoolingNode(const ScriptableObjects::IConfigRecordPtr configp) :
Base(configp)
@ -481,7 +528,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
void Validate(bool isFinalValidationPass) override
{
Base::Validate(isFinalValidationPass);
if (m_poolDesc == nullptr)
if (isFinalValidationPass && m_poolDesc == nullptr)
m_poolDesc = m_factory->CreatePoolDescriptor(PoolingDescriptor::PoolKind::Max, m_windowWidth, m_windowHeight, m_horizontalSubsample, m_verticalSubsample, 0, 0);
}
};
@ -500,8 +547,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
static const std::wstring TypeName() { return L"AveragePooling"; }
public:
AveragePoolingNode(DEVICEID_TYPE deviceId, const wstring & name) : Base(deviceId, name) { }
AveragePoolingNode(DEVICEID_TYPE deviceId, const wstring & name, const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample) :
Base(deviceId, name, windowWidth, windowHeight, horizontalSubsample, verticalSubsample)
AveragePoolingNode(DEVICEID_TYPE deviceId, const wstring & name, const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind) :
Base(deviceId, name, windowWidth, windowHeight, horizontalSubsample, verticalSubsample, imageLayoutKind)
{ }
AveragePoolingNode(const ScriptableObjects::IConfigRecordPtr configp) :
Base(configp)
@ -525,7 +572,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
void Validate(bool isFinalValidationPass) override
{
Base::Validate(isFinalValidationPass);
if (m_poolDesc == nullptr)
if (isFinalValidationPass && m_poolDesc == nullptr)
m_poolDesc = m_factory->CreatePoolDescriptor(PoolingDescriptor::PoolKind::Average, m_windowWidth, m_windowHeight, m_horizontalSubsample, m_verticalSubsample, 0, 0);
}
};
@ -573,6 +620,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// Read and check version.
// REVIEW alexeyk: extract version checking so it can be re-used in other places.
// BUGBUG: We must serialize m_inputLayout.
int32_t verWritten;
int32_t verReadable;
fstream >> verWritten >> verReadable;
@ -683,18 +731,24 @@ namespace Microsoft { namespace MSR { namespace CNTK {
SetDims(Input(0));
if (m_factory == nullptr)
m_factory = ConvolutionEngineFactory<ElemType>::Create(m_deviceId);
if (m_convEng == nullptr)
m_convEng = m_factory->CreateConvEngine(m_deviceId, 0);
if (m_inT == nullptr)
m_inT = m_factory->CreateTensor(m_sampleLayout.GetWidth(), m_sampleLayout.GetHeight(), m_sampleLayout.GetNumChannels(), 1);
if (m_scaleBiasT == nullptr)
if (isFinalValidationPass)
{
if (m_spatial)
m_scaleBiasT = m_factory->CreateTensor(1, 1, m_sampleLayout.GetNumChannels(), 1);
else
m_scaleBiasT = m_factory->CreateTensor(m_sampleLayout.GetWidth(), m_sampleLayout.GetHeight(), m_sampleLayout.GetNumChannels(), 1);
const auto m_imageLayoutKind = ImageLayoutKind::CHW; // BUGBUG: Finish this. Must be serialized.
auto dims = ImageDimensions(GetSampleLayout(), m_imageLayoutKind);
if (m_factory == nullptr)
m_factory = ConvolutionEngineFactory<ElemType>::Create(m_deviceId, ConvolutionEngineFactory<ElemType>::EngineType::Auto, m_imageLayoutKind);
if (m_convEng == nullptr)
m_convEng = m_factory->CreateConvEngine(m_deviceId, 0);
if (m_inT == nullptr)
m_inT = m_factory->CreateTensor(dims.m_width, dims.m_height, dims.m_numChannels, 1);
if (m_scaleBiasT == nullptr)
{
if (m_spatial)
m_scaleBiasT = m_factory->CreateTensor(1, 1, dims.m_numChannels, 1);
else
m_scaleBiasT = m_factory->CreateTensor(dims.m_width, dims.m_height, dims.m_numChannels, 1);
}
}
}
@ -740,11 +794,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
VersionInfo m_version;
private:
std::unique_ptr<ConvolutionEngineFactory<ElemType>> m_factory;
std::unique_ptr<ConvolutionEngine<ElemType>> m_convEng;
std::unique_ptr<ConvolutionTensor4D> m_inT;
std::unique_ptr<ConvolutionTensor4D> m_scaleBiasT;
// Determines whether to use training or inference(evaluation) mode.
bool m_eval;
// Determines whether to use per-activation (used after non-convolutional layers like fully connected)
@ -760,6 +809,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
shared_ptr<Matrix<ElemType>> m_dScale;
// Stores bias derivatives.
shared_ptr<Matrix<ElemType>> m_dBias;
std::unique_ptr<ConvolutionEngineFactory<ElemType>> m_factory;
std::unique_ptr<ConvolutionEngine<ElemType>> m_convEng;
std::unique_ptr<ConvolutionTensor4D> m_inT;
std::unique_ptr<ConvolutionTensor4D> m_scaleBiasT;
};
template class BatchNormalizationNode<float>;

View file

@ -18,6 +18,635 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// This header collects special-purpose nodes.
// It is likely that these are no longer functional.
#ifndef ENABLE_BROADCASTING_ELEMENTTIMES
// -----------------------------------------------------------------------
// ScaleNode (scalar scaling factor, matrix)
//
// Identical to ElementTimesNode with tensor lib (broadcasting). Can be removed.
// -----------------------------------------------------------------------
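// Illustrative note: with ENABLE_BROADCASTING_ELEMENTTIMES defined, Scale(s, M) is expressed
// as ElementTimes(s, M): the scalar s has sample layout [1], which broadcasts against M's
// sample layout under the max-over-dimensions rule used by the element-wise Validate().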
template<class ElemType>
class ScaleNode : public ComputationNode<ElemType>, public NumInputs<2>
{
typedef ComputationNode<ElemType> Base; UsingComputationNodeMembersBoilerplate;
static const std::wstring TypeName() { return L"Scale"; }
public:
DeclareConstructorFromConfigWithNumInputs(ScaleNode);
ScaleNode(DEVICEID_TYPE deviceId, const wstring & name) :
Base(deviceId, name)
{ }
virtual void /*ComputationNode::*/BackpropTo(const size_t inputIndex, const FrameRange & fr) override
{
#ifdef ENABLE_TENSORVIEW // This takes a big perf hit since our reduction uses only a single thread in this case. Needs to be fixed.
size_t rank = DetermineElementwiseTensorRank();
auto gradient = GradientTensorFor(rank, fr);
auto inputGradient = Input(inputIndex)->GradientTensorFor(rank, fr.AllowBroadcast());
auto otherInputValue = Input(1 - inputIndex)->ValueTensorFor(rank, fr.AllowBroadcast());
// if reduction then mask the respective input(s) (zero out the gaps)
if (Input(inputIndex)->GetNumCols() < GetNumCols())
MaskMissingGradientColumnsToZero(fr);
if (Input(inputIndex)->GetNumCols() < Input(1 - inputIndex)->GetNumCols())
Input(1 - inputIndex)->MaskMissingValueColumnsToZero(fr);
inputGradient.AddElementwiseProductOf(gradient, otherInputValue);
#else
if (inputIndex == 0) // left derivative
{
// this is a reduction over frames, so we must mask gaps to zero
Input(0)->Gradient() += Matrix<ElemType>::InnerProductOfMatrices(MaskedGradientFor(fr), Input(1)->MaskedValueFor(fr)); // element-wise product summed up over all
}
else if (inputIndex == 1) // right derivative
{
Matrix<ElemType> sliceInput1Grad = Input(1)->GradientFor(fr);
Matrix<ElemType>::Multiply1x1AndWeightedAdd(+1.0f, Input(0)->Value()/*1x1*/, GradientFor(fr), 1.0f, sliceInput1Grad);
}
#endif
}
virtual bool OutputUsedInComputingInputNodesGradients() const override
{
// The ScaleNode does not require its output value for computing
// the gradients of its input nodes
return false;
}
virtual void /*ComputationNode::*/ForwardProp(const FrameRange & fr) override
{
#ifdef ENABLE_TENSORVIEW
static int c = 0; if (c++ == 0) { fprintf(stderr, "#SCALE#\n"); }
size_t rank = DetermineElementwiseTensorRank();
auto result = ValueTensorFor(rank, fr);
auto input0 = Input(0)->ValueTensorFor(rank, fr.AllowBroadcast());
auto input1 = Input(1)->ValueTensorFor(rank, fr.AllowBroadcast());
result.AssignElementwiseProductOf(input0, input1);
#else
ValueFor(fr).Assign1x1ProductOf(Input(0)->Value()/*1x1*/, Input(1)->ValueFor(fr));
#endif
}
virtual void /*ComputationNodeBase::*/Validate(bool isFinalValidationPass) override
{
Base::Validate(isFinalValidationPass);
InferMBLayoutFromInputsForStandardCase();
// left node must be a scalar
if (isFinalValidationPass && (Input(0)->GetNumRows() != 1 || Input(0)->GetNumCols() != 1))
RuntimeError("The left value of ScaleNode must be a scalar value.");
SetDims(Input(1));
}
};
template class ScaleNode<float>;
template class ScaleNode<double>;
// -----------------------------------------------------------------------
// RowElementTimesNode (left, right) -- 'left' (input 0) is a matrix, 'right' (input 1) is a row vector
// with one column per column of 'left'; each row of 'left' is multiplied element-wise by 'right'.
//
// TODO: This is subsumed by ElementTimes with tensor lib.
// -----------------------------------------------------------------------
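// Illustrative example (hypothetical sizes): 'left' is [128 x T], 'right' is [1 x T];
// the output is [128 x T] with output(i, j) = left(i, j) * right(0, j).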
template<class ElemType>
class RowElementTimesNode : public ComputationNode<ElemType>, public NumInputs<2>
{
typedef ComputationNode<ElemType> Base; UsingComputationNodeMembersBoilerplate;
static const std::wstring TypeName() { return L"RowElementTimes"; }
public:
DeclareConstructorFromConfigWithNumInputs(RowElementTimesNode);
RowElementTimesNode(DEVICEID_TYPE deviceId, const wstring & name) :
Base(deviceId, name)
{ }
void BackpropToMap(const size_t inputIndex)
{
if (inputIndex > 1)
InvalidArgument("RowElementTimes operation only takes two inputs.");
if (inputIndex == 0)
{
BackpropToLeftS(Input(1)->Value(), Input(0)->Gradient(), Gradient(), *m_tempMatrix);
}
else
{
BackpropToRightS(Input(0)->Value(), Input(1)->Gradient(), Gradient(), *m_tempMatrix);
}
}
virtual void /*ComputationNode::*/BackpropTo(const size_t inputIndex, const FrameRange & fr) override
{
if (fr.IsAllFrames()) { BackpropToMap(inputIndex); return; } // TODO: remove these one by one
Matrix<ElemType> sliceInput0Grad = Input(inputIndex)->GradientFor(fr);
Matrix<ElemType> sliceOutputGrad = GradientFor(fr);
Matrix<ElemType> sliceInput1Value = Input(1 - inputIndex)->ValueFor(fr);
if (inputIndex == 0)
{
BackpropToLeftS(sliceInput1Value, sliceInput0Grad, sliceOutputGrad, *m_tempMatrix);
}
else
{
BackpropToRightS(sliceInput1Value, sliceInput0Grad, sliceOutputGrad, *m_tempMatrix);
}
}
virtual bool OutputUsedInComputingInputNodesGradients() const override
{
// The RowElementTimesNode does not require its output value for computing
// the gradients of its input nodes
return false;
}
//left (input 0) is a matrix
/*TODO: merge with call site*/void BackpropToLeftS(Matrix<ElemType>& input1FunctionValues,
Matrix<ElemType>& input0GradientValues,
const Matrix<ElemType>& gradientValues,
Matrix<ElemType>& tempMatrix)
{
tempMatrix.SetValue(gradientValues);
tempMatrix.RowElementMultiplyWith(input1FunctionValues);
input0GradientValues += tempMatrix;
#if NANCHECK
input0GradientValues.HasNan("RowElementTimes");
#endif
}
//right (input 1) is a row vector
/*TODO: merge with call site*/void BackpropToRightS(Matrix<ElemType>& input0FunctionValues,
Matrix<ElemType>& input1GradientValues,
const Matrix<ElemType>& gradientValues,
Matrix<ElemType>& tempMatrix)
{
tempMatrix.AssignInnerProductOf(gradientValues, input0FunctionValues, true);
input1GradientValues += tempMatrix;
#if NANCHECK
input1GradientValues.HasNan("RowElementTimes");
#endif
}
void ForwardPropMap() // TODO: This is a stop-gap; in most cases, we should just be able to delete this (but need to review one by one)
{
ForwardPropS(Value(), Input(0)->Value(), Input(1)->Value());
}
virtual void /*ComputationNode::*/ForwardProp(const FrameRange & fr) override
{
//if (fr.IsAllFrames()) { ForwardPropMap(); return; }
Matrix<ElemType> sliceInput0Value = Input(0)->ValueFor(fr);
Matrix<ElemType> sliceInput1Value = Input(1)->ValueFor(fr);
Matrix<ElemType> sliceOutputValue = ValueFor(fr);
ForwardPropS(sliceOutputValue, sliceInput0Value, sliceInput1Value);
}
/*TODO: merge with call site*/void ForwardPropS(Matrix<ElemType>& functionValues, const Matrix<ElemType>& input0, const Matrix<ElemType>& input1)
{
functionValues.SetValue(input0);
functionValues.RowElementMultiplyWith(input1);
#if NANCHECK
functionValues.HasNan("RowElementTimes");
#endif
}
virtual void /*ComputationNodeBase::*/Validate(bool isFinalValidationPass) override
{
Base::Validate(isFinalValidationPass);
InferMBLayoutFromInputsForStandardCase();
size_t rows0 = Input(0)->GetNumRows(), cols0 = Input(0)->GetNumCols();
size_t rows1 = Input(1)->GetNumRows(), cols1 = Input(1)->GetNumCols(); rows0;
if (isFinalValidationPass && cols0 != cols1 || rows1 != 1)
LogicError("RowElementTimes: Either the second operand is not a row vector or the number of columns of operands does not match.");
SetDims(Input(0));
}
//request matrices that are needed for gradient computation
virtual void RequestMatricesBeforeBackprop(MatrixPool& matrixPool)
{
Base::RequestMatricesBeforeBackprop(matrixPool);
RequestMatrixFromPool(m_tempMatrix, matrixPool);
}
//release gradient and temp matrices that are no longer needed after all the children's gradients are computed.
virtual void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool)
{
Base::ReleaseMatricesAfterBackprop(matrixPool);
ReleaseMatrixToPool(m_tempMatrix, matrixPool);
}
private:
shared_ptr<Matrix<ElemType>> m_tempMatrix;
};
template class RowElementTimesNode<float>;
template class RowElementTimesNode<double>;
// -----------------------------------------------------------------------
// ColumnElementTimesNode (left, right) -- 'left' (input 0) is a matrix, 'right' (input 1) is a column vector
// with one row per row of 'left'; each column of 'left' is multiplied element-wise by 'right'.
//
// TODO: This is subsumed by ElementTimes with tensor lib.
// -----------------------------------------------------------------------
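// Illustrative example (hypothetical sizes): 'left' is [128 x T], 'right' is [128 x 1];
// the output is [128 x T] with output(i, j) = left(i, j) * right(i, 0).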
template<class ElemType>
class ColumnElementTimesNode : public ComputationNode<ElemType>, public NumInputs<2>
{
typedef ComputationNode<ElemType> Base; UsingComputationNodeMembersBoilerplate;
static const std::wstring TypeName() { return L"ColumnElementTimes"; }
public:
DeclareConstructorFromConfigWithNumInputs(ColumnElementTimesNode);
ColumnElementTimesNode(DEVICEID_TYPE deviceId, const wstring & name) :
Base(deviceId, name)
{ }
void BackpropToMap(const size_t inputIndex)
{
if (inputIndex > 1)
InvalidArgument("ColumnElementTimes operation only takes two inputs.");
if (inputIndex == 0)
{
BackpropToLeftS(Input(1)->Value(), Input(0)->Gradient(), Gradient(), *m_tempMatrix);
}
else
{
BackpropToRightS(Input(0)->Value(), Input(1)->Gradient(), Gradient(), *m_tempMatrix);
}
}
virtual void /*ComputationNode::*/BackpropTo(const size_t inputIndex, const FrameRange & fr) override
{
if (fr.IsAllFrames()) { BackpropToMap(inputIndex); return; } // TODO: remove these one by one
Matrix<ElemType> sliceOutputGrad = GradientFor(fr);
if (inputIndex == 0)
{
Matrix<ElemType> sliceInput0Grad = Input(0)->GradientFor(fr);
BackpropToLeftS(Input(1)->Value(), sliceInput0Grad, sliceOutputGrad, *m_tempMatrix);
}
else
{
Matrix<ElemType> sliceInput0Value = Input(0)->ValueFor(fr);
BackpropToRightS(sliceInput0Value, Input(1)->Gradient(), sliceOutputGrad, *m_tempMatrix);
}
}
virtual bool OutputUsedInComputingInputNodesGradients() const override
{
// The ColumnElementTimesNode does not require its output value for computing
// the gradients of its input nodes
return false;
}
//left (input 0) is a matrix
/*TODO: merge with call site*/void BackpropToLeftS(Matrix<ElemType>& input1FunctionValues,
Matrix<ElemType>& input0GradientValues,
const Matrix<ElemType>& gradientValues,
Matrix<ElemType>& tempMatrix)
{
tempMatrix.SetValue(gradientValues);
tempMatrix.ColumnElementMultiplyWith(input1FunctionValues);
input0GradientValues += tempMatrix;
#if NANCHECK
input0GradientValues.HasNan("ColumnElementTimes");
#endif
}
//right (input 1) is a col vector
/*TODO: merge with call site*/void BackpropToRightS(Matrix<ElemType>& input0FunctionValues,
Matrix<ElemType>& input1GradientValues,
const Matrix<ElemType>& gradientValues,
Matrix<ElemType>& tempMatrix)
{
tempMatrix.AssignInnerProductOf(gradientValues, input0FunctionValues, false);
input1GradientValues += tempMatrix;
#if NANCHECK
input1GradientValues.HasNan("ColumnElementTimes");
#endif
}
void ForwardPropMap() // TODO: This is a stop-gap; in most cases, we should just be able to delete this (but need to review one by one)
{
ForwardPropS(Value(), Input(0)->Value(), Input(1)->Value());
}
virtual void /*ComputationNode::*/ForwardProp(const FrameRange & fr) override
{
//if (fr.IsAllFrames()) { ForwardPropMap(); return; }
Matrix<ElemType> sliceInput0Value = Input(0)->ValueFor(fr);
Matrix<ElemType> sliceOutputValue = ValueFor(fr);
ForwardPropS(sliceOutputValue, sliceInput0Value, Input(1)->Value());
}
/*TODO: merge with call site*/void ForwardPropS(Matrix<ElemType>& functionValues, const Matrix<ElemType>& input0, const Matrix<ElemType>& input1)
{
functionValues.SetValue(input0);
functionValues.ColumnElementMultiplyWith(input1);
#if NANCHECK
functionValues.HasNan("ColumnElementTimes");
#endif
}
virtual void /*ComputationNodeBase::*/Validate(bool isFinalValidationPass) override
{
Base::Validate(isFinalValidationPass);
InferMBLayoutFromInputsForStandardCase();
//derive number of rows if possible
for (size_t index = 0; index < 2; index++)
{
size_t rows = Input(index)->GetNumRows() == 0 ? Input(1 - index)->GetNumRows() : Input(index)->GetNumRows();
size_t cols = Input(index)->GetNumCols() == 0 ? Input(1 - index)->GetNumCols() : Input(index)->GetNumCols();
ValidateInferInputDims(index, rows, cols);
}
size_t rows0 = Input(0)->GetNumRows(), cols0 = Input(0)->GetNumCols();
size_t rows1 = Input(1)->GetNumRows(), cols1 = Input(1)->GetNumCols(); cols0;
if (isFinalValidationPass && (rows0 != rows1 || cols1 != 1))
LogicError("ColumnElementTimes: Either the second operand is not a column vector or the number of rows of operands does not match.");
SetDims(Input(0));
}
//request matrices that are needed for gradient computation
virtual void RequestMatricesBeforeBackprop(MatrixPool& matrixPool)
{
Base::RequestMatricesBeforeBackprop(matrixPool);
RequestMatrixFromPool(m_tempMatrix, matrixPool);
}
//release gradient and temp matrices that are no longer needed after all the children's gradients are computed.
virtual void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool)
{
Base::ReleaseMatricesAfterBackprop(matrixPool);
ReleaseMatrixToPool(m_tempMatrix, matrixPool);
}
private:
shared_ptr<Matrix<ElemType>> m_tempMatrix;
};
template class ColumnElementTimesNode<float>;
template class ColumnElementTimesNode<double>;
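// Analogously for ColumnElementTimes (answering the TODO above): left (input 0) is a full
// [rows x cols] matrix and right (input 1) is a [rows x 1] column vector, and the output is
// out(i,j) = left(i,j) * right(i,0), i.e. the column vector is applied to every column. This is
// exactly the broadcasting ElementTimes that the tensor library provides, which is why the node
// can be subsumed by it. Hypothetical standalone sketch (not part of the node API; column-major
// storage assumed):
static inline void ColumnElementTimesReference(const double* left, const double* right /*rows x 1*/,
                                               double* out, size_t rows, size_t cols)
{
    for (size_t j = 0; j < cols; j++)
        for (size_t i = 0; i < rows; i++)
            out[i + j * rows] = left[i + j * rows] * right[i];
}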
// -----------------------------------------------------------------------
// RectifiedLinearNode (input) -- ReLU non-linearity
// -----------------------------------------------------------------------
template<class ElemType>
class RectifiedLinearNode : public SoftmaxNodeBase<ElemType>
{
typedef SoftmaxNodeBase<ElemType> Base; UsingSoftmaxNodeBaseMembers;
static const std::wstring TypeName() { return L"RectifiedLinear"; }
public:
DeclareConstructorFromConfigWithNumInputs(RectifiedLinearNode);
RectifiedLinearNode(DEVICEID_TYPE deviceId, const wstring & name) :
Base(deviceId, name)
{ }
void BackpropToV(Matrix<ElemType>& gradient, const Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues, const Matrix<ElemType>& functionValues) override
{
gradient.AssignLinearRectifierDerivativeOf(inputFunctionValues);
#if DUMPOUTPUT
inputGradientValues.Print("RecitifiedLinearNode-Partial-in");
#endif
inputGradientValues.AddElementProductOf(gradientValues, gradient);
#if DUMPOUTPUT
inputGradientValues.Print("RecitifiedLinearNode-Partial-out");
#endif
}
virtual bool OutputUsedInComputingInputNodesGradients() const override
{
// The ReLU node does not require its output value for computing
// the gradients of its input nodes
return false;
}
void ForwardPropV(Matrix<ElemType>& functionValues, const Matrix<ElemType>& inputFunctionValues) override
{
functionValues.AssignTruncateBottomOf(inputFunctionValues, 0);
#if DUMPOUTPUT
functionValues.Print("RectifiedLinearNode");
#endif
}
};
template class RectifiedLinearNode<float>;
template class RectifiedLinearNode<double>;
// -----------------------------------------------------------------------
// SigmoidNode (input) -- sigmoid non-linearity
// -----------------------------------------------------------------------
template<class ElemType>
class SigmoidNode : public SoftmaxNodeBase<ElemType>
{
typedef SoftmaxNodeBase<ElemType> Base; UsingSoftmaxNodeBaseMembers;
static const std::wstring TypeName() { return L"Sigmoid"; }
public:
DeclareConstructorFromConfigWithNumInputs(SigmoidNode);
SigmoidNode(DEVICEID_TYPE deviceId, const wstring & name) :
Base(deviceId, name)
{ }
virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override
{
// The Sigmoid node does not require any of its inputs' values for computing
// the gradients of its input nodes
UNREFERENCED_PARAMETER(childIndex);
return false;
}
/*virtual*/ void BackpropToV(Matrix<ElemType>& gradient, const Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues, const Matrix<ElemType>& functionValues)
{
gradient.AssignSigmoidDerivativeOf(functionValues);
inputGradientValues.AddElementProductOf(gradientValues, gradient);
}
/*virtual*/ void ForwardPropV(Matrix<ElemType>& functionValues, const Matrix<ElemType>& inputFunctionValues) override
{
functionValues.AssignSigmoidOf(inputFunctionValues);
}
};
template class SigmoidNode<float>;
template class SigmoidNode<double>;
// -----------------------------------------------------------------------
// TanhNode (input) -- tanh non-linearity
// -----------------------------------------------------------------------
template<class ElemType>
class TanhNode : public SoftmaxNodeBase<ElemType>
{
typedef SoftmaxNodeBase<ElemType> Base; UsingSoftmaxNodeBaseMembers;
static const std::wstring TypeName() { return L"Tanh"; }
public:
DeclareConstructorFromConfigWithNumInputs(TanhNode);
TanhNode(DEVICEID_TYPE deviceId, const wstring & name) :
Base(deviceId, name)
{ }
virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override
{
// The Tanh node does not require any of its inputs' values for computing
// the gradients of its input nodes
UNREFERENCED_PARAMETER(childIndex);
return false;
}
/*virtual*/ void BackpropToV(Matrix<ElemType>& gradient, const Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues, const Matrix<ElemType>& functionValues)
{
gradient.AssignElementProductOf(functionValues, functionValues); // v .* v
gradient.AssignDifferenceOf(1, gradient); // 1-v^2
inputGradientValues.AddElementProductOf(gradientValues, gradient); // += d .* (1-v^2)
}
/*virtual*/ void ForwardPropV(Matrix<ElemType>& functionValues, const Matrix<ElemType>& inputFunctionValues) override
{
functionValues.AssignTanhOf(inputFunctionValues);
}
};
template class TanhNode<float>;
template class TanhNode<double>;
// -----------------------------------------------------------------------
// LogNode (input) -- component-wise log() of input
// -----------------------------------------------------------------------
template<class ElemType>
class LogNode : public SoftmaxNodeBase<ElemType>
{
typedef SoftmaxNodeBase<ElemType> Base; UsingSoftmaxNodeBaseMembers;
static const std::wstring TypeName() { return L"Log"; }
public:
DeclareConstructorFromConfigWithNumInputs(LogNode);
LogNode(DEVICEID_TYPE deviceId, const wstring & name) :
Base(deviceId, name)
{ }
virtual bool OutputUsedInComputingInputNodesGradients() const override
{
// The Log node does not require its output value for computing
// the gradients of its input nodes
return false;
}
/*virtual*/ void BackpropToV(Matrix<ElemType>& gradient, const Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues, const Matrix<ElemType>& functionValues)
{
gradient.AssignElementInverseOf(inputFunctionValues); // 1/x (x is input to log(x))
inputGradientValues.AddElementProductOf(gradientValues, gradient);
// TODO: with tensor lib:
//inputGradientValues.AddElementDivisionOf(gradientValues, inputFunctionValues); // 1/x (x is input to log(x))
}
/*virtual*/ void ForwardPropV(Matrix<ElemType>& functionValues, const Matrix<ElemType>& inputFunctionValues) override
{
functionValues.AssignLogOf(inputFunctionValues);
}
};
template class LogNode<float>;
template class LogNode<double>;
// -----------------------------------------------------------------------
// ExpNode (input) -- component-wise exp() of input
// -----------------------------------------------------------------------
template<class ElemType>
class ExpNode : public SoftmaxNodeBase<ElemType>
{
typedef SoftmaxNodeBase<ElemType> Base; UsingSoftmaxNodeBaseMembers;
static const std::wstring TypeName() { return L"Exp"; }
public:
DeclareConstructorFromConfigWithNumInputs(ExpNode);
ExpNode(DEVICEID_TYPE deviceId, const wstring & name) :
Base(deviceId, name)
{ }
virtual void /*ComputationNode::*/BackpropTo(const size_t inputIndex, const FrameRange & fr) override
{
assert(inputIndex == 0); inputIndex;
Matrix<ElemType> sliceInputGrad = Input(0)->GradientFor(fr);
Matrix<ElemType> sliceOutputGrad = GradientFor(fr);
Matrix<ElemType> sliceInputValue = Input(0)->ValueFor(fr);
m_gradientTemp->AssignExpOf(sliceInputValue); // Exp(x) is its own partial
sliceInputGrad.AddElementProductOf(sliceOutputGrad, *m_gradientTemp);
// TODO: with tensor lib:
// sliceInputGrad.AddElementProductOf(sliceOutputGrad, functionValues);
// and set OutputUsed
}
virtual bool OutputUsedInComputingInputNodesGradients() const override
{
// The ExpNode does not require its output value for computing
// the gradients of its input nodes
return false;
}
virtual void BackpropToV(Matrix<ElemType>& gradient, const Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues, const Matrix<ElemType>& functionValues) override { NOT_IMPLEMENTED; } // not needed
void ForwardPropV(Matrix<ElemType>& functionValues, const Matrix<ElemType>& inputFunctionValues) override
{
functionValues.AssignExpOf(inputFunctionValues);
}
};
template class ExpNode<float>;
template class ExpNode<double>;
// -----------------------------------------------------------------------
// CosineNode (input) -- component-wise cos() of input
// -----------------------------------------------------------------------
template<class ElemType>
class CosineNode : public SoftmaxNodeBase<ElemType>
{
typedef SoftmaxNodeBase<ElemType> Base; UsingSoftmaxNodeBaseMembers;
static const std::wstring TypeName() { return L"Cosine"; }
public:
DeclareConstructorFromConfigWithNumInputs(CosineNode);
CosineNode(DEVICEID_TYPE deviceId, const wstring & name) :
Base(deviceId, name)
{ }
virtual bool OutputUsedInComputingInputNodesGradients() const override
{
// The CosineNode does not require its output value for computing
// the gradients of its input nodes
return false;
}
/*virtual*/ void BackpropToV(Matrix<ElemType>& gradient, const Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues, const Matrix<ElemType>& functionValues)
{
gradient.AssignNegativeSineOf(inputFunctionValues); // -sin(x) (x is input to Cosine(x))
inputGradientValues.AddElementProductOf(gradientValues, gradient);
// TODO: tensor lib: make a joint kernel, since neg sin is never used for anything else
}
/*virtual*/ void ForwardPropV(Matrix<ElemType>& functionValues, const Matrix<ElemType>& inputFunctionValues) override
{
functionValues.AssignCosineOf(inputFunctionValues);
}
};
template class CosineNode<float>;
template class CosineNode<double>;
#endif
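// For reference, the derivative identities behind the BackpropToV() implementations above
// (standard calculus, stated here for clarity; wherever dv/dx can be written in terms of the
// output v, the gradient can be computed from the output value alone, which is what the
// "FromOutput" opcodes of the tensor-lib version exploit):
//   Sigmoid:         v = sigmoid(x)   dv/dx = v * (1 - v)          (from output)
//   Tanh:            v = tanh(x)      dv/dx = 1 - v^2              (from output)
//   RectifiedLinear: v = max(0, x)    dv/dx = (x > 0) = (v > 0)    (from output)
//   Log:             v = log(x)       dv/dx = 1/x = exp(-v)        (from input or output)
//   Exp:             v = exp(x)       dv/dx = exp(x) = v           (from input or output)
//   Cosine:          v = cos(x)       dv/dx = -sin(x)              (needs the input)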
// -----------------------------------------------------------------------
/// DummyCriterionNode (objectives, derivatives, prediction)
// -----------------------------------------------------------------------

View file

@ -28,6 +28,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// -----------------------------------------------------------------------
// LearnableParameter (/*no input*/)
// represents weight matrices and biases
// TODO: add -Node to the class name
// -----------------------------------------------------------------------
template<class ElemType>
@ -42,18 +43,31 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_parameterUpdateRequired = true;
SetDims(TensorShape(), 0);
}
LearnableParameter(DEVICEID_TYPE deviceId, const wstring & name, size_t rows, size_t cols) :
LearnableParameter(DEVICEID_TYPE deviceId, const wstring & name, const TensorShape & shape) :
Base(deviceId, name)
{
m_parameterUpdateRequired = true;
CreateMatrixIfNull(m_value);
SetDims(TensorShape(rows), cols);
// for now we split off the trailing dimension into the matrix column dimension
// TODO: This is for compat, but it is inconsistent. Decide what a sample layout means for a node without MBLayout w.r.t. non-tensor ops.
auto dims = shape.GetDims();
size_t cols = 1;
if (dims.size() > 1)
{
cols = dims.back();
dims.resize(dims.size()-1);
}
SetDims(TensorShape(dims), cols);
UpdateFunctionValuesSize(); // this allocates the matrix
Value().SetValue(0);
}
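// Example of the split above (hypothetical dimensions): a shape of (13 : 42 : 256) becomes a
// sample layout of (13 : 42) with cols = 256, while a plain vector shape such as (13) keeps
// cols = 1 and the full shape as the sample layout.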
LearnableParameter(DEVICEID_TYPE deviceId, const wstring & name, size_t rows, size_t cols) :
LearnableParameter(deviceId, name, TensorShape(rows, cols))
{ }
LearnableParameter(const ScriptableObjects::IConfigRecordPtr configp) :
LearnableParameter(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"rows"), configp->Get(L"cols"))
LearnableParameter(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"shape"))
{
// TODO: Change dimensions to take a generic tensor instead. That will be a (minor) breaking change that will require fix-ups when converting from NDL to BrainScript.
AttachInputs(configp, this->GetExpectedNumInputs());
// parameters[rows, [cols=1]] plus other optional parameters (needGradient=[true|false], init=[uniform|gaussian|fixedvalue], initValueScale=[1|float], value=[0|float])
// TODO: "needGradient" should be renamed to better match m_parameterUpdateRequired
@ -83,7 +97,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
Base::Save(fstream);
fstream << m_parameterUpdateRequired;
fstream << GetNumRows() << GetNumCols();
fstream << (size_t)0/*#rows in a legacy file format*/ << GetNumCols();
m_sampleLayout.Save(fstream);
fstream << Value();
}
@ -95,8 +110,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
fstream >> m_parameterUpdateRequired;
fstream >> rows >> cols;
SetDims(TensorShape(rows), cols);
TensorShape sampleLayout;
if (rows != 0) // legacy file format
sampleLayout = TensorShape(rows);
else
sampleLayout.Load(fstream, /*acceptLegacyFormat=*/true);
LoadValue(fstream);
SetDims(sampleLayout, cols); // note: call this after LoadValue() since LoadValue() overwrites m_sampleLayout
}
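// Round-trip example (hypothetical numbers): saving a parameter with sample layout (128) and
// cols = 1 writes the values 0 and 1 followed by the serialized TensorShape(128), so the
// rows == 0 branch above reads the shape back; an older model file instead contains 128 and 1
// with no shape record, and the rows != 0 branch reconstructs TensorShape(128) from it.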
// initialize with random numbers
@ -106,13 +126,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
bool initOnCPUOnly) // if true then always init on CPU, making initialization consistent across both (for testing)
{
size_t inputSize = GetNumCols();
//fprintf(stderr, "%d x %d: %d %ls\n", (int)GetNumRows(), (int)GetNumCols(), (int)randomSeed, NodeName().c_str());
// the random seed offset is set via the "randomSeedOffset" parameter in config
if (initOnCPUOnly)
m_value->TransferToDeviceIfNotThereAndNotAutoPlace(CPUDEVICE, true);
if (uniformInit)
{
ElemType randRange = 0.05f * initValueScale; //initValueScale/sqrt(inputSize);
// TODO: move these crazy extra factors out from here and into NDL, and make them visible in BS
ElemType randRange = 0.05f * initValueScale;
Value().SetUniformRandomValue(-randRange, randRange, randomSeed);
}
else
@ -221,6 +243,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// InputValueBase (/*no input*/)
// Base class for InputValue and SparseInputValue (typically fed by a DataReader)
// this covers four types: (regular vs. image) x (non-sparse vs. sparse)
// TODO: add -Node to the class names
// -----------------------------------------------------------------------
template<class ElemType>
@ -228,59 +251,47 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
typedef ComputationNode<ElemType> Base; UsingComputationNodeMembers;
void Init(const TensorShape & sampleLayout, size_t cols, bool isSparse)
void Init(const TensorShape & sampleLayout, bool isSparse)
{
m_isSparse = isSparse;
CreateMatrixIfNull(m_value);
if (isSparse)
ConvertToSparseMatrix();
SetDims(sampleLayout, cols);
SetDims(sampleLayout, 0);
UpdateFunctionValuesSize(); // we must allocate the matrix so that the readers get objects with valid row dimensions (some readers expect that)
m_parameterUpdateRequired = false;
}
protected:
InputValueBase(DEVICEID_TYPE deviceId, const wstring & name, const TensorShape & sampleLayout, bool isSparse) :
Base(deviceId, name)
{
Init(sampleLayout, isSparse);
}
InputValueBase(DEVICEID_TYPE deviceId, const wstring & name, size_t rows, bool isSparse) :
InputValueBase(deviceId, name, TensorShape(rows), isSparse)
{ }
InputValueBase(DEVICEID_TYPE deviceId, const wstring & name, bool isSparse) :
Base(deviceId, name)
{
Init(TensorShape(), 0, isSparse);
}
InputValueBase(DEVICEID_TYPE deviceId, const wstring & name, size_t rows, size_t cols, bool isSparse) :
Base(deviceId, name)
{
Init(TensorShape(rows), cols, isSparse);
}
InputValueBase(DEVICEID_TYPE deviceId, const wstring & name, const TensorShape & imageLayout, size_t numImages, bool isSparse) :
Base(deviceId, name)
{
size_t cols = numImages;
Init(imageLayout, cols, isSparse);
}
InputValueBase(deviceId, name, TensorShape(), isSparse)
{ }
InputValueBase(const ScriptableObjects::IConfigRecordPtr configp, bool isSparse) :
Base(configp->Get(L"deviceId"), L"<placeholder>")
{
AttachInputs(configp, this->GetExpectedNumInputs());
bool isImage = configp->Get(L"isImage");
if (!isImage)
{
size_t rows = configp->Get(L"rows");
size_t cols = configp->Get(L"cols");
Init(TensorShape(rows), cols, isSparse); // no tensor, just a vector
}
Init(configp->Get(L"shape"), isSparse);
else
{
size_t cols = configp->Get(L"numImages"); // This is actually the MB size. --TODO: No need to specify it?
Init(ImageLayoutWHC(configp->Get(L"imageWidth"), configp->Get(L"imageHeight"), configp->Get(L"imageChannels")), cols, isSparse);
}
Init(ImageDimensions::AsTensorShape(configp->Get(L"imageWidth"), configp->Get(L"imageHeight"), configp->Get(L"imageChannels"), ImageLayoutKindFrom(configp->Get(L"imageLayout"))), isSparse);
}
public:
virtual void Save(File& fstream) const override
{
Base::Save(fstream);
size_t rows = GetNumRows(); // using explicitly typed variables to be 100% symmetrical to Load()
size_t cols = m_pMBLayout ? 0 : GetNumCols(); // if this Input depends on MB size, we write it as having 0 dimensions
fstream << rows << cols;
size_t rows = GetNumRows(); // using explicitly typed variables to be 100% symmetrical to Load()
size_t colsDummy = 0; // This should not be saved. InputValues are always minibatches.
fstream << rows << colsDummy;
m_sampleLayout.Save(fstream);
}
@ -288,13 +299,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
Base::Load(fstream, modelVersion);
size_t rows, cols;
fstream >> rows >> cols;
// some older files retained the #columns when saving, which is meaningless
if (m_pMBLayout)
cols = 0;
size_t rows, colsDummy;
fstream >> rows >> colsDummy;
TensorShape sampleLayout;
sampleLayout.Load(fstream);
sampleLayout.Load(fstream, /*acceptLegacyFormat=*/true);
// some older files may have inconsistent tensor information
if (rows != sampleLayout.GetNumElements())
{
@ -302,7 +310,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
NodeName().c_str(), string(sampleLayout).c_str(), (int)rows);
sampleLayout = TensorShape(rows);
}
Init(sampleLayout, cols, m_isSparse);
Init(sampleLayout, m_isSparse);
}
// InputValue must not resize its inputs because that might destroy it. It should already have the correct size.
@ -347,11 +355,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
InputValue(DEVICEID_TYPE deviceId, const wstring & name) :
Base(deviceId, name, false)
{ }
InputValue(DEVICEID_TYPE deviceId, const wstring & name, size_t rows, size_t cols) :
Base(deviceId, name, rows, cols, false)
InputValue(DEVICEID_TYPE deviceId, const wstring & name, size_t rows) :
Base(deviceId, name, rows, false)
{ }
InputValue(DEVICEID_TYPE deviceId, const wstring & name, const TensorShape & imageLayout, size_t numImages) :
Base(deviceId, name, imageLayout, numImages, false)
InputValue(DEVICEID_TYPE deviceId, const wstring & name, const TensorShape & sampleLayout) :
Base(deviceId, name, sampleLayout, false)
{ }
InputValue(const ScriptableObjects::IConfigRecordPtr configp) :
Base(configp, false)
@ -376,11 +384,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
SparseInputValue(DEVICEID_TYPE deviceId, const wstring & name) :
Base(deviceId, name, true)
{ }
SparseInputValue(DEVICEID_TYPE deviceId, const wstring & name, size_t rows, size_t cols) :
Base(deviceId, name, rows, cols, true)
SparseInputValue(DEVICEID_TYPE deviceId, const wstring & name, size_t rows) :
Base(deviceId, name, rows, true)
{ }
SparseInputValue(DEVICEID_TYPE deviceId, const wstring & name, const TensorShape & imageLayout, size_t numImages) :
Base(deviceId, name, imageLayout, numImages, true)
SparseInputValue(DEVICEID_TYPE deviceId, const wstring & name, const TensorShape & imageLayout) :
Base(deviceId, name, imageLayout, true)
{ }
SparseInputValue(const ScriptableObjects::IConfigRecordPtr configp) :
Base(configp, true)

View file

@ -6,10 +6,10 @@
#pragma once
#include "Basics.h"
#include "Matrix.h"
#include "TensorView.h"
#include "ComputationNode.h"
#include "ConvolutionalNodes.h"
#include "Matrix.h"
#include "TensorView.h"
#include <unordered_set>
#include <map>
@ -44,7 +44,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
virtual void /*ComputationNode::*/BackpropTo(const size_t inputIndex, const FrameRange & fr) override
{
#ifdef ENABLE_TENSORVIEW
// BUGBUG: This gives us a huge perf hit for Image/QuickE2E.
static int c = 0; if (c++ == 0) { fprintf(stderr, "#PLUSBP#\n"); }
size_t rank = DetermineElementwiseTensorRank();
auto gradient = GradientTensorFor(rank, fr);
auto inputGradient = Input(inputIndex)->GradientTensorFor(rank, fr.AllowBroadcast());
@ -53,7 +53,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (Input(inputIndex)->GetNumCols() < GetNumCols())
MaskMissingGradientColumnsToZero(fr);
inputGradient.DoSumOf(0.0f, inputGradient, gradient, 1.0f);
inputGradient.AddCopyOf(gradient);
#else
Matrix<ElemType> gradientValues = GradientFor(fr);
Matrix<ElemType> functionValues = ValueFor(fr);
@ -124,11 +124,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
virtual void /*ComputationNode::*/ForwardProp(const FrameRange & fr) override
{
#ifdef ENABLE_TENSORVIEW
static int c = 0; if (c++ == 0) { fprintf(stderr, "#PLUS#\n"); }
size_t rank = DetermineElementwiseTensorRank();
auto result = ValueTensorFor(rank, fr);
auto input0 = Input(0)->ValueTensorFor(rank, fr.AllowBroadcast());
auto input1 = Input(1)->ValueTensorFor(rank, fr.AllowBroadcast());
result.DoSumOf(0.0f, input0, input1, 1.0f);
result.AssignSumOf(input0, input1);
#else
Matrix<ElemType> functionValues = ValueFor(fr);
Matrix<ElemType> inputFunctionValues0 = Input(0)->ValueFor(fr.AllowBroadcast());
@ -223,10 +224,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (Input(inputIndex)->GetNumCols() < GetNumCols())
MaskMissingGradientColumnsToZero(fr);
if (sign > 0)
inputGradient.DoSumOf(0.0f, inputGradient, gradient, 1.0f);
else
inputGradient.DoDifferenceOf(0.0f, inputGradient, gradient, 1.0f);
inputGradient.AddCopyOf(gradient, sign);
#else
Matrix<ElemType> gradientValues = GradientFor(fr);
@ -269,12 +267,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
virtual void /*ComputationNode::*/ForwardProp(const FrameRange & fr) override
{
#ifdef ENABLE_TENSORVIEW
static int c = 0; if (c++ == 0) { fprintf(stderr,"#MINUS#"); }
static int c = 0; if (c++ == 0) { fprintf(stderr,"#MINUS#\n"); }
size_t rank = DetermineElementwiseTensorRank();
auto result = ValueTensorFor(rank, fr);
auto input0 = Input(0)->ValueTensorFor(rank, fr.AllowBroadcast());
auto input1 = Input(1)->ValueTensorFor(rank, fr.AllowBroadcast());
result.DoDifferenceOf(0.0f, input0, input1, 1.0f);
result.AssignDifferenceOf(input0, input1);
#else
Matrix<ElemType> functionValues = ValueFor(fr);
Matrix<ElemType> inputFunctionValues0 = Input(0)->ValueFor(fr.AllowBroadcast());
@ -307,91 +305,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template class MinusNode<float>;
template class MinusNode<double>;
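// A note on the Do*Of vs. Assign*Of/Add*Of calls being swapped in the hunks above and below:
// judging from the call sites, DoXOf(beta, args..., alpha) computes result = beta * result +
// alpha * X(args...), so AssignXOf(args...) is presumably the (beta = 0, alpha = 1) form and
// AddXOf(args...) the (beta = 1, alpha = 1) form. A scalar analogue of that convention
// (hypothetical helpers, for illustration only):
static inline double DoSumOfScalar(double beta, double result, double a, double b, double alpha)
{
    return beta * result + alpha * (a + b);
}
static inline double AssignSumOfScalar(double a, double b)             { return DoSumOfScalar(0, 0, a, b, 1); }
static inline double AddSumOfScalar(double result, double a, double b) { return DoSumOfScalar(1, result, a, b, 1); }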
#if 1// change once we no longer see a perf hit to #ifndef ENABLE_TENSORVIEW
// -----------------------------------------------------------------------
// ScaleNode (scalar scaling factor, matrix)
//
// Identical to ElementTimesNode with tensor lib (broadcasting). Can be removed.
// -----------------------------------------------------------------------
template<class ElemType>
class ScaleNode : public ComputationNode<ElemType>, public NumInputs<2>
{
typedef ComputationNode<ElemType> Base; UsingComputationNodeMembersBoilerplate;
static const std::wstring TypeName() { return L"Scale"; }
public:
DeclareConstructorFromConfigWithNumInputs(ScaleNode);
ScaleNode(DEVICEID_TYPE deviceId, const wstring & name) :
Base(deviceId, name)
{ }
virtual void /*ComputationNode::*/BackpropTo(const size_t inputIndex, const FrameRange & fr) override
{
#if 0//def ENABLE_TENSORVIEW // This takes a big perf hit since our reduction uses only a single thread in this case. Needs to be fixed.
size_t rank = DetermineElementwiseTensorRank();
auto gradient = GradientTensorFor(rank, fr);
auto inputGradient = Input(inputIndex)->GradientTensorFor(rank, fr.AllowBroadcast());
auto otherInputValue = Input(1 - inputIndex)->ValueTensorFor(rank, fr.AllowBroadcast());
// if reduction then mask the respective input(s) (zero out the gaps)
if (Input(inputIndex)->GetNumCols() < GetNumCols())
MaskMissingGradientColumnsToZero(fr);
if (Input(inputIndex)->GetNumCols() < Input(1 - inputIndex)->GetNumCols())
Input(1 - inputIndex)->MaskMissingValueColumnsToZero(fr);
inputGradient.DoElementwiseProductOf(1.0f/*add to*/, gradient, otherInputValue, 1.0f);
#else
if (inputIndex == 0) // left derivative
{
// this is a reduction over frames, so we must mask gaps to zero
Input(0)->Gradient() += Matrix<ElemType>::InnerProductOfMatrices(MaskedGradientFor(fr), Input(1)->MaskedValueFor(fr)); // element-wise product summed up over all
}
else if (inputIndex == 1) // right derivative
{
Matrix<ElemType> sliceInput1Grad = Input(1)->GradientFor(fr);
Matrix<ElemType>::Multiply1x1AndWeightedAdd(+1.0f, Input(0)->Value()/*1x1*/, GradientFor(fr), 1.0f, sliceInput1Grad);
}
#endif
}
virtual bool OutputUsedInComputingInputNodesGradients() const override
{
// The ScaleNode does not require its output value for computing
// the gradients of its input nodes
return false;
}
virtual void /*ComputationNode::*/ForwardProp(const FrameRange & fr) override
{
#ifdef ENABLE_TENSORVIEW
static int c = 0; if (c++ == 0) { fprintf(stderr, "#SCALE#"); }
size_t rank = DetermineElementwiseTensorRank();
auto result = ValueTensorFor(rank, fr);
auto input0 = Input(0)->ValueTensorFor(rank, fr.AllowBroadcast());
auto input1 = Input(1)->ValueTensorFor(rank, fr.AllowBroadcast());
result.DoElementwiseProductOf(0.0f, input0, input1, 1.0f);
#else
ValueFor(fr).Assign1x1ProductOf(Input(0)->Value()/*1x1*/, Input(1)->ValueFor(fr));
#endif
}
virtual void /*ComputationNodeBase::*/Validate(bool isFinalValidationPass) override
{
Base::Validate(isFinalValidationPass);
InferMBLayoutFromInputsForStandardCase();
// left node must be a scalar
if (isFinalValidationPass && (Input(0)->GetNumRows() != 1 || Input(0)->GetNumCols() != 1))
RuntimeError("The left value of ScaleNode must be a scalar value.");
SetDims(Input(1));
}
};
template class ScaleNode<float>;
template class ScaleNode<double>;
#endif
// -----------------------------------------------------------------------
// NegateNode (input)
// computes the negative of its input
@ -707,7 +620,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (Input(inputIndex)->GetNumCols() < Input(1 - inputIndex)->GetNumCols())
Input(1 - inputIndex)->MaskMissingValueColumnsToZero(fr);
inputGradient.DoElementwiseProductOf(1.0f/*add to*/, gradient, otherInputValue, 1.0f);
inputGradient.AddElementwiseProductOf(gradient, otherInputValue);
#else
Matrix<ElemType> sliceInput0Grad = Input(inputIndex)->GradientFor(fr);
Matrix<ElemType> sliceOutputGrad = GradientFor(fr);
@ -725,12 +638,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
virtual void /*ComputationNode::*/ForwardProp(const FrameRange & fr) override
{
#ifdef ENABLE_TENSORVIEW
static int c = 0; if (c++ == 0) { fprintf(stderr,"#ETIMES#"); }
static int c = 0; if (c++ == 0) { fprintf(stderr,"#ETIMES#\n"); }
size_t rank = DetermineElementwiseTensorRank();
auto result = ValueTensorFor(rank, fr);
auto input0 = Input(0)->ValueTensorFor(rank, fr.AllowBroadcast());
auto input1 = Input(1)->ValueTensorFor(rank, fr.AllowBroadcast());
result.DoElementwiseProductOf(0.0f, input0, input1, 1.0f);
result.AssignElementwiseProductOf(input0, input1);
#else
Matrix<ElemType> sliceInput0Value = Input(0)->ValueFor(fr);
Matrix<ElemType> sliceInput1Value = Input(1)->ValueFor(fr);
@ -745,303 +658,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template class ElementTimesNode<float>;
template class ElementTimesNode<double>;
#if 1// change once we no longer see a perf hit to #ifndef ENABLE_TENSORVIEW
// -----------------------------------------------------------------------
// RowElementTimesNode (left, right) --TODO: what are left and right?
//
// TODO: This is subsumed by ElementTimes with tensor lib.
// -----------------------------------------------------------------------
template<class ElemType>
class RowElementTimesNode : public ComputationNode<ElemType>, public NumInputs<2>
{
typedef ComputationNode<ElemType> Base; UsingComputationNodeMembersBoilerplate;
static const std::wstring TypeName() { return L"RowElementTimes"; }
public:
DeclareConstructorFromConfigWithNumInputs(RowElementTimesNode);
RowElementTimesNode(DEVICEID_TYPE deviceId, const wstring & name) :
Base(deviceId, name)
{ }
void BackpropToMap(const size_t inputIndex)
{
if (inputIndex > 1)
InvalidArgument("RowElementTimes operation only takes two inputs.");
if (inputIndex == 0)
{
BackpropToLeftS(Input(1)->Value(), Input(0)->Gradient(), Gradient(), *m_tempMatrix);
}
else
{
BackpropToRightS(Input(0)->Value(), Input(1)->Gradient(), Gradient(), *m_tempMatrix);
}
}
virtual void /*ComputationNode::*/BackpropTo(const size_t inputIndex, const FrameRange & fr) override
{
if (fr.IsAllFrames()) { BackpropToMap(inputIndex); return; } // TODO: remove these one by one
Matrix<ElemType> sliceInput0Grad = Input(inputIndex)->GradientFor(fr);
Matrix<ElemType> sliceOutputGrad = GradientFor(fr);
Matrix<ElemType> sliceInput1Value = Input(1 - inputIndex)->ValueFor(fr);
if (inputIndex == 0)
{
BackpropToLeftS(sliceInput1Value, sliceInput0Grad, sliceOutputGrad, *m_tempMatrix);
}
else
{
BackpropToRightS(sliceInput1Value, sliceInput0Grad, sliceOutputGrad, *m_tempMatrix);
}
}
virtual bool OutputUsedInComputingInputNodesGradients() const override
{
// The RowElementTimesNode does not require its output value for computing
// the gradients of its input nodes
return false;
}
//left (input 0) is a matrix
/*TODO: merge with call site*/void BackpropToLeftS(Matrix<ElemType>& input1FunctionValues,
Matrix<ElemType>& input0GradientValues,
const Matrix<ElemType>& gradientValues,
Matrix<ElemType>& tempMatrix)
{
tempMatrix.SetValue(gradientValues);
tempMatrix.RowElementMultiplyWith(input1FunctionValues);
input0GradientValues += tempMatrix;
#if NANCHECK
input0GradientValues.HasNan("RowElementTimes");
#endif
}
//right (input 1) is a row vector
/*TODO: merge with call site*/void BackpropToRightS(Matrix<ElemType>& input0FunctionValues,
Matrix<ElemType>& input1GradientValues,
const Matrix<ElemType>& gradientValues,
Matrix<ElemType>& tempMatrix)
{
tempMatrix.AssignInnerProductOf(gradientValues, input0FunctionValues, true);
input1GradientValues += tempMatrix;
#if NANCHECK
input1GradientValues.HasNan("RowElementTimes");
#endif
}
void ForwardPropMap() // TODO: This is a stop-gap; in most cases, we should just be able to delete this (but need to review one by one)
{
ForwardPropS(Value(), Input(0)->Value(), Input(1)->Value());
}
virtual void /*ComputationNode::*/ForwardProp(const FrameRange & fr) override
{
//if (fr.IsAllFrames()) { ForwardPropMap(); return; }
Matrix<ElemType> sliceInput0Value = Input(0)->ValueFor(fr);
Matrix<ElemType> sliceInput1Value = Input(1)->ValueFor(fr);
Matrix<ElemType> sliceOutputValue = ValueFor(fr);
ForwardPropS(sliceOutputValue, sliceInput0Value, sliceInput1Value);
}
/*TODO: merge with call site*/void ForwardPropS(Matrix<ElemType>& functionValues, const Matrix<ElemType>& input0, const Matrix<ElemType>& input1)
{
functionValues.SetValue(input0);
functionValues.RowElementMultiplyWith(input1);
#if NANCHECK
functionValues.HasNan("RowElementTimes");
#endif
}
virtual void /*ComputationNodeBase::*/Validate(bool isFinalValidationPass) override
{
Base::Validate(isFinalValidationPass);
InferMBLayoutFromInputsForStandardCase();
size_t rows0 = Input(0)->GetNumRows(), cols0 = Input(0)->GetNumCols();
size_t rows1 = Input(1)->GetNumRows(), cols1 = Input(1)->GetNumCols(); rows0;
if (isFinalValidationPass && cols0 != cols1 || rows1 != 1)
LogicError("RowElementTimes: Either the second operand is not a row vector or the number of columns of operands does not match.");
SetDims(Input(0));
}
//request matrices that are needed for gradient computation
virtual void RequestMatricesBeforeBackprop(MatrixPool& matrixPool)
{
Base::RequestMatricesBeforeBackprop(matrixPool);
RequestMatrixFromPool(m_tempMatrix, matrixPool);
}
//release gradient and temp matrices that no longer needed after all the children's gradients are computed.
virtual void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool)
{
Base::ReleaseMatricesAfterBackprop(matrixPool);
ReleaseMatrixToPool(m_tempMatrix, matrixPool);
}
private:
shared_ptr<Matrix<ElemType>> m_tempMatrix;
};
template class RowElementTimesNode<float>;
template class RowElementTimesNode<double>;
// -----------------------------------------------------------------------
// ColumnElementTimesNode (left, right) --TODO: what are left and right?
//
// TODO: This is subsumed by ElementTimes with tensor lib.
// -----------------------------------------------------------------------
template<class ElemType>
class ColumnElementTimesNode : public ComputationNode<ElemType>, public NumInputs<2>
{
typedef ComputationNode<ElemType> Base; UsingComputationNodeMembersBoilerplate;
static const std::wstring TypeName() { return L"ColumnElementTimes"; }
public:
DeclareConstructorFromConfigWithNumInputs(ColumnElementTimesNode);
ColumnElementTimesNode(DEVICEID_TYPE deviceId, const wstring & name) :
Base(deviceId, name)
{ }
void BackpropToMap(const size_t inputIndex)
{
if (inputIndex > 1)
InvalidArgument("ColumnElementTimes operation only takes two inputs.");
if (inputIndex == 0)
{
BackpropToLeftS(Input(1)->Value(), Input(0)->Gradient(), Gradient(), *m_tempMatrix);
}
else
{
BackpropToRightS(Input(0)->Value(), Input(1)->Gradient(), Gradient(), *m_tempMatrix);
}
}
virtual void /*ComputationNode::*/BackpropTo(const size_t inputIndex, const FrameRange & fr) override
{
if (fr.IsAllFrames()) { BackpropToMap(inputIndex); return; } // TODO: remove these one by one
Matrix<ElemType> sliceOutputGrad = GradientFor(fr);
if (inputIndex == 0)
{
Matrix<ElemType> sliceInput0Grad = Input(0)->GradientFor(fr);
BackpropToLeftS(Input(1)->Value(), sliceInput0Grad, sliceOutputGrad, *m_tempMatrix);
}
else
{
Matrix<ElemType> sliceInput0Value = Input(0)->ValueFor(fr);
BackpropToRightS(sliceInput0Value, Input(1)->Gradient(), sliceOutputGrad, *m_tempMatrix);
}
}
virtual bool OutputUsedInComputingInputNodesGradients() const override
{
// The ColumnElementTimesNode does not require its output value for computing
// the gradients of its input nodes
return false;
}
//left (input 0) is a matrix
/*TODO: merge with call site*/void BackpropToLeftS(Matrix<ElemType>& input1FunctionValues,
Matrix<ElemType>& input0GradientValues,
const Matrix<ElemType>& gradientValues,
Matrix<ElemType>& tempMatrix)
{
tempMatrix.SetValue(gradientValues);
tempMatrix.ColumnElementMultiplyWith(input1FunctionValues);
input0GradientValues += tempMatrix;
#if NANCHECK
input0GradientValues.HasNan("ColumnElementTimes");
#endif
}
//right (input 1) is a col vector
/*TODO: merge with call site*/void BackpropToRightS(Matrix<ElemType>& input0FunctionValues,
Matrix<ElemType>& input1GradientValues,
const Matrix<ElemType>& gradientValues,
Matrix<ElemType>& tempMatrix)
{
tempMatrix.AssignInnerProductOf(gradientValues, input0FunctionValues, false);
input1GradientValues += tempMatrix;
#if NANCHECK
input1GradientValues.HasNan("ColumnElementTimes");
#endif
}
void ForwardPropMap() // TODO: This is a stop-gap; in most cases, we should just be able to delete this (but need to review one by one)
{
ForwardPropS(Value(), Input(0)->Value(), Input(1)->Value());
}
virtual void /*ComputationNode::*/ForwardProp(const FrameRange & fr) override
{
//if (fr.IsAllFrames()) { ForwardPropMap(); return; }
Matrix<ElemType> sliceInput0Value = Input(0)->ValueFor(fr);
Matrix<ElemType> sliceOutputValue = ValueFor(fr);
ForwardPropS(sliceOutputValue, sliceInput0Value, Input(1)->Value());
}
/*TODO: merge with call site*/void ForwardPropS(Matrix<ElemType>& functionValues, const Matrix<ElemType>& input0, const Matrix<ElemType>& input1)
{
functionValues.SetValue(input0);
functionValues.ColumnElementMultiplyWith(input1);
#if NANCHECK
functionValues.HasNan("ColumnElementTimes");
#endif
}
virtual void /*ComputationNodeBase::*/Validate(bool isFinalValidationPass) override
{
Base::Validate(isFinalValidationPass);
InferMBLayoutFromInputsForStandardCase();
//derive number of rows if possible
for (size_t index = 0; index < 2; index++)
{
size_t rows = Input(index)->GetNumRows() == 0 ? Input(1 - index)->GetNumRows() : Input(index)->GetNumRows();
size_t cols = Input(index)->GetNumCols() == 0 ? Input(1 - index)->GetNumCols() : Input(index)->GetNumCols();
ValidateInferInputDims(index, rows, cols);
}
size_t rows0 = Input(0)->GetNumRows(), cols0 = Input(0)->GetNumCols();
size_t rows1 = Input(1)->GetNumRows(), cols1 = Input(1)->GetNumCols(); cols0;
if (isFinalValidationPass && (rows0 != rows1 || cols1 != 1))
LogicError("ColumnElementTimes: Either the second operand is not a column vector or the number of rows of operands does not match.");
SetDims(Input(0));
}
//request matrices that are needed for gradient computation
virtual void RequestMatricesBeforeBackprop(MatrixPool& matrixPool)
{
Base::RequestMatricesBeforeBackprop(matrixPool);
RequestMatrixFromPool(m_tempMatrix, matrixPool);
}
//release gradient and temp matrices that no longer needed after all the children's gradients are computed.
virtual void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool)
{
Base::ReleaseMatricesAfterBackprop(matrixPool);
ReleaseMatrixToPool(m_tempMatrix, matrixPool);
}
private:
shared_ptr<Matrix<ElemType>> m_tempMatrix;
};
template class ColumnElementTimesNode<float>;
template class ColumnElementTimesNode<double>;
#endif
// -----------------------------------------------------------------------
// DiagTimesNode (vector representing the diagonal of a square matrix, data)
// -----------------------------------------------------------------------
@ -1195,7 +811,6 @@ private:
{
Base::Validate(isFinalValidationPass);
m_pMBLayout = nullptr; // this node does not hold mini-batch data
SetDims(TensorShape(1), 1);
}
};
@ -1207,6 +822,7 @@ private:
// SumColumnElementsNode (input)
// sums up each column of the input
// TODO: This should be deprecated, in favor of a reduce node.
// TODO: Implement this with the tensor library.
// -----------------------------------------------------------------------
template<class ElemType>

View file

@ -5,6 +5,11 @@
//
#pragma once
#include "Basics.h"
#include "ComputationNode.h"
#include "Matrix.h"
#include "TensorView.h"
#include <unordered_set>
#include <map>
#include <string>
@ -18,27 +23,111 @@
#include <sstream>
#include <iostream>
#include "Basics.h"
#include "Matrix.h"
#include "ComputationNode.h"
namespace Microsoft { namespace MSR { namespace CNTK {
#ifdef ENABLE_TENSORVIEW
// -----------------------------------------------------------------------
// NonlinearityNodeBase (input) -- abstract base class that holds what's shared
// between non-linearity nodes like Sigmoid
// UnaryElementWiseWithOpCodeNodeBase (input) -- base for elementwise unary ops
// where forward and backward are single ElementWiseOperator opcodes and the
// gradient needs only the input value or only the output value (see gradientFromOutput).
// -----------------------------------------------------------------------
template<class ElemType, ElementWiseOperator opForward, ElementWiseOperator opBackward, bool gradientFromOutput>
class UnaryElementWiseWithOpCodeNodeBase : public ComputationNode<ElemType>, public NumInputs<1>
{
typedef ComputationNode<ElemType> Base; UsingComputationNodeMembers;
public:
UnaryElementWiseWithOpCodeNodeBase(DEVICEID_TYPE deviceId, const wstring & name) :
Base(deviceId, name)
{ }
virtual void /*ComputationNode::*/ForwardProp(const FrameRange & fr) override
{
static int c = 0; if (c++ == 0) { fprintf(stderr, "#NLop%d#\n", (int)opForward); }
size_t rank = DetermineElementwiseTensorRank();
auto result = ValueTensorFor(rank, fr);
auto input = Input(0)->ValueTensorFor(rank, fr);
result.DoUnaryOpOf(0, input, 1, opForward);
}
virtual void /*ComputationNode::*/BackpropTo(const size_t inputIndex, const FrameRange & fr) override
{
assert(inputIndex == 0); inputIndex;
// get the args
size_t rank = DetermineElementwiseTensorRank();
auto sliceOutputGrad = GradientTensorFor(rank, fr); // propagate from this one...
auto sliceInputGrad = Input(0)->GradientTensorFor(rank, fr); // ...to this one
auto sliceValue = gradientFromOutput ? ValueTensorFor(rank, fr) : // using input or output value
Input(0)->ValueTensorFor(rank, fr);
// If the gradient can be computed from the output rather than the input, then that's better for mem sharing (and faster in most cases).
// Not possible for Cos().
sliceInputGrad.DoBinaryOpOf(1, sliceOutputGrad, sliceValue, 1, opBackward);
}
virtual void /*ComputationNodeBase::*/Validate(bool isFinalValidationPass) override
{
ValidateUnaryMap(isFinalValidationPass);
}
// We don't need our output values in backprop.
virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; }
};
#define UnaryElementWiseWithOpCodeNodeBaseMembers UsingComputationNodeMembersBoilerplate;
// -----------------------------------------------------------------------
// SigmoidNode (input)
// TanhNode (input)
// RectifiedLinearNode (input)
// LogNode (input)
// ExpNode (input)
// CosineNode (input)
// These are all implemented by single-opcode functions and can thus be declared by a macro.
// -----------------------------------------------------------------------
#pragma push_macro("DeclareUnaryTensorOp")
#define DeclareUnaryElementWiseWithOpCodeNode(Name, Forward, Backward, gradientFromOutput) \
template<class ElemType> \
class Name ## Node : public UnaryElementWiseWithOpCodeNodeBase<ElemType, op ## Forward, op ## Backward, gradientFromOutput> \
{ \
typedef UnaryElementWiseWithOpCodeNodeBase<ElemType, op ## Forward, op ## Backward, gradientFromOutput> Base; UnaryElementWiseWithOpCodeNodeBaseMembers; \
static const std::wstring TypeName() { return L ## #Name; } \
public: \
DeclareConstructorFromConfigWithNumInputs(Name ## Node); \
Name ## Node(DEVICEID_TYPE deviceId, const wstring & Name) : \
Base(deviceId, Name) \
{ } \
}
// Name Forward and Backward opcodes
DeclareUnaryElementWiseWithOpCodeNode(Sigmoid, Sigmoid, ElementwiseProductWithSigmoidDerivativeFromOutput, true);
DeclareUnaryElementWiseWithOpCodeNode(Tanh, Tanh, ElementwiseProductWithTanhDerivativeFromOutput, true);
DeclareUnaryElementWiseWithOpCodeNode(RectifiedLinear, LinearRectifier, ElementwiseProductWithLinearRectifierDerivativeFromOutput, true);
DeclareUnaryElementWiseWithOpCodeNode(Log, Log, ElementwiseProductWithLogDerivativeFromOutput, true);
DeclareUnaryElementWiseWithOpCodeNode(Exp, Exp, ElementwiseProduct, true);
DeclareUnaryElementWiseWithOpCodeNode(Cosine, Cosine, ElementwiseProductWithCosDerivative, false);
#pragma pop_macro("DeclareUnaryTensorOp")
#endif
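// To make the macro above concrete: the invocation
//   DeclareUnaryElementWiseWithOpCodeNode(Sigmoid, Sigmoid, ElementwiseProductWithSigmoidDerivativeFromOutput, true);
// declares, roughly, the following class (illustrative sketch of the expansion only; argument
// spelling simplified, the macro body above is authoritative):
//
//   template<class ElemType>
//   class SigmoidNode : public UnaryElementWiseWithOpCodeNodeBase<ElemType, opSigmoid, opElementwiseProductWithSigmoidDerivativeFromOutput, true /*gradient uses the output value*/>
//   {
//       typedef UnaryElementWiseWithOpCodeNodeBase<ElemType, opSigmoid, opElementwiseProductWithSigmoidDerivativeFromOutput, true> Base; UnaryElementWiseWithOpCodeNodeBaseMembers;
//       static const std::wstring TypeName() { return L"Sigmoid"; }
//   public:
//       DeclareConstructorFromConfigWithNumInputs(SigmoidNode);
//       SigmoidNode(DEVICEID_TYPE deviceId, const wstring & name) : Base(deviceId, name) { }
//   };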
// -----------------------------------------------------------------------
// SoftmaxNodeBase (input) -- shared base of Softmax and LogSoftmax
// -----------------------------------------------------------------------
// shared base for all element-wise non-linearities
// What this adds over a ComputationNode<ElemType> is a member m_gradientTemp for temp use by derived classes.
// TODO: This was used more broadly, but no longer, so we may be able to simplify the signatures of the virtual functions.
template<class ElemType>
class NonlinearityNodeBase : public ComputationNode<ElemType>, public NumInputs<1>
class SoftmaxNodeBase : public ComputationNode<ElemType>, public NumInputs<1>
{
typedef ComputationNode<ElemType> Base; UsingComputationNodeMembers;
public:
//virtual ComputationNodeBase * NewThis(DEVICEID_TYPE deviceId, const wstring & name) = 0;
DeclareConstructorFromConfigWithNumInputs(NonlinearityNodeBase);
NonlinearityNodeBase(DEVICEID_TYPE deviceId, const wstring & name) :
DeclareConstructorFromConfigWithNumInputs(SoftmaxNodeBase);
SoftmaxNodeBase(DEVICEID_TYPE deviceId, const wstring & name) :
Base(deviceId, name)
{ }
@ -54,7 +143,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
auto sliceOutputValue = OutputUsedInComputingInputNodesGradients() ? ValueFor(fr) : Matrix<ElemType>();
// do the actual operation
// TODO: Once all is unified then make the order of arguments more logical (in -> out)
BackpropToV(*m_gradientTemp, sliceInputValue, sliceInputGrad, sliceOutputGrad, sliceOutputValue);
}
@ -80,7 +168,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Base::CopyTo(nodeP, newName, flags);
if (flags & CopyNodeFlags::copyNodeValue)
{
auto node = dynamic_pointer_cast<NonlinearityNodeBase<ElemType>>(nodeP);
auto node = dynamic_pointer_cast<SoftmaxNodeBase<ElemType>>(nodeP);
*node->m_gradientTemp = *m_gradientTemp;
}
}
@ -102,296 +190,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
shared_ptr<Matrix<ElemType>> m_gradientTemp;
};
#define UsingNonlinearityNodeBaseMembers UsingComputationNodeMembersBoilerplate; using Base::m_gradientTemp
// -----------------------------------------------------------------------
// RectifiedLinearNode (input) -- ReLU non-linearity
// -----------------------------------------------------------------------
template<class ElemType>
class RectifiedLinearNode : public NonlinearityNodeBase<ElemType>
{
typedef NonlinearityNodeBase<ElemType> Base; UsingNonlinearityNodeBaseMembers;
static const std::wstring TypeName() { return L"RectifiedLinear"; }
public:
DeclareConstructorFromConfigWithNumInputs(RectifiedLinearNode);
RectifiedLinearNode(DEVICEID_TYPE deviceId, const wstring & name) :
NonlinearityNodeBase<ElemType>(deviceId, name)
{ }
void BackpropToV(Matrix<ElemType>& gradient, const Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues, const Matrix<ElemType>& functionValues) override
{
gradient.AssignLinearRectifierDerivativeOf(inputFunctionValues);
#if DUMPOUTPUT
inputGradientValues.Print("RecitifiedLinearNode-Partial-in");
#endif
inputGradientValues.AddElementProductOf(gradientValues, gradient);
#if DUMPOUTPUT
inputGradientValues.Print("RecitifiedLinearNode-Partial-out");
#endif
}
virtual bool OutputUsedInComputingInputNodesGradients() const override
{
// The ReLU node does not require its output value for computing
// the gradients of its input nodes
return false;
}
void ForwardPropV(Matrix<ElemType>& functionValues, const Matrix<ElemType>& inputFunctionValues) override
{
functionValues.AssignTruncateBottomOf(inputFunctionValues, 0);
#if DUMPOUTPUT
functionValues.Print("RectifiedLinearNode");
#endif
}
};
template class RectifiedLinearNode<float>;
template class RectifiedLinearNode<double>;
// -----------------------------------------------------------------------
// SigmoidNode (input) -- sigmoid non-linearity
// -----------------------------------------------------------------------
template<class ElemType>
class SigmoidNode : public NonlinearityNodeBase<ElemType>
{
typedef NonlinearityNodeBase<ElemType> Base; UsingNonlinearityNodeBaseMembers;
static const std::wstring TypeName() { return L"Sigmoid"; }
public:
DeclareConstructorFromConfigWithNumInputs(SigmoidNode);
SigmoidNode(DEVICEID_TYPE deviceId, const wstring & name) :
NonlinearityNodeBase<ElemType>(deviceId, name)
{ }
#ifdef ENABLE_TENSORVIEW
// TODO: Once tensor lib works, we will change all nodes in here to use it. Then move ForwardProp() and BackpropTo() from here into base.
virtual void /*ComputationNode::*/ForwardProp(const FrameRange & fr) override
{
size_t rank = DetermineElementwiseTensorRank();
auto result = ValueTensorFor(rank, fr);
auto input = Input(0)->ValueTensorFor(rank, fr);
ForwardPropV(input, result);
}
/*virtual*/ void ForwardPropV(const TensorView<ElemType>& input, TensorView<ElemType>& result) //override
{
result.AssignSigmoidOf(input);
}
virtual void /*IComputationNode::*/BeginBackprop() override // called before first iteration step of ComputeGradient()
{
m_gradientTemp->Resize(GetNumRows(), GetNumCols());
}
virtual void /*ComputationNode::*/BackpropTo(const size_t inputIndex, const FrameRange & fr) override
{
assert(inputIndex == 0); inputIndex;
// get the args
// Some do not consume input and/or output values. Don't touch those, pass dummies instead, since memshare may have taken them away already.
size_t rank = DetermineElementwiseTensorRank();
auto sliceOutputGrad = GradientTensorFor(rank, fr); // propagate from this one...
auto sliceInputGrad = Input(0)->GradientTensorFor(rank, fr); // ...to this one
auto sliceInputValue = InputUsedInComputingInputNodesGradients(0) ? Input(0)->ValueTensorFor(rank, fr) : TensorView<ElemType>();
auto sliceOutputValue = OutputUsedInComputingInputNodesGradients() ? ValueTensorFor(rank, fr) : TensorView<ElemType>();
// do the actual operation
// TODO: Once all is unified then make the order of arguments more logical (in -> out)
BackpropToV(DataTensorFor(*m_gradientTemp, rank, fr), sliceInputValue, sliceInputGrad, sliceOutputGrad, sliceOutputValue);
}
/*virtual*/ void BackpropToV(TensorView<ElemType> gradient, const TensorView<ElemType>& inputFunctionValues, TensorView<ElemType> inputGradientValues, const TensorView<ElemType>& gradientValues, const TensorView<ElemType>& functionValues)
{
gradient.AssignSigmoidDerivativeOf(inputFunctionValues);
inputGradientValues.AddElementwiseProductOf(gradientValues, gradient);
}
virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; }
#else
virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override
{
// The Sigmoid node does not require any of it's input's values for computing
// the gradients of its input nodes
UNREFERENCED_PARAMETER(childIndex);
return false;
}
#endif
/*virtual*/ void BackpropToV(Matrix<ElemType>& gradient, const Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues, const Matrix<ElemType>& functionValues)
{
gradient.AssignSigmoidDerivativeOf(functionValues);
inputGradientValues.AddElementProductOf(gradientValues, gradient);
}
/*virtual*/ void ForwardPropV(Matrix<ElemType>& functionValues, const Matrix<ElemType>& inputFunctionValues) override
{
functionValues.AssignSigmoidOf(inputFunctionValues);
}
};
template class SigmoidNode<float>;
template class SigmoidNode<double>;
// -----------------------------------------------------------------------
// TanhNode (input) -- tanh non-linearity
// -----------------------------------------------------------------------
template<class ElemType>
class TanhNode : public NonlinearityNodeBase<ElemType>
{
typedef NonlinearityNodeBase<ElemType> Base; UsingNonlinearityNodeBaseMembers;
static const std::wstring TypeName() { return L"Tanh"; }
public:
DeclareConstructorFromConfigWithNumInputs(TanhNode);
TanhNode(DEVICEID_TYPE deviceId, const wstring & name) :
NonlinearityNodeBase<ElemType>(deviceId, name)
{ }
virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override
{
// The plus node does not require any of it's input's values for computing
// the gradients of its input nodes
UNREFERENCED_PARAMETER(childIndex);
return false;
}
/*virtual*/ void BackpropToV(Matrix<ElemType>& gradient, const Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues, const Matrix<ElemType>& functionValues)
{
gradient.AssignElementProductOf(functionValues, functionValues); // v .* v
gradient.AssignDifferenceOf(1, gradient); // 1-v^2
inputGradientValues.AddElementProductOf(gradientValues, gradient); // += d .* ((1-v) .* v))
}
/*virtual*/ void ForwardPropV(Matrix<ElemType>& functionValues, const Matrix<ElemType>& inputFunctionValues) override
{
functionValues.AssignTanhOf(inputFunctionValues);
}
};
template class TanhNode<float>;
template class TanhNode<double>;
// -----------------------------------------------------------------------
// LogNode (input) -- component-wise log() of input
// -----------------------------------------------------------------------
template<class ElemType>
class LogNode : public NonlinearityNodeBase<ElemType>
{
typedef NonlinearityNodeBase<ElemType> Base; UsingNonlinearityNodeBaseMembers;
static const std::wstring TypeName() { return L"Log"; }
public:
DeclareConstructorFromConfigWithNumInputs(LogNode);
LogNode(DEVICEID_TYPE deviceId, const wstring & name) :
NonlinearityNodeBase<ElemType>(deviceId, name)
{ }
virtual bool OutputUsedInComputingInputNodesGradients() const override
{
// The plus node does not require its output value for computing
// the gradients of its input nodes
return false;
}
/*virtual*/ void BackpropToV(Matrix<ElemType>& gradient, const Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues, const Matrix<ElemType>& functionValues)
{
gradient.AssignElementInverseOf(inputFunctionValues); // 1/x (x is input to log(x))
inputGradientValues.AddElementProductOf(gradientValues, gradient);
}
/*virtual*/ void ForwardPropV(Matrix<ElemType>& functionValues, const Matrix<ElemType>& inputFunctionValues) override
{
functionValues.AssignLogOf(inputFunctionValues);
}
};
template class LogNode<float>;
template class LogNode<double>;
// -----------------------------------------------------------------------
// ExpNode (input) -- component-wise exp() of input
// -----------------------------------------------------------------------
template<class ElemType>
class ExpNode : public NonlinearityNodeBase<ElemType>
{
typedef NonlinearityNodeBase<ElemType> Base; UsingNonlinearityNodeBaseMembers;
static const std::wstring TypeName() { return L"Exp"; }
public:
DeclareConstructorFromConfigWithNumInputs(ExpNode);
ExpNode(DEVICEID_TYPE deviceId, const wstring & name) :
NonlinearityNodeBase<ElemType>(deviceId, name)
{ }
virtual void /*ComputationNode::*/BackpropTo(const size_t inputIndex, const FrameRange & fr) override
{
assert(inputIndex == 0); inputIndex;
Matrix<ElemType> sliceInputGrad = Input(0)->GradientFor(fr);
Matrix<ElemType> sliceOutputGrad = GradientFor(fr);
Matrix<ElemType> sliceInputValue = Input(0)->ValueFor(fr);
m_gradientTemp->AssignExpOf(sliceInputValue); // Exp(x) is its own partial
sliceInputGrad.AddElementProductOf(sliceOutputGrad, *m_gradientTemp);
}
virtual bool OutputUsedInComputingInputNodesGradients() const override
{
// The ExpNode does not require its output value for computing
// the gradients of its input nodes
return false;
}
virtual void BackpropToV(Matrix<ElemType>& gradient, const Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues, const Matrix<ElemType>& functionValues) override { NOT_IMPLEMENTED; } // not needed
void ForwardPropV(Matrix<ElemType>& functionValues, const Matrix<ElemType>& inputFunctionValues) override
{
functionValues.AssignExpOf(inputFunctionValues);
}
};
template class ExpNode<float>;
template class ExpNode<double>;
// -----------------------------------------------------------------------
// CosineNode (input) -- component-wise cos() of input
// -----------------------------------------------------------------------
template<class ElemType>
class CosineNode : public NonlinearityNodeBase<ElemType>
{
typedef NonlinearityNodeBase<ElemType> Base; UsingNonlinearityNodeBaseMembers;
static const std::wstring TypeName() { return L"Cosine"; }
public:
DeclareConstructorFromConfigWithNumInputs(CosineNode);
CosineNode(DEVICEID_TYPE deviceId, const wstring & name) :
NonlinearityNodeBase<ElemType>(deviceId, name)
{ }
virtual bool OutputUsedInComputingInputNodesGradients() const override
{
// The CosineNode does not require its output value for computing
// the gradients of its input nodes
return false;
}
/*virtual*/ void BackpropToV(Matrix<ElemType>& gradient, const Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues, const Matrix<ElemType>& functionValues)
{
gradient.AssignNegativeSineOf(inputFunctionValues); // -sin(x) (x is input to Cosine(x))
inputGradientValues.AddElementProductOf(gradientValues, gradient);
}
/*virtual*/ void ForwardPropV(Matrix<ElemType>& functionValues, const Matrix<ElemType>& inputFunctionValues) override
{
functionValues.AssignCosineOf(inputFunctionValues);
}
};
template class CosineNode<float>;
template class CosineNode<double>;
#define UsingSoftmaxNodeBaseMembers UsingComputationNodeMembersBoilerplate; using Base::m_gradientTemp
// -----------------------------------------------------------------------
// SoftmaxNode (input) -- soft-max over input vector(s)
@ -400,14 +199,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
//we assume it's column-wise by default
//the derivative will increase the Matrix<ElemType> size to the power of column size and should not be used.
template<class ElemType>
class SoftmaxNode : public NonlinearityNodeBase<ElemType>
class SoftmaxNode : public SoftmaxNodeBase<ElemType>
{
typedef NonlinearityNodeBase<ElemType> Base; UsingNonlinearityNodeBaseMembers;
typedef SoftmaxNodeBase<ElemType> Base; UsingSoftmaxNodeBaseMembers;
static const std::wstring TypeName() { return L"Softmax"; }
public:
DeclareConstructorFromConfigWithNumInputs(SoftmaxNode);
SoftmaxNode(DEVICEID_TYPE deviceId, const wstring & name) :
NonlinearityNodeBase<ElemType>(deviceId, name)
Base(deviceId, name)
{ }
virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override
@ -467,14 +266,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// -----------------------------------------------------------------------
template<class ElemType>
class LogSoftmaxNode : public NonlinearityNodeBase<ElemType>
class LogSoftmaxNode : public SoftmaxNodeBase<ElemType>
{
typedef NonlinearityNodeBase<ElemType> Base; UsingNonlinearityNodeBaseMembers;
typedef SoftmaxNodeBase<ElemType> Base; UsingSoftmaxNodeBaseMembers;
static const std::wstring TypeName() { return L"LogSoftmax"; }
public:
DeclareConstructorFromConfigWithNumInputs(LogSoftmaxNode);
LogSoftmaxNode(DEVICEID_TYPE deviceId, const wstring & name) :
NonlinearityNodeBase<ElemType>(deviceId, name)
Base(deviceId, name)
{ }
virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override
@ -1040,9 +839,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// this node is not differentiable and so cannot be used in the backpropagation
// TODO: make function value sparse?
template<class ElemType>
class HardmaxNode : public NonlinearityNodeBase/*ComputationNode*/<ElemType>
class HardmaxNode : public SoftmaxNodeBase/*ComputationNode*/<ElemType>
{
typedef NonlinearityNodeBase<ElemType> Base; UsingNonlinearityNodeBaseMembers;
typedef SoftmaxNodeBase<ElemType> Base; UsingSoftmaxNodeBaseMembers;
static const std::wstring TypeName() { return L"Hardmax"; }
public:

View file

@ -5,6 +5,11 @@
//
#pragma once
#include "Basics.h"
#include "Matrix.h"
#include "TensorShape.h"
#include "ComputationNode.h"
#include <unordered_set>
#include <map>
#include <string>
@ -18,10 +23,6 @@
#include <sstream>
#include <iostream>
#include "Basics.h"
#include "Matrix.h"
#include "ComputationNode.h"
namespace Microsoft { namespace MSR { namespace CNTK {
// -----------------------------------------------------------------------
@ -86,33 +87,31 @@ namespace Microsoft { namespace MSR { namespace CNTK {
typedef std::shared_ptr<DelayedValueNodeState<ElemType>> DelayedNodeStatePtr;
static const std::wstring TypeName() { return L"DelayedValue"; }
private:
void Init(size_t row_size, size_t col_size, ElemType initialActivationValue = (ElemType)DEFAULT_HIDDEN_ACTIVATION)
void Init(const TensorShape & sampleLayout, ElemType initialActivationValue)
{
m_initialActivationValue = initialActivationValue;
m_timeStep = 1;
CreateMatrixIfNull(m_value);
SetDims(TensorShape(row_size), col_size); // TODO: needed? Can we not infer it? How about setting a sample layout?
m_isHistoryCarryOverManagedExternally = false; // used for PairNetworkNode/PastValueNode combination
SetDims(sampleLayout, 0); // TODO: needed? Can we not infer it? How about setting a sample layout?
m_isHistoryCarryOverManagedExternally = false; // used for PairNetworkNode/PastValueNode combination, which is deprecated
m_value->SetValue(m_initialActivationValue); // is this needed?
}
protected:
DelayedValueNodeBase(DEVICEID_TYPE deviceId, const wstring & name) :
Base(deviceId, name),
m_delayedActivation(deviceId)
{
Init(1, 1);
Init(TensorShape(), (ElemType)DEFAULT_HIDDEN_ACTIVATION);
}
DelayedValueNodeBase(DEVICEID_TYPE deviceId, const wstring & name, ElemType initialActivationValue, size_t row_size, size_t col_size, size_t timeStep) :
DelayedValueNodeBase(DEVICEID_TYPE deviceId, const wstring & name, ElemType initialActivationValue, const TensorShape & sampleLayout, size_t timeStep) :
Base(deviceId, name),
m_delayedActivation(deviceId)
{
Init(row_size, col_size, initialActivationValue);
m_timeStep = (int)timeStep;
m_value->SetValue(m_initialActivationValue);
Init(sampleLayout, initialActivationValue);
m_timeStep = (int)timeStep; // TODO: pass this to Init() instead as well
}
DelayedValueNodeBase(const ScriptableObjects::IConfigRecordPtr configp) :
DelayedValueNodeBase(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"defaultHiddenActivation"), configp->Get(L"rows"), configp->Get(L"cols"), configp->Get(L"timeStep"))
DelayedValueNodeBase(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"defaultHiddenActivation"), configp->Get(L"shape"), configp->Get(L"timeStep"))
{
// We do NOT attach the inputs, as we cannot resolve them without causing a circular reference.
// Instead, we capture them in a lambda, which will be called by ComputationNetwork during the build process through LateAttachInputs() below.
@ -593,8 +592,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
PastValueNode(DEVICEID_TYPE deviceId, const wstring & name) :
Base(deviceId, name)
{ }
PastValueNode(DEVICEID_TYPE deviceId, const wstring & name, ElemType initialActivationValue, size_t row_size, size_t col_size, size_t timeStep) :
Base(deviceId, name, initialActivationValue, row_size, col_size, timeStep)
PastValueNode(DEVICEID_TYPE deviceId, const wstring & name, ElemType initialActivationValue, const TensorShape & sampleLayout, size_t timeStep) :
Base(deviceId, name, initialActivationValue, sampleLayout, timeStep)
{ }
PastValueNode(DEVICEID_TYPE deviceId, const wstring & name, ElemType initialActivationValue, size_t numRows, size_t timeStep) :
PastValueNode(deviceId, name, initialActivationValue, TensorShape(numRows), timeStep)
{ }
PastValueNode(const ScriptableObjects::IConfigRecordPtr configp) :
Base(configp)
@ -619,8 +621,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
FutureValueNode(DEVICEID_TYPE deviceId, const wstring & name) :
Base(deviceId, name)
{ }
FutureValueNode(DEVICEID_TYPE deviceId, const wstring & name, ElemType initialActivationValue, size_t row_size, size_t col_size, size_t timeStep) :
Base(deviceId, name, initialActivationValue, row_size, col_size, timeStep)
FutureValueNode(DEVICEID_TYPE deviceId, const wstring & name, ElemType initialActivationValue, const TensorShape & sampleLayout, size_t timeStep) :
Base(deviceId, name, initialActivationValue, sampleLayout, timeStep)
{ }
FutureValueNode(DEVICEID_TYPE deviceId, const wstring & name, ElemType initialActivationValue, size_t numRows, size_t timeStep) :
FutureValueNode(deviceId, name, initialActivationValue, TensorShape(numRows), timeStep)
{ }
FutureValueNode(const ScriptableObjects::IConfigRecordPtr configp) :
Base(configp)

View file

@ -126,8 +126,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
#define UsingReinterpretNodeBaseMembers UsingComputationNodeMembersBoilerplate
// TODO: This ReshapeNode is currently not used. Its function will be taken over by Transpose and the Reshape that follows this one below.
// -----------------------------------------------------------------------
// ReshapeNode (input) -- reinterpret input matrix as having different dimensions
// DeprecatedReshapeNode (input) -- reinterpret input matrix as having different dimensions
// where the new row dimension is given, and the column dimension is inferred.
// Also optionally associate a different TensorShape with the data.
//
@ -149,7 +151,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// E.g. ReinterpretRowStackAsSequence and ReinterpretSequenceAsRowStack.
// BUGBUG: This is not actually implemented yet. Instead, it goes from 1 to K steps or from K to 1 step. This is temporary/experimental, until the plumbing for nesting is there.
//
// Thirdly, ReshapeNode can also be used to update only the TensorShape. In that case, the MBLayout is kept as is.
// Thirdly, DeprecatedReshapeNode can also be used to update only the TensorShape. In that case, the MBLayout is kept as is.
//
// Note: The new row dimension must be a straight multiple or divisor of the current row dimension.
// To reshape to a non-multiple go to row dim 1 first.
@ -159,19 +161,20 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// -----------------------------------------------------------------------
template<class ElemType>
class ReshapeNode : public ReinterpretNodeBase<ElemType>
class DeprecatedReshapeNode : public ReinterpretNodeBase<ElemType>
{
typedef ReinterpretNodeBase<ElemType> Base; UsingReinterpretNodeBaseMembers;
static const std::wstring TypeName() { return L"Reshape"; }
static const std::wstring TypeName() { return L"DeprecatedReshape"; }
public:
ReshapeNode(DEVICEID_TYPE deviceId, const wstring & name, size_t numRows = 0, const TensorShape & imageLayout = TensorShape()) :
DeprecatedReshapeNode(DEVICEID_TYPE deviceId, const wstring & name, size_t numRows = 0, const TensorShape & imageLayout = TensorShape()) :
Base(deviceId, name),
m_numTargetRows(numRows),
m_targetImageLayout(imageLayout)
{ }
ReshapeNode(const ScriptableObjects::IConfigRecordPtr configp) :
ReshapeNode(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"numRows"), ImageLayoutWHC(configp->Get(L"imageWidth"), configp->Get(L"imageHeight"), configp->Get(L"imageChannels")))
DeprecatedReshapeNode(const ScriptableObjects::IConfigRecordPtr configp) :
DeprecatedReshapeNode(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"numRows"), ImageDimensions::AsTensorShape(configp->Get(L"imageWidth"), configp->Get(L"imageHeight"), configp->Get(L"imageChannels"), ImageLayoutKind::HWC/*legacy*/))
{
// BUGBUG: We should not operate on image layouts here, but on a proper tensor layout.
AttachInputs(configp, this->GetExpectedNumInputs());
}
@ -180,7 +183,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Base::CopyTo(nodeP, newName, flags);
if (flags & CopyNodeFlags::copyNodeValue)
{
auto node = dynamic_pointer_cast<ReshapeNode<ElemType>>(nodeP);
auto node = dynamic_pointer_cast<DeprecatedReshapeNode<ElemType>>(nodeP);
node->m_numTargetRows = m_numTargetRows;
node->m_targetImageLayout = m_targetImageLayout;
}
@ -197,7 +200,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
Base::Load(fstream, modelVersion);
fstream >> m_numTargetRows;
m_targetImageLayout.Load(fstream);
m_targetImageLayout.Load(fstream, /*acceptLegacyFormat=*/true);
}
virtual void /*IComputationNode::*/PrintSelfBeforeValidation() const override
@ -214,7 +217,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
else
fprintf(stderr, "%ls[%lu, %lu]", child->NodeName().c_str(), child->GetNumRows(), child->GetNumCols());
}
fprintf(stderr, ", NumOfRows=%lu, imageWidth=%lu, imageHeight=%lu, imageChannels=%lu)", m_numTargetRows, m_targetImageLayout.GetWidth(), m_targetImageLayout.GetHeight(), m_targetImageLayout.GetNumChannels());
fprintf(stderr, ", NumOfRows=%lu, imageWidth=%lu, imageHeight=%lu, imageChannels=%lu)", m_numTargetRows, m_targetImageLayout[1], m_targetImageLayout[2], m_targetImageLayout[0]);
// BUGBUG: This interpretation as image dims is only correct for the 'legacy' format, not for cuDNN.
}
virtual void /*ComputationNodeBase::*/Validate(bool isFinalValidationPass) override
@ -247,7 +251,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// setting any dimension to 0 means lose the tensor, flatten to vector
// TODO: We can use 0 to indicate "infer". One value can be 0. It will be filled in to match row dim.
if (m_targetImageLayout.GetWidth() == 0 || m_targetImageLayout.GetHeight() == 0 || m_targetImageLayout.GetNumChannels() == 0)
if (m_targetImageLayout[1] == 0 || m_targetImageLayout[2] == 0 || m_targetImageLayout[0] == 0)
{
if (Input(0)->HasSampleLayout())
fprintf(stderr, "WARNING: Reshape operation cannot inherit image size information from its child. Image size info is lost.\n");
@ -257,7 +261,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
else
{
if (m_numTargetRows != m_targetImageLayout.GetNumElements())
LogicError("ReshapeNode: InferTargetSampleLayout() computed a sample layout [%s] that mismatches m_numTargetRows %d.", string(m_targetImageLayout).c_str(), (int)m_numTargetRows);
LogicError("DeprecatedReshapeNode: InferTargetSampleLayout() computed a sample layout [%s] that mismatches m_numTargetRows %d.", string(m_targetImageLayout).c_str(), (int)m_numTargetRows);
SetDims(m_targetImageLayout, newCols);
}
}
@ -289,7 +293,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
// going from many samples to one: layout entry will get no flags
if (Input(0)->GetNumTimeSteps() * Input(0)->GetNumRows() / m_numTargetRows != 1)
LogicError("ReshapeNode::BeginForwardProp() faking to remove a nested time dimension only works when going back to a single frame per sequence.");
LogicError("DeprecatedReshapeNode::BeginForwardProp() faking to remove a nested time dimension only works when going back to a single frame per sequence.");
// we are in frame mode now
m_pMBLayout->InitAsFrameMode(Input(0)->GetNumParallelSequences());
}
@ -297,7 +301,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
// going from one sample to many: layout will get SentenceStart/SentenceEnd flags for the sequence we expand into
if (Input(0)->GetMBLayout()->GetNumTimeSteps() != 1)
LogicError("ReshapeNode::BeginForwardProp() faking to add a nested time dimension only works when coming from a single frame per sequence.");
LogicError("DeprecatedReshapeNode::BeginForwardProp() faking to add a nested time dimension only works when coming from a single frame per sequence.");
m_pMBLayout->Init(Input(0)->GetNumParallelSequences(), Input(0)->GetNumTimeSteps() * Input(0)->GetNumRows() / m_numTargetRows);
for (size_t s = 0; s < m_pMBLayout->GetNumParallelSequences(); s++)
m_pMBLayout->AddSequence(NEW_SEQUENCE_ID, s, 0, m_pMBLayout->GetNumTimeSteps());
@ -325,7 +329,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// layout case: reshape semantics happen across parallel sequences, i.e. require data shuffling
else
{
// TODO: It does not make sense to run ReshapeNode frame-by-frame inside a loop, because it changes the time base.
// TODO: It does not make sense to run DeprecatedReshapeNode frame-by-frame inside a loop, because it changes the time base.
// However, in the future, we should be able to run inside an outer loop.
if (!fr.IsAllFrames())
InvalidArgument("%ls %ls operation cannot be run from inside a loop since it changes the time base.", NodeName().c_str(), OperationName().c_str());
@ -358,14 +362,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
virtual bool OutputUsedInComputingInputNodesGradients() const override
{
// The ReshapeNode does not require its output value for computing
// The DeprecatedReshapeNode does not require its output value for computing
// the gradients of its input nodes
return false;
}
virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override
{
// The ReshapeNode does not require any of it's input's values for computing
// The DeprecatedReshapeNode does not require any of its inputs' values for computing
// the gradients of its input nodes
UNREFERENCED_PARAMETER(childIndex);
return false;
@ -377,35 +381,39 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t factor() const { return m_numTargetRows > Input(0)->GetNumRows() ? m_numTargetRows / Input(0)->GetNumRows() : Input(0)->GetNumRows() / m_numTargetRows; } // factor by which we stack or unstack
TensorShape m_targetImageLayout;
// this patches up m_targetImageLayout according to some rules
// TODO: Say in one sentence what this logic does.
// This infers dimensions in m_targetImageLayout.
// Users are allowed to provide 2 (out of 3) image dimensions.
// One missing dimension can be inferred. If two dimensions are
// unspecified it throws a runtime error.
// TODO: Generalize this to any number of dimensions.
void InferTargetSampleLayout()
{
if (m_targetImageLayout.GetWidth() > 0)
// BUGBUG: Below is the result of refactoring and only works for rank-3 tensors. Generalize.
if (m_targetImageLayout[1] > 0)
{
if (m_targetImageLayout.GetHeight() > 0)
if (m_targetImageLayout[2] > 0)
{
if (m_targetImageLayout.GetNumChannels() > 0)
if (m_targetImageLayout[0] > 0)
{
if (m_targetImageLayout.GetNumElements() != m_numTargetRows)
RuntimeError("Image dimensions do not match row size.");
}
else
{
if (m_numTargetRows % (m_targetImageLayout.GetWidth() * m_targetImageLayout.GetHeight()) > 0)
if (m_numTargetRows % (m_targetImageLayout[1] * m_targetImageLayout[2]) > 0)
RuntimeError("Image row size is not a multiple of specified image dimensions.");
else
m_targetImageLayout = ImageLayoutWHC(m_targetImageLayout.GetWidth(), m_targetImageLayout.GetHeight(), m_numTargetRows / (m_targetImageLayout.GetWidth() * m_targetImageLayout.GetHeight()));
m_targetImageLayout = TensorShape(m_numTargetRows / (m_targetImageLayout[1] * m_targetImageLayout[2]), m_targetImageLayout[1], m_targetImageLayout[2]);
}
}
else
{
if (m_targetImageLayout.GetNumChannels() > 0)
if (m_targetImageLayout[0] > 0)
{
if (m_numTargetRows % (m_targetImageLayout.GetWidth() * m_targetImageLayout.GetNumChannels()) > 0)
if (m_numTargetRows % (m_targetImageLayout[1] * m_targetImageLayout[0]) > 0)
RuntimeError("Image row size is not a multiple of specified image dimensions.");
else
m_targetImageLayout = ImageLayoutWHC(m_targetImageLayout.GetWidth(), m_numTargetRows / (m_targetImageLayout.GetWidth() * m_targetImageLayout.GetNumChannels()), m_targetImageLayout.GetNumChannels());
m_targetImageLayout = TensorShape(m_targetImageLayout[0], m_targetImageLayout[1], m_numTargetRows / (m_targetImageLayout[1] * m_targetImageLayout[0]));
}
else
{
@ -415,26 +423,173 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
else
{
if (m_targetImageLayout.GetHeight() > 0)
if (m_targetImageLayout[2] > 0)
{
if (m_targetImageLayout.GetNumChannels() > 0)
if (m_targetImageLayout[0] > 0)
{
if (m_numTargetRows % (m_targetImageLayout.GetHeight() * m_targetImageLayout.GetNumChannels()) > 0)
if (m_numTargetRows % (m_targetImageLayout[2] * m_targetImageLayout[0]) > 0)
RuntimeError("Image row size is not a multiple of specified image dimensions.");
else
m_targetImageLayout = ImageLayoutWHC(m_numTargetRows / (m_targetImageLayout.GetHeight() * m_targetImageLayout.GetNumChannels()), m_targetImageLayout.GetHeight(), m_targetImageLayout.GetNumChannels());
m_targetImageLayout = TensorShape(m_targetImageLayout[0], m_numTargetRows / (m_targetImageLayout[2] * m_targetImageLayout[0]), m_targetImageLayout[2]);
}
else
RuntimeError("At least two image dimensions must be specified.");
}
else if (m_targetImageLayout.GetNumChannels() > 0)
else if (m_targetImageLayout[0] > 0)
RuntimeError("At least two image dimensions must be specified.");
else
m_targetImageLayout = ImageLayoutWHC(m_numTargetRows, 1, 1);
m_targetImageLayout = TensorShape(1, m_numTargetRows, 1);
}
}
};
template class DeprecatedReshapeNode<float>;
template class DeprecatedReshapeNode<double>;
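[Editor's note] To make the inference rule in InferTargetSampleLayout() concrete, here is a minimal standalone sketch (plain C++ with hypothetical values, not CNTK code): given the target row count and two of the three image dimensions, the third dimension is inferred as the quotient, and the row count must be an exact multiple of the product of the given dimensions:
#include <cstddef>
#include <cstdio>
#include <stdexcept>
int main()
{
    const size_t numTargetRows = 640 * 480 * 3;  // hypothetical target row count
    const size_t width = 640, height = 480;      // two dims given; channels left unspecified (0)
    if (numTargetRows % (width * height) != 0)
        throw std::runtime_error("Image row size is not a multiple of specified image dimensions.");
    const size_t channels = numTargetRows / (width * height);  // inferred third dimension -> 3
    std::printf("inferred channels = %zu\n", channels);
    return 0;
}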
// -----------------------------------------------------------------------
// Reshape(x, tensorShape, beginDim=0, endDim=0) -- reinterpret input samples as having different tensor dimensions
// - just replaces metadata m_sampleLayout, does not change data values
// - one dimension may be specified as 0 and will be inferred
// - optional beginDim/endDim denote to only replace a sub-range of dims, for implementing ReshapeDimension() and FlattenRank()
// - may not be applied to time; use Permute() or Transpose()
//
// Derived operations:
//
// ReshapeDimension(x, dim, tensorShape) = Reshape(x, tensorShape, beginDim=dim, endDim=dim+1)
// - reinterprets one dimension as multiple, where the number of elements remains the same
// - one of the new dimensions may be specified as 0 and will be inferred
//
// FlattenDimensions(x, dim, num) = Reshape(x, 0, beginDim=dim, endDim=dim+num)
// - replace two or more consecutive dims by a single dim with the same number of elements
//
// SplitDimension(x, dim, N) = ReshapeDimension(x, dim, 0:N)
// - splits a dimension in two, injecting the second factor as a new tensor dimension
// - to split stacked frames into a new time dimension:
// insert new time dim with ReshapeDimension(., -1, 0:1), SplitDimension(., dim, N), Transpose(., dim+1, -1), then Select(., dim+1, 0) away the new time dim
// This would make 4 copies presently. We may need a compound C++ node for now.
// - note: to split into multiple outputs (like tf.split()), use a BrainScript loop with Slice().
// -----------------------------------------------------------------------
template<class ElemType>
class ReshapeNode : public UnaryElementWiseNode<ElemType>
{
typedef UnaryElementWiseNode<ElemType> Base; UsingUnaryElementwiseNodeBaseMembers;
static const std::wstring TypeName() { return L"Reshape"; }
public:
ReshapeNode(DEVICEID_TYPE deviceId, const wstring & name, const TensorShape & replacementSampleLayout = TensorShape(), int beginDim = 1, int endDim = 0) :
Base(deviceId, name),
m_replacementSampleLayout(replacementSampleLayout), m_beginDimParameter(beginDim), m_endDimParameter(endDim)
{ }
ReshapeNode(const ScriptableObjects::IConfigRecordPtr configp) :
ReshapeNode(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"shape"), configp->Get(L"beginDim"), configp->Get(L"endDim"))
{
AttachInputs(configp, this->GetExpectedNumInputs());
}
virtual void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override
{
Base::CopyTo(nodeP, newName, flags);
if (flags & CopyNodeFlags::copyNodeValue)
{
auto node = dynamic_pointer_cast<ReshapeNode<ElemType>>(nodeP);
node->m_replacementSampleLayout = m_replacementSampleLayout;
}
}
virtual void Save(File& fstream) const override
{
Base::Save(fstream);
fstream << m_beginDimParameter << m_endDimParameter;
m_replacementSampleLayout.Save(fstream);
}
virtual void Load(File& fstream, size_t modelVersion) override
{
Base::Load(fstream, modelVersion);
fstream >> m_beginDimParameter >> m_endDimParameter;
m_replacementSampleLayout.Load(fstream);
}
virtual void /*ComputationNodeBase::*/Validate(bool isFinalValidationPass) override
{
Base::Validate(isFinalValidationPass);
// BUGBUG: For inputs without MBLayout, the sample layout should include the column dimension, but it does not currently. Needs to be fleshed out.
const auto & inputSampleLayout = Input(0)->GetSampleLayout();
const auto & inputDims = inputSampleLayout.GetDims();
auto replacementDims = m_replacementSampleLayout.GetDims();
size_t beginDim = m_beginDimParameter > 0 ? m_beginDimParameter - 1 : 0;
size_t endDim = m_endDimParameter > 0 ? m_endDimParameter - 1 : inputDims.size();
if (!isFinalValidationPass) // non-final: be tolerant, no errors
{
if (endDim > inputDims.size())
endDim = inputDims.size();
if (beginDim > endDim)
beginDim = endDim;
}
// TODO: We should allow reducing to a 0-length tensor if the dimension is 0
// if a dimension is specified as zero then infer it, otherwise verify that total #elements matches
size_t inputElements = 1; // get #elements in range to be replaced
for (size_t k = beginDim; k < endDim; k++)
inputElements *= inputDims[k];
size_t targetElements = 1; // check/infer #elements to replace with
size_t zeroIndex = SIZE_MAX;
for (size_t k = 0; k < replacementDims.size(); k++)
{
if (replacementDims[k] != 0)
targetElements *= replacementDims[k];
else if (zeroIndex == SIZE_MAX)
zeroIndex = k;
else
InvalidArgument("%ls %ls operation: More than one dimension was specified as zero in the replacement (sub-)dimensions [%s]", NodeName().c_str(), OperationName().c_str(), string(m_replacementSampleLayout).c_str());
}
if (zeroIndex != SIZE_MAX)
replacementDims[zeroIndex] = inputElements / targetElements; // infer the number (ignore errors at this point)
// assemble actual full dimension vector
SmallVector<size_t> dims;
dims.append(inputDims.begin(), inputDims.begin() + beginDim);
dims.append(replacementDims.begin(), replacementDims.end());
dims.append(inputDims.begin() + endDim, inputDims.end());
auto sampleLayout = TensorShape(dims);
// validate total dimension
if (isFinalValidationPass && inputSampleLayout.GetNumElements() != sampleLayout.GetNumElements())
{
auto subShape = TensorShape(std::vector<size_t>(inputDims.begin() + beginDim, inputDims.begin() + endDim));
InvalidArgument("%ls %ls operation: Input (sub-)dimensions [%s] incompatible with desired (sub-)dimensions [%s]. Number of elements %s.",
NodeName().c_str(), OperationName().c_str(),
string(subShape).c_str(), string(m_replacementSampleLayout).c_str(),
zeroIndex == SIZE_MAX ? "must be the same" : "is not an integer multiple of the non-0 dimensions");
}
// that's it
SetDims(sampleLayout, 0); // BUGBUG: This is incorrect if we have no MBLayout, e.g. reshaping a bias vector into a different tensor dimension
}
virtual void /*ComputationNode::*/ForwardProp(const FrameRange & fr) override
{
ValueFor(fr).SetValue(Input(0)->ValueFor(fr));
}
virtual void /*ComputationNode::*/BackpropTo(const size_t inputIndex, const FrameRange & fr) override
{
Input(inputIndex)->GradientFor(fr).SetValue(GradientFor(fr));
}
virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; }
virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; }
private:
TensorShape m_replacementSampleLayout; // user-specified dimensions to replace dimensions [beginDim, endDim]
int m_beginDimParameter; // 1-based index range as specified
int m_endDimParameter;
};
template class ReshapeNode<float>;
template class ReshapeNode<double>;
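[Editor's note] The dimension handling in Validate() above can be summarized with a small standalone sketch (plain C++, not CNTK code; 0-based dims here, whereas the node's beginDim/endDim parameters are 1-based): the sub-range of input dims is replaced by the replacement dims, with a single 0 entry inferred so the total element count is preserved:
#include <vector>
#include <cstdint>
#include <cstdio>
int main()
{
    std::vector<size_t> inputDims   = { 2, 3, 4 };   // hypothetical input sample layout [2 x 3 x 4]
    std::vector<size_t> replacement = { 0, 6 };      // replace the sub-range by [? x 6]; 0 = infer
    const size_t beginDim = 0, endDim = 2;           // 0-based sub-range [0,2) = dims {2, 3}
    size_t inputElements = 1;                        // #elements in the range being replaced
    for (size_t k = beginDim; k < endDim; k++)
        inputElements *= inputDims[k];               // 2*3 = 6
    size_t targetElements = 1, zeroIndex = SIZE_MAX; // check/infer #elements to replace with
    for (size_t k = 0; k < replacement.size(); k++)
    {
        if (replacement[k] != 0) targetElements *= replacement[k];
        else                     zeroIndex = k;
    }
    if (zeroIndex != SIZE_MAX)
        replacement[zeroIndex] = inputElements / targetElements;  // inferred: 6/6 = 1
    std::vector<size_t> dims(inputDims.begin(), inputDims.begin() + beginDim);
    dims.insert(dims.end(), replacement.begin(), replacement.end());
    dims.insert(dims.end(), inputDims.begin() + endDim, inputDims.end());
    for (size_t d : dims) std::printf("%zu ", d);    // prints "1 6 4": same 24 elements, new layout
    std::printf("\n");
    return 0;
}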
@ -811,4 +966,196 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template class RowRepeatNode<float>;
template class RowRepeatNode<double>;
/*
notes on tensor operations
==========================
reshaping
---------
- on dimension index 'dim' and 'tensorShape'
- tensorShape: a vector of dimensions, e.g. 640:480:3:30 could describe a 1-second RGB video of VGA dimensions at 30 fps
- 'dim' specifies a specific tensor index
- dim > 0 is a regular sample index. E.g. for a matrix, dim=1 would be the row dimension, and dim=2 in the above example has dimension 480.
- dim < 0 denotes time indices (recurrent loops). dim=-1 is the innermost time index.
- dim = 0 denotes the index of the parallel sequence
- Since all operations logically operate on a single sequence, parallel sequences generally cannot be indexed by the user.
- Exceptions: training criteria, BatchNormalization, ...WithNegativeSamples (we should not need this)
- I don't like that 'dim' refers to the index of the dimension as well as the number of elements in that dimension. Axis (numpy)?
- Reshaping: --these are all implemented in C++ by DeprecatedReshapeNode
- Reshape(x, tensorShape, beginDim=0, endDim=0)
- just replaces metadata m_sampleLayout
- one dimension may be specified as 0 and will be inferred
- optional beginDim/endDim denote to only replace a sub-range of dims, for implementing ReshapeDimension() and FlattenRank()
- may not be applied to time; use Permute() or Transpose()
- ReshapeDimension(x, dim, tensorShape) = Reshape(x, tensorShape, beginDim=dim, endDim=dim+1)
- reinterprets one dimension as multiple, where the number of elements remains the same
- one of the new dimensions may be specified as 0 and will be inferred
- FlattenDimensions(x, dim, num) = Reshape(x, 0, beginDim=dim, endDim=dim+num)
- replace two or more consecutive dims by a single dim with the same number of elements
- SplitDimension(x, dim, N) = ReshapeDimension(x, dim, 0:N)
- splits a dimension in two, injecting the second factor as a new tensor dimension
- to split stacked frames into a new time dimension:
insert new time dim with ReshapeDimension(., -1, 0:1), SplitDimension(., dim, N), Transpose(., dim+1, -1), then Select(., dim+1, 0) away the new time dim
This would make 4 copies presently. We may need a compound C++ node for now.
- note: to split into multiple outputs (like tf.split()), use a BrainScript loop with Slice().
- Slicing --all implemented in C++ by SliceNode
- Slice(x, dim, begin, end, stride=1, phase=0)
- reduces a dim to index range [begin,end)
- negative bounds specify "from end" (end=0 means end if stride>0, and begin=0 means end if stride<0)
- also applies to time, e.g.:
- pick last frame of a sequence (for s2s): Slice(x, -1, -1, 0) // first -1 is dim and means the time index
- trim first and last 3 frames of a sequence: Slice(x, -1, 3, -3) // 3 means begin at frame 3, -3 means end is 3rd frame from the end
- this will update MBLayout
- the optional stride and phase parameters are for implementing downsampling (stride>1) and reversing (begin=-1, stride=-1)
- multiple slice operations can be combined by concatenating the spec vector, e.g. Slice(x, dim1:dim2, begin1:begin2, end1:end2)
- today's RowSlice(begin, num, x) = Slice(x, 1, begin, begin + num)
- like torch.narrow()
- can implement TF unpack() and Torch split() as a BrainScript loop with multiple Slice() operations
- internally implemented by tensor lib opCopy with manipulated m_strides/m_offset
- Select(x, dim, index) = FlattenDimensions(Slice(x, dim, index, index+1), index > 1 ? index-1 : index, index > 1 ? index : index+1)
- narrow dim to a single index, then drop the dim. Result will have one dim less.
- like torch.select()
- can implement squeezing a dim-1 dim: Select(x, dim, 0)
- Squeeze(x, dim) = Select(x, dim, 0)
- Splicing: --all implemented in C++ by SpliceNode
- Splice(inputs, dim)
- splice multiple inputs inputs[0]:inputs[1]:... along given dim (=RowStack for vectors)
- inputs must have identical dimensions except for:
- the specified dim
- broadcasting dimensions (e.g. used to implement Pad())
- one can splice in time
- e.g. prepend a vector to a time sequence
- this will create a new MBLayout
- like tf.concat()
- Pack(inputs, dim) = ReshapeDimension(Splice(inputs, dim), dim, (0:Length(inputs)) )
- like splice but inserts a new dim of dimension Length(inputs)
- inputs must have identical dimensions for all dims (except for broadcasting)
- dim can be a time dimension; then a new inner-most time dimension will be inserted
- like tf.pack()
- Pad(x, dim, howManyBefore, howManyAfter, with=0) = Splice(Constant(with, tensorShape=1*(dim-1):howManyBefore), x, Constant(with, tensorShape=1*(dim-1):howManyAfter), dim)
- inverse of slice, pad with a constant value
- dimensions specified relative, can pad at start and end
- in time: pad neighbor frames
- Repeat(x, dim, numRepeats) = Splice(x*numRepeats, dim)
- generalizes CNTK RowRepeat(x, numRepeats) = Repeat(x, 1, numRepeats)
- to repeat multiple, specify vectors, e.g. Repeat(x, dim1:dim2, numRepeats1:numRepeats2)
- like tf.tile() and Matlab's repmat()
- Transposition (permuting dims): --implemented in C++ by PermuteDimensionsNode
- PermuteDimensionsOf(x, dim1:dim2:...:dimN)
- dims are rotated to dim2:dim3:...:dimN:dim1; other dims remain untouched
To rotate the other way round, specify them in opposite order.
We specify it this way to be able to reference the time dimension without having to know the rank of the m_sampleLayout.
- time dims must have a constant duration for all items in the minibatch
- internally implemented with tensor lib by shuffling dimensions with their strides --TODO: check if TensorShape optimization is still correct
- Transpose(x, dim1, dim2) = PermuteDimensions(x, dim1:dim2)
- any two dimensions; including time (must have constant duration)
- like torch.transpose()
- Re-indexing: --implemented by ReindexRankNode and SliceNode
- ReindexDimension(x, dim, indexVector)
- splice x[..., indexVector[0], ...], x[..., indexVector[1], ...], etc. with indexVector[.] at given dim
- indexVector must be invertible if it is intended to backpropagate through this node
- DownsampleDimension(x, dim, n, phase=0) = Slice(x, dim, 0, 0, stride=n)
- select every n-th element, starting with index 'phase'
- time dims allowed. Phase is then a modulus w.r.t. where a sequence is inside the minibatch (may require a ReconcileLayout() before to match layouts)
- ReverseDimension(x, dim) = Slice(x, dim, -1, 0, stride=-1)
- reverses the direction of a dim
- when applied to time dims, this creates a new layout (which is also flipped)
- misc.:
- note: much would look more natural if we had OO syntax, e.g. x.Slice(dim, begin, end).FlattenDimensions(...)
Could be done by exposing all methods on ComputationNode... not currently feasible with BrainScript, but e.g. with Python bindings
- torch.unfold (dim, size, step)
- create a convolution matrix (stride magic)
- CyclicallyPermuteRank(x, dim, step)
- rotates indices
- also applies to time dimensions
- duplicate elements
- Gather
- from Torch and TF
- TF also has:
- 'gather': reindexing
- 'dynamic_partition', 'dynamic_stitch'
- Torch:
- expand (dim, range): broadcasts dimension 'dim' as a new dimension with 'range'. Not needed I think.
- repeatTensor: like tile but with weird reshaping
- squeeze: removes all singleton dimensions, or a specific one. We can remove a specific one with Select().
- TODO:
- give names to dimensions?
- do we want to allow time offsets in layouts?
reductions
----------
- ReduceSum
- sum over all elements of a dimension, or over time
- ReduceMax
- max
- ReduceMean
- average
- ArgMax, ArgMin
- we already have that somewhere, for evaluation
- All, Any
- logical test --must be done over sequences
- TF also has:
- reduce_prod, reduce_min
- segment_sum etc.; we use sequences
- listdiff
- where: indices of 'true' values -> 2D tensor of coordinates
- unique (1D only)
- edit_distance
- invert_permutation: invert a permutation index vector
- top_k
convolutions
------------
- convolution
- convolution with filter
- max pool (=convolution with weights 1 and max reduction)
- av pool (=convolution with uniform filter)
- also in time: by specifying more filter dimensions [TODO]
- tricky bit: boundaries; may need expansion or reduction of sequences
element-wise operations
-----------------------
- PlusNode, MinusNode, ElementTimes
- with broadcasting, these implement:
- PlusNode with bias, PlusNode for images
- 1-x
- ScaleNode, RowElementTimes, ColumnElementTimes
- elementwise nonlinearities as usual [TODO: complete them]
- logical ops (can be done by comparison ops actually)
- Clamp
- bounds are passed as 'Const'
- TF: in_top_k
- Torch performs these ops (e.g. add) as vector, without broadcasting
- e.g. max reduces, while cmax does not. Our solution is better... really? How to specify reduce?
gradient operations
-------------------
- TF: are nodes, e.g. clip_by_value
- input should be parameters as well, so they can be computed
- need a node to stop gradient propagation?
- can we use nodes to specify things like AdaGrad and momentum?
debugging
---------
- node that prints activations
- node that prints mean/var of gradients
other
-----
- per-node learning rate: can specify additional parameter for each node? Maybe fold with updateLearnableParameter?
- give dimensions a name?
- can we interleave variable-length ones? Concat into a single dimensions, using strides?
*/
}}}

View file

@ -1367,6 +1367,16 @@ namespace Microsoft { namespace MSR { namespace CNTK {
RequestMatrixFromPool(m_softmaxOfRight, matrixPool);
RequestMatrixFromPool(m_gammaFromLattice, matrixPool);
}
// Release gradient and temp matrices that are no longer needed after all the children's gradients are computed.
virtual void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool)
{
Base::ReleaseMatricesAfterBackprop(matrixPool);
ReleaseMatrixToPool(m_logSoftmaxOfRight, matrixPool);
ReleaseMatrixToPool(m_softmaxOfRight, matrixPool);
ReleaseMatrixToPool(m_gammaFromLattice, matrixPool);
}
// TODO: method names should be CamelCase
std::vector<shared_ptr<const msra::dbn::latticepair>> * getLatticePtr()
{

View file

@ -39,7 +39,7 @@
MATH_API DEVICEID_TYPE EnforceOneGPUOnly(DEVICEID_TYPE requestedDeviceId);
namespace Microsoft { namespace MSR { namespace CNTK {
namespace Microsoft { namespace MSR { namespace CNTK {
// -----------------------------------------------------------------------
// ElementWiseOperator -- This enum represents which function to apply.
@ -48,41 +48,52 @@ namespace Microsoft { namespace MSR { namespace CNTK {
enum ElementWiseOperator
{
// nullary
opConstOne,
// unary (or binary with constant parameter)
opCopy,
opNegate, opNot,
opAbs,
opSigmoid, opSigmoidDerivative, opTanh, opSqrt, opExp, opLog, opLinearRectifierDerivative, opCosine, opNegativeSine,
// these are not implemented yet:
opSaturateBetaAlpha, opSumAlpha, opSubDifferenceToAlpha, opSubDifferenceFromAlpha,
opSigmoid, opTanh, opSqrt, opExp, opLog, opLinearRectifier, opCosine,
// unary ops for use by Matrix class only (there is no TensorView implementation)
opSigmoidDerivative, opLinearRectifierDerivative, opNegativeSine,
// binary
opSum, opDifference, opElementwiseProduct, opElementwiseQuotient,
opLogSum, opMax, opMin,
opEQ, opNE, opGT, opLT, opGE, opLE,
opAnd, opOr, opXor,
opMaskNegative,
opElementwiseProductWithSigmoidDerivativeFromOutput, opElementwiseProductWithTanhDerivativeFromOutput,
opElementwiseProductWithLinearRectifierDerivativeFromOutput, opElementwiseProductWithLogDerivativeFromOutput, opElementwiseProductWithCosDerivative,
// binary ops for indexing
//opIndex,
// ternary
opCond
// Note: not all of the above are actually implement at present; and not all that's implemented has an opcode.
opCond/*a ? b : c*/, opClip/*clip a within interval b..c*/
// Note: not all that's implemented in CNTK ComputationNodes has an opcode yet.
};
// helper to apply a C macro for all operations of each kind
#define ForAllNullaryOps(Macro) \
Macro(ConstOne);
#define ForAllUnaryOps(Macro) \
Macro(Copy); \
Macro(Negate); Macro(Not); \
Macro(Abs); \
Macro(Sigmoid); Macro(SigmoidDerivative); Macro(Tanh); Macro(Sqrt); Macro(Exp); Macro(Log); Macro(LinearRectifierDerivative); Macro(Cosine); Macro(NegativeSine);
#define ForAllParameterizedUnaryOps(Macro) \
Macro(SaturateBetaAlpha); Macro(SumAlpha); Macro(SubDifferenceToAlpha); Macro(SubDifferenceFromAlpha);
Macro(Sigmoid); Macro(Tanh); Macro(Sqrt); Macro(Exp); Macro(Log); Macro(LinearRectifier); Macro(Cosine);
#define ForAllBinaryOps(Macro) \
Macro(Sum); Macro(Difference); Macro(ElementwiseProduct); Macro(ElementwiseQuotient); \
Macro(LogSum); Macro(Max); Macro(Min); \
Macro(EQ); Macro(NE); Macro(GT); Macro(LT); Macro(GE); Macro(LE); \
Macro(MaskNegative);
Macro(And); Macro(Or); Macro(Xor);\
Macro(MaskNegative); \
Macro(ElementwiseProductWithSigmoidDerivativeFromOutput); Macro(ElementwiseProductWithTanhDerivativeFromOutput); \
Macro(ElementwiseProductWithLinearRectifierDerivativeFromOutput); Macro(ElementwiseProductWithLogDerivativeFromOutput); Macro(ElementwiseProductWithCosDerivative); \
//Macro(Index);
#define ForAllTernaryOps(Macro) \
Macro(Cond);
Macro(Cond); Macro(Clip);
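[Editor's note] For readers unfamiliar with this pattern, here is a hypothetical standalone sketch (demo names, not CNTK code) of how such a ForAll*Ops helper is consumed: the caller defines a one-argument macro and the helper stamps it out once per opcode, e.g. to build an opcode-to-name lookup without repeating the operation list:
#include <cstdio>
#define ForAllDemoBinaryOps(Macro) \
    Macro(Sum); Macro(Difference); Macro(ElementwiseProduct);
enum DemoOp { opSum, opDifference, opElementwiseProduct };
static const char * DemoOpName(DemoOp op)
{
#define MapOpToName(oper) if (op == op##oper) return #oper    /* one test per operation */
    ForAllDemoBinaryOps(MapOpToName);
#undef MapOpToName
    return "(unknown)";
}
int main() { std::printf("%s\n", DemoOpName(opDifference)); return 0; }   // prints "Difference"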
// -----------------------------------------------------------------------
// various enums to describe

View file

@ -51,6 +51,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// GPU and 1-dimensional image
bool gpuSparse1D = (inT.h() == 1 &&
in.GetCurrentMatrixLocation() == CurrentDataLocation::GPU &&
convDesc.wStride() == 1 &&
!convDesc.padding() &&
in.GetMatrixType() == MatrixType::SPARSE);
out.SwitchToMatrixType(MatrixType::DENSE, MatrixFormat::matrixFormatDense, false);
@ -67,8 +69,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t startSampleId = i * subBatchSize;
size_t endSampleId = min(batchSize, startSampleId + subBatchSize);
size_t smallBatchSize = endSampleId - startSampleId;
workspace.Resize(packedInputRows, packedInputColsPerSample * smallBatchSize);
Mat inputSubBatch;
// We optimize for three different scenarios here by handling them slightly differently.
@ -78,10 +78,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (in.GetMatrixType() == MatrixType::DENSE)
inputSubBatch = in.ColumnSlice(startSampleId, smallBatchSize);
else
{
inputSubBatch.SetValue(in.ColumnSlice(startSampleId, smallBatchSize), in.GetFormat());
inputSubBatch.SwitchToMatrixType(MatrixType::DENSE, MatrixFormat::matrixFormatDense, true);
}
if (gpuSparse1D)
{
@ -94,6 +91,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
else
{
inputSubBatch.SwitchToMatrixType(MatrixType::DENSE, MatrixFormat::matrixFormatDense, true);
workspace.AssignPackedConvolutionInput(inputSubBatch,
inT.w(), inT.h(), inT.c(),
outT.w(), outT.h(), outT.c(),
@ -101,6 +99,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
convDesc.padding());
Mat outputSubBatch = out.ColumnSlice(outputSizePerChannel * startSampleId, outputSizePerChannel * smallBatchSize);
workspace.Resize(packedInputRows, packedInputColsPerSample * smallBatchSize);
Mat::Multiply(filter, false, workspace, false, outputSubBatch);
}
}
@ -197,6 +197,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// GPU and 1-dimensional image
bool gpuSparse1D = (inT.h() == 1 &&
in.GetCurrentMatrixLocation() == CurrentDataLocation::GPU &&
convDesc.wStride() == 1 &&
!convDesc.padding() &&
in.GetMatrixType() == MatrixType::SPARSE);
if (numSubBatches == 1 && allowReuse && !gpuSparse1D) //reuse packed input from evaluation step if it's not changed by either subbatch or recurrent steps.
@ -209,18 +211,40 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t startSampleID = i * subBatchSize;
size_t endSampleID = min(batchSize, startSampleID + subBatchSize);
size_t smallBatchSize = endSampleID - startSampleID;
workspace.Resize(packedInputRows, packedInputColsPerSample * smallBatchSize);
Matrix<ElemType> inputSubBatch = in.ColumnSlice(startSampleID, smallBatchSize);
inputSubBatch.SwitchToMatrixType(MatrixType::DENSE, inputSubBatch.GetFormat(), true);
workspace.AssignPackedConvolutionInput(inputSubBatch,
inT.w(), inT.h(), inT.c(),
srcGradT.w(), srcGradT.h(), srcGradT.c(),
filterT.w(), filterT.h(), convDesc.wStride(), convDesc.hStride(),
convDesc.padding());
Matrix<ElemType> outputGradientSubBatch = srcGradTmp.ColumnSlice(startSampleID * outputSizePerChannel, smallBatchSize * outputSizePerChannel);
Matrix<ElemType>::MultiplyAndAdd(outputGradientSubBatch, false, workspace, true, filter);
// We optimize for three different scenarios here by handling them slightly differently.
// [Scenario 1] Dense: Unroll using AssignPackedConvolutionInput and multiply.
// [Scenario 2] Sparse 1-D convolution on GPU: for text scenarios we have a specific kernel.
// [Scenario 3] Sparse all others: convert to dense. Temporary work-around - allocating/de-allocating memory is costly!
if (gpuSparse1D)
{
Matrix<ElemType> inputSubBatch;
inputSubBatch.SetValue(in.ColumnSlice(startSampleID, smallBatchSize));
inputSubBatch.Reshape(inT.c(), smallBatchSize * inT.w());
Matrix<ElemType> inputSubBatchSparseReordered(inputSubBatch.GetNumCols(), inputSubBatch.GetNumRows(), inputSubBatch.GetDeviceId(), MatrixType::SPARSE, MatrixFormat::matrixFormatSparseCSC);
Matrix<ElemType>::TensorShuffleScaleAndAdd(0.0f, inputSubBatch.Transpose(), 1, inT.w(), 1, smallBatchSize, inT.c(), 1.0f, inputSubBatchSparseReordered, inputSubBatchSparseReordered);
Matrix<ElemType> outputGradientSubBatchReordered = Matrix<ElemType>::Zeros(smallBatchSize * srcGradT.w(), srcGradT.c(), outputGradientSubBatch.GetDeviceId());
Matrix<ElemType>::TensorShuffleScaleAndAdd(0.0f, outputGradientSubBatch.Transpose(), 1, srcGradT.w(), 1, smallBatchSize, srcGradT.c(), 1.0f, outputGradientSubBatchReordered, outputGradientSubBatchReordered);
filter.Reshape(srcGradT.c() * filterT.w(), inT.c());
Matrix<ElemType>::ConvolveAndWeightedAdd(1, outputGradientSubBatchReordered, true, inputSubBatchSparseReordered, false, 1, filter, smallBatchSize, convDesc.wStride(), convDesc.padding(), false);
filter.Reshape(srcGradT.c(), inT.c() * filterT.w());
}
else
{
workspace.Resize(packedInputRows, packedInputColsPerSample * smallBatchSize);
Matrix<ElemType> inputSubBatch = in.ColumnSlice(startSampleID, smallBatchSize);
inputSubBatch.SwitchToMatrixType(MatrixType::DENSE, inputSubBatch.GetFormat(), true);
workspace.AssignPackedConvolutionInput(inputSubBatch,
inT.w(), inT.h(), inT.c(),
srcGradT.w(), srcGradT.h(), srcGradT.c(),
filterT.w(), filterT.h(), convDesc.wStride(), convDesc.hStride(),
convDesc.padding());
Matrix<ElemType>::MultiplyAndAdd(outputGradientSubBatch, false, workspace, true, filter);
}
}
}
@ -239,7 +263,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
assert(outT.w() * outT.h() * outT.c() == out.GetNumRows());
assert(outT.n() == out.GetNumCols());
Mat o = out.ColumnSlice(0, out.GetNumCols());
Mat o = out.ColumnSlice(0, out.GetNumCols()); // same as .AsReference()
Mat d = dst.Reshaped(biasT.c(), outT.w() * outT.h() * outT.n());
d.AssignSumOf(o.Reshaped(biasT.c(), outT.w() * outT.h() * outT.n()), bias);
}
@ -410,23 +434,30 @@ namespace Microsoft { namespace MSR { namespace CNTK {
};
template<class ElemType>
std::unique_ptr<ConvolutionEngineFactory<ElemType>> ConvolutionEngineFactory<ElemType>::Create(DEVICEID_TYPE deviceId, EngineType engType)
std::unique_ptr<ConvolutionEngineFactory<ElemType>> ConvolutionEngineFactory<ElemType>::Create(DEVICEID_TYPE deviceId, EngineType engType, ImageLayoutKind imageLayoutKind)
{
if (engType == EngineType::Auto)
{
// REVIEW alexeyk: make cuDNN default when running on GPU and compiled with cuDNN, add config parameter to enable runtime switch between implementations.
if (deviceId >= 0 && CuDnnConvolutionEngineFactory<ElemType>::IsSupported(deviceId))
return std::make_unique<CuDnnConvolutionEngineFactory<ElemType>>();
return std::make_unique<DefaultConvolutionEngineFactory<ElemType>>();
if (deviceId >= 0 && CuDnnConvolutionEngineFactory<ElemType>::IsSupported(deviceId) && imageLayoutKind == ImageLayoutKind::CHW)
return Create(deviceId, EngineType::CuDnn, imageLayoutKind);
else
return Create(deviceId, EngineType::Legacy, imageLayoutKind);
}
else if (engType == EngineType::CuDnn)
{
if (imageLayoutKind != ImageLayoutKind::CHW)
InvalidArgument("ConvolutionEngineFactory: ImageLayout '%s' is not compatible with the cuDNN engine.", ToString(imageLayoutKind).c_str());
if (deviceId >= 0 && CuDnnConvolutionEngineFactory<ElemType>::IsSupported(deviceId))
return std::make_unique<CuDnnConvolutionEngineFactory<ElemType>>();
RuntimeError("cuDNN convolution engine is not supported, check the device id and whether the code was compiled with cuDNN.");
}
else if (engType == EngineType::Legacy)
{
if (imageLayoutKind != ImageLayoutKind::HWC)
InvalidArgument("ConvolutionEngineFactory: ImageLayout '%s' is not compatible with the legacy convolution engine.", ToString(imageLayoutKind).c_str());
return std::make_unique<DefaultConvolutionEngineFactory<ElemType>>();
}
RuntimeError("Not supported convolution engine type: %d.", engType);
}
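[Editor's note] The selection policy implemented by Create() boils down to a small decision rule; the following standalone sketch (demo enums and function names, not the CNTK classes) restates it: Auto resolves to cuDNN only on a supported GPU with the CHW layout, otherwise it falls back to the legacy HWC engine, and an explicit engine choice is validated against the layout:
#include <cstdio>
#include <stdexcept>
enum class EngineType { Auto, CuDnn, Legacy };
enum class ImageLayoutKind { CHW, HWC };
static EngineType ResolveEngine(int deviceId, bool cudnnSupported, EngineType engType, ImageLayoutKind layout)
{
    if (engType == EngineType::Auto)   // prefer cuDNN only on a capable GPU with CHW layout
        return (deviceId >= 0 && cudnnSupported && layout == ImageLayoutKind::CHW) ? EngineType::CuDnn : EngineType::Legacy;
    if (engType == EngineType::CuDnn && layout != ImageLayoutKind::CHW)
        throw std::invalid_argument("cuDNN engine requires the CHW layout");
    if (engType == EngineType::Legacy && layout != ImageLayoutKind::HWC)
        throw std::invalid_argument("legacy engine requires the HWC layout");
    return engType;
}
int main()
{
    EngineType e = ResolveEngine(/*deviceId=*/0, /*cudnnSupported=*/false, EngineType::Auto, ImageLayoutKind::CHW);
    std::printf("%s\n", e == EngineType::CuDnn ? "CuDnn" : "Legacy");   // prints "Legacy"
    return 0;
}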

View file

@ -18,6 +18,7 @@
#endif
#include "Matrix.h"
#include "TensorShape.h" // for ImageLayoutKind
namespace Microsoft { namespace MSR { namespace CNTK {
@ -252,7 +253,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
virtual PoolEnginePtr CreatePoolEngine(DEVICEID_TYPE deviceId) = 0;
enum class EngineType { Auto, CuDnn, Legacy };
static std::unique_ptr<ConvolutionEngineFactory<ElemType>> Create(DEVICEID_TYPE deviceId, EngineType engType = EngineType::Auto);
static std::unique_ptr<ConvolutionEngineFactory<ElemType>> Create(DEVICEID_TYPE deviceId, EngineType engType, ImageLayoutKind imageLayoutKind);
public:
ConvolutionEngineFactory(const ConvolutionEngineFactory&) = delete;

View file

@ -10,11 +10,7 @@
#ifdef USE_CUDNN
#include <cudnn.h>
template<> const char* CudaErrString(cudnnStatus_t x)
{
return cudnnGetErrorString(x);
}
#define CUDNN_CALL(expr) (CudaCall((expr), #expr, "cuDNN", CUDNN_STATUS_SUCCESS))
template<> const char* CudaErrString<cudnnStatus_t>(cudnnStatus_t x) { return cudnnGetErrorString(x); }
// A note on the formats: CNTK originally used NHWC for input/output tensors and CHWN for filters.
// Such formats have very limited support in cuDNN and not used in other frameworks.

View file

@ -5,25 +5,27 @@
//
#include "stdafx.h"
#include "Basics.h"
#include "BestGpu.h"
#include "DebugUtil.h"
#ifndef CPUONLY
#include "cublas_v2.h"
#include "Basics.h"
#include "GPUMatrix.h"
#include "GPUMatrixCUDAKernels.cuh"
#include "GPUSparseMatrix.h"
#include "GPUTensor.h"
#include "CommonMatrix.h"
#define TENSOR_OPS_DECL __device__ __host__
#include "TensorOps.h"
#include "device_launch_parameters.h"
#include <assert.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <curand.h>
#include <curand_kernel.h>
#include "cublas_v2.h"
#include <assert.h>
#include <memory>
#pragma comment (lib, "cudart.lib") // instruct linker to reference these libs
#pragma comment (lib, "cublas.lib")
@ -47,8 +49,6 @@ bool do_sync = true;
#ifdef _WIN32
// thread local storage to access the current stream, initialized to the default stream
__declspec (thread)
#else
static
#endif
cudaStream_t t_stream = cudaStreamDefault;
@ -78,9 +78,9 @@ cudaStream_t MATH_API GetStream()
performElementWiseFunction(ElementWiseOperator::op##f, a.m_pArray); \
return *this; }
static const char * CudaErrString(cudaError_t x) { cudaDeviceSynchronize(); return cudaGetErrorString(x); }
static const char * CudaErrString(cublasStatus_t) { cudaDeviceSynchronize(); return "(see cublas_api.h & look for cublasStatus_t or CUBLAS_STATUS_xxx)"; }
static const char * CudaErrString(curandStatus) { cudaDeviceSynchronize(); return "(see curand.h & look for curandStatus or CURAND_STATUS_xxx)"; }
template<> const char * CudaErrString<cudaError_t>(cudaError_t x) { cudaDeviceSynchronize(); return cudaGetErrorString(x); }
template<> const char * CudaErrString<cublasStatus_t>(cublasStatus_t) { cudaDeviceSynchronize(); return "(see cublas_api.h & look for cublasStatus_t or CUBLAS_STATUS_xxx)"; }
template<> const char * CudaErrString<curandStatus>(curandStatus) { cudaDeviceSynchronize(); return "(see curand.h & look for curandStatus or CURAND_STATUS_xxx)"; }
namespace Microsoft { namespace MSR { namespace CNTK {
@ -384,7 +384,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
#pragma region Constructors and Destructor
//should only be used by constructors.
// should only be used by constructors
template<class ElemType>
void GPUMatrix<ElemType>::ZeroInit(int deviceId)
{
@ -449,13 +449,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_numRows = moveFrom.m_numRows;
m_numCols = moveFrom.m_numCols;
m_computeDevice = moveFrom.m_computeDevice;
m_pArray = moveFrom.m_pArray; //shallow copy the pointer
m_pArray = moveFrom.m_pArray; // shallow copy the pointer
m_matrixName=moveFrom.m_matrixName;
m_elemSizeAllocated = moveFrom.m_elemSizeAllocated;
m_format = moveFrom.m_format;
m_externalBuffer = moveFrom.m_externalBuffer;
//release the pointer from the source object so that the destructor won't release it twice
// release the pointer from the source object so that the destructor won't release it twice
moveFrom.ZeroInit(0);
}
@ -477,10 +477,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
if (this != &moveFrom)
{
if (OwnBuffer() && m_pArray!=NULL)
{
if (OwnBuffer() && m_pArray)
CUDA_CALL(cudaFree(m_pArray));
}
m_numRows = moveFrom.m_numRows;
m_numCols = moveFrom.m_numCols;
@ -500,8 +498,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
GPUMatrix<ElemType>::~GPUMatrix(void)
{
Clear();
if (m_workspace != nullptr)
delete m_workspace;
delete m_workspace;
}
template<class ElemType>
@ -3259,6 +3256,16 @@ namespace Microsoft { namespace MSR { namespace CNTK {
#pragma endregion Other helper functions
#pragma region Static BLAS Functions
// float/double overloads of cublasSgemm()/cublasDgemm()
static cublasStatus_t cublas_gemm(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const float *alpha, const float *A, int lda, const float *B, int ldb, const float *beta, float *C, int ldc)
{
return cublasSgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
}
static cublasStatus_t cublas_gemm(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const double *alpha, const double *A, int lda, const double *B, int ldb, const double *beta, double *C, int ldc)
{
return cublasDgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
}
template<class ElemType>
void GPUMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const GPUMatrix<ElemType>& a, const bool transposeA, const GPUMatrix<ElemType>& b, const bool transposeB,
ElemType beta, GPUMatrix<ElemType>& c)
@ -3278,28 +3285,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (beta == 0)
c.Resize(m,n);
else
c.VerifySize(m, n); // Can't resize if beta != 0
c.VerifySize(m, n); // Can't resize if beta != 0
if (!(m>0 && k>0 && l>0 && n>0))
{
RuntimeError("!(m>0 && k>0 && l>0 && n>0)"); //converting from size_t to int may cause overflow
}
if (k!=l)
{
RuntimeError("matrix dim mismatch in MultiplyAndWeightedAdd");
}
if (sizeof(ElemType)==sizeof(float))
{
CUBLAS_CALL(cublasSgemm(cuHandle,transA,transB,m,n,k,reinterpret_cast<float*>(&alpha),reinterpret_cast<float*>(a.m_pArray),(int)a.m_numRows,reinterpret_cast<float*>(b.m_pArray),(int)b.m_numRows,reinterpret_cast<float*>(&beta),reinterpret_cast<float*>(c.m_pArray),(int)c.m_numRows));
}
else if (sizeof(ElemType)==sizeof(double))
{
CUBLAS_CALL(cublasDgemm(cuHandle,transA,transB,m,n,k,reinterpret_cast<double*>(&alpha),reinterpret_cast<double*>(a.m_pArray),(int)a.m_numRows,reinterpret_cast<double*>(b.m_pArray),(int)b.m_numRows,reinterpret_cast<double*>(&beta),reinterpret_cast<double*>(c.m_pArray),(int)c.m_numRows));
}
else
{
RuntimeError("Unsupported template argument in GPUMatrix");
}
CUBLAS_CALL(cublas_gemm(cuHandle, transA, transB, m, n, k, &alpha, a.m_pArray, (int)a.m_numRows, b.m_pArray, (int)b.m_numRows, &beta, c.m_pArray, (int)c.m_numRows));
c.m_numRows=m;
c.m_numCols=n;
}
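[Editor's note] The cublas_gemm overloads above replace the old sizeof(ElemType) branches with compile-time overload resolution. A standalone sketch of the same pattern with toy functions (plain C++, not CUDA and not CNTK code):
#include <cstdio>
static void demo_axpy(float a, const float * x, float * y, int n)    { for (int i = 0; i < n; i++) y[i] += a * x[i]; }
static void demo_axpy(double a, const double * x, double * y, int n) { for (int i = 0; i < n; i++) y[i] += a * x[i]; }
template<class ElemType>
void ScaledAdd(ElemType a, const ElemType * x, ElemType * y, int n)
{
    demo_axpy(a, x, y, n);   // overload resolution picks the right precision at compile time
}
int main()
{
    float x[2] = { 1, 2 }, y[2] = { 10, 20 };
    ScaledAdd(2.0f, x, y, 2);
    std::printf("%g %g\n", y[0], y[1]);   // prints "12 24"
    return 0;
}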
@ -4436,396 +4428,29 @@ namespace Microsoft { namespace MSR { namespace CNTK {
CUDA_CALL(cudaFree(d_zeta));
};
// =======================================================================
// TensorView support
// =======================================================================
// BUGBUG: This is a stub that currently is just the CPU code. This is not functional yet.
// To save time, this makes extensive use of templates and macros.
// -----------------------------------------------------------------------
// simple fixed-size arrays for passing dimension information by value
// since CUDA can't just take our std::array and std::vector
// -----------------------------------------------------------------------
template<typename T, size_t N>
struct FixedArray
{
T m_data[N];
__device__ __host__ size_t size() const { return N; }
__device__ __host__ T & operator[](size_t n) { return m_data[n]; }
__device__ __host__ T operator[](size_t n) const { return m_data[n]; }
template<class VEC> FixedArray(const VEC & data) // construct from CPU-side STL array or vector
{
assert(data.size() == N);
for (size_t n = 0; n < N; n++)
{
m_data[n] = (T)data[n];
if (m_data[n] != data[n]) // overflow check
InvalidArgument("FixedArray: Dimensions out of range, too few bits.");
}
}
};
template<typename T> // specialized version for 0 elements
struct FixedArray<T, 0>
{
__device__ __host__ size_t size() const { return 0; }
template<class VEC> FixedArray(const VEC & data) { assert(data.size() == 0); UNUSED(data); }
};
template<typename T, size_t N, size_t K> // N = which input/output; K = index depth
struct FixedMatrix
{
T m_data[N][K];
__device__ __host__ size_t getNumRows() const { return N; }
__device__ __host__ size_t getNumCols() const { return K; }
__device__ __host__ T & operator()(size_t n, size_t k) { return m_data[n][k]; }
__device__ __host__ T operator()(size_t n, size_t k) const { return m_data[n][k]; }
template<typename U> FixedMatrix(const array<SmallVector<U>, N> & data) // construct from CPU-side array of vectors
{
assert(data.size() == N);
for (size_t n = 0; n < N; n++)
{
assert(data[n].size() == K);
for (size_t k = 0; k < K; k++)
{
m_data[n][k] = (T)data[n][k];
if (m_data[n][k] != data[n][k]) // overflow check
InvalidArgument("FixedArray: Dimensions out of range, too few bits.");
}
}
}
};
template<typename T, size_t N> // specialized version for 0 elements
struct FixedMatrix<T, N, 0>
{
__device__ __host__ size_t getNumRows() const { return N; }
__device__ __host__ size_t getNumCols() const { return 0; }
template<typename U> FixedMatrix(const array<SmallVector<U>, N> & data) { assert(data.size() == N); for (size_t n = 0; n < N; n++) assert(data[n].size() == 0); UNUSED(data); }
};
// -----------------------------------------------------------------------
// function to actually compute a function of (N-1) inputs based on the opcode
// TensorView entry points from Matrix.cpp
// -----------------------------------------------------------------------
// helper to provide a vector of ones of at least the given number of elements
// TODO: Use this to implement ComputationNode::ConstOnes? Or do we even need that anymore?
template<class ElemType>
struct TensorOps
static shared_ptr<GPUMatrix<ElemType>> GetOnesVector(size_t N, DEVICEID_TYPE deviceId)
{
static __device__ ElemType Compute(const FixedArray<ElemType*, 2> & pointers, ElementWiseOperator op)
// using an array of shared_ptrs because those are thread-safe. The objects themselves are immutable.
// And using a plain array so this will never get freed, avoiding free-after-DLL-unload issues.
static shared_ptr<GPUMatrix<ElemType>> onesCache[32]; // cache of objects
if (deviceId >= _countof(onesCache))
LogicError("GetOnesVector: onesCache[] too small (%d entries), increase (you need %d) and recompile.", (int)_countof(onesCache), (int)deviceId+1);
auto p = onesCache[deviceId];
if (!p || p->GetNumRows() < N) // must (re-)allocate
{
ElemType a = *(pointers[0]);
#define CaseUnaryTensorOp(oper) case ElementWiseOperator::op ## oper: return Op ## oper(a)
switch (op)
{
ForAllUnaryOps(CaseUnaryTensorOp);
default: return 0; // (failure)
}
p = make_shared<GPUMatrix<ElemType>>(GPUMatrix<ElemType>::Ones(N, 1, deviceId));
onesCache[deviceId] = p; // this will replace the pointer thread-safely (although weird race conditions may happen where a larger entry is overwritten by a smaller one; will still run correctly)
}
static __device__ ElemType Compute(const FixedArray<ElemType*, 3> & pointers, ElementWiseOperator op)
{
ElemType a = *(pointers[0]);
ElemType b = *(pointers[1]);
#define CaseBinaryTensorOp(oper) case ElementWiseOperator::op ## oper: return Op ## oper(a,b)
switch (op)
{
ForAllBinaryOps(CaseBinaryTensorOp); // note: this costs about 6% compared to having only a single case
default: return 0; // (failure)
}
}
static __device__ ElemType Compute(const FixedArray<ElemType*, 4> & pointers, ElementWiseOperator op)
{
ElemType a = *(pointers[0]);
ElemType b = *(pointers[1]);
ElemType c = *(pointers[2]);
#define CaseTernaryTensorOp(oper) case ElementWiseOperator::op ## oper: return Op ## oper(a,b,c)
switch (op)
{
ForAllTernaryOps(CaseTernaryTensorOp);
default: return 0; // (failure)
}
}
};
// -----------------------------------------------------------------------
// function to compute the value for a given output location (perform reduction if needed)
// -----------------------------------------------------------------------
#define C_size_t CUDA_LONG
#define C_int CUDA_LONG
#define C_unsigned_int CUDA_LONG
template<class ElemType, C_size_t N, C_int M, C_int m>
struct TensorOpReduce
{
// this version for m >= 0
static __device__ ElemType Compute(FixedArray<ElemType*, N> pointers, ElementWiseOperator op,
const FixedArray<C_unsigned_int, M> & reducingOpDims, const FixedMatrix<C_int, N, M> & reducingStrides)
{
// start with index 0
// Using 'double' since we are memory-bound anyway.
double/*ElemType*/ aggregate = TensorOpReduce<ElemType, N, M, m - 1>::Compute(pointers, op, reducingOpDims, reducingStrides);
// apply this index to the pointers
C_size_t dim = reducingOpDims[m];
for (C_size_t k = 1/*done with k=0 already*/; k < dim; k++)
{
// bump the pointers
for (C_size_t i = 0; i < N; i++)
pointers[i] += reducingStrides(i,(C_size_t)m);
ElemType val = TensorOpReduce<ElemType, N, M, m - 1>::Compute(pointers, op, reducingOpDims, reducingStrides);
aggregate += val;
}
return (ElemType)aggregate;
}
};
// this one terminates the template recursion over reduction dimensions
// The pointers are pointing to the input element.
template<class ElemType, C_size_t N, C_int M>
struct TensorOpReduce<ElemType, N, M, /*m=*/-1>
{
// this version for m = -1
// the pointers are pointing to the right location(s) to take the operation over
static __device__ ElemType Compute(FixedArray<ElemType*, N> pointers, ElementWiseOperator op,
const FixedArray<C_unsigned_int, M> & /*reducingOpDims*/, const FixedMatrix<C_int, N, M> & /*reducingStrides*/)
{
return TensorOps<ElemType>::Compute(pointers, op); // finally computing something!
}
};
// -----------------------------------------------------------------------
// perform loop over regular index k for N-nary operations (N counting the output)
// -----------------------------------------------------------------------
// The canonical case, vector op without reduction, is this PTX function:
// _ZN9Microsoft3MSR4CNTK15_launchTensorOpIfLi3ELi0ELi1EEEvT_NS1_10FixedArrayIPS3_XT0_EEES3_NS1_19ElementWiseOperatorENS4_IiXT2_EEENS1_11FixedMatrixIiXT0_EXT2_EEENS4_IiXT1_EEENS9_IiXT0_EXT1_EEEi
// float ^ ^ aggregate loop
// args? ^ ^ input dims
// _ZN9Microsoft3MSR4CNTK15_launchTensorOpIfLi2ELi0ELi1EEEvT_NS1_10FixedArrayIPS3_XT0_EEES3_NS1_19ElementWiseOperatorENS4_IiXT2_EEENS1_11FixedMatrixIiXT0_EXT2_EEENS4_IiXT1_EEENS9_IiXT0_EXT1_EEEi
// increment a pointer by a number of elements
// This will later change into pre-scaled strides.
template<class ElemType>
static __device__ void IncPtr(ElemType * &p, C_int index, C_int stride)
{
//p = (ElemType*)(byteOffset + (char *)p);
p = p + index * stride;
}
// The 'pointers' only refer to a single element, so we will bump them in-place to perform indexing.
template<class ElemType, C_size_t N, C_int M, C_int K, C_int k>
struct TensorOpElement
{
// template-recursive version loops over indices
static __device__ void Compute(CUDA_LONG id, ElemType beta, FixedArray<ElemType*, N> & pointers, ElemType alpha, ElementWiseOperator op,
const FixedArray<C_unsigned_int, K> & regularOpStrides, const FixedMatrix<C_int, N, K> & regularStrides,
const FixedArray<C_unsigned_int, M> & reducingOpDims, const FixedMatrix<C_int, N, M> & reducingStrides)
{
// map id (location on grid) to index[k]
C_size_t stride = regularOpStrides[(C_size_t)k];
C_size_t index = id / stride; // this dimension
id = id % stride; // remaining dimensions inside this
// apply this index to the pointers
for (C_size_t i = 0; i < N; i++)
pointers[i] += index * regularStrides(i,(C_size_t)k); // now this dimension is taken care of
// process the previous index
TensorOpElement<ElemType, N, M, K, k - 1>::Compute(id, beta, pointers, alpha, op, regularOpStrides, regularStrides, reducingOpDims, reducingStrides);
}
};
// specialization for k=0 where stride is guaranteed to be 1
template<class ElemType, C_size_t N, C_int M, C_int K>
struct TensorOpElement<ElemType, N, M, K, /*k=*/0>
{
// template-recursive version loops over indices
static __device__ void Compute(CUDA_LONG id, ElemType beta, FixedArray<ElemType*, N> & pointers, ElemType alpha, ElementWiseOperator op,
const FixedArray<C_unsigned_int, K> & regularOpStrides, const FixedMatrix<C_int, N, K> & regularStrides,
const FixedArray<C_unsigned_int, M> & reducingOpDims, const FixedMatrix<C_int, N, M> & reducingStrides)
{
// map id (location on grid) to index[k]
C_size_t index = id; // this dimension
// apply this index to the pointers
for (C_size_t i = 0; i < N; i++)
pointers[i] += index * regularStrides(i,0); // now this dimension is taken care of
// process the previous index
TensorOpElement<ElemType, N, M, K, -1>::Compute(/*id*/0, beta, pointers, alpha, op, regularOpStrides, regularStrides, reducingOpDims, reducingStrides);
}
};
// specialization for k = -1 terminates the template recursion
template<class ElemType, C_size_t N, C_int M, C_int K>
struct TensorOpElement<ElemType, N, M, K, /*k=*/-1>
{
// template-recursion-terminating version computes the actual value for this output location
// now the pointers point to the right element
static __device__ void Compute(CUDA_LONG /*id*/, ElemType beta, FixedArray<ElemType*, N> & pointers, ElemType alpha, ElementWiseOperator op,
const FixedArray<C_unsigned_int, K> & /*regularOpStrides*/, const FixedMatrix<C_int, N, K> & /*regularStrides*/,
const FixedArray<C_unsigned_int, M> & reducingOpDims, const FixedMatrix<C_int, N, M> & reducingStrides)
{
// compute the operation for this output coordinate
// This may still involve a reduction over inverse-broadcasting dimensions.
ElemType val = TensorOpReduce<ElemType, N, M, M - 1>::Compute(pointers, op, reducingOpDims, reducingStrides);
// scale
val *= alpha;
// combine with previous value in target matrix, then write it out
auto * pout = pointers[N - 1];
if (beta != 0)
val += beta * *pout;
// save
*pout = val;
}
};
// -----------------------------------------------------------------------
// kernel and launch
// -----------------------------------------------------------------------
// the top-level kernel
template<class ElemType, C_size_t N, C_int M, C_int K>
__global__ void _launchTensorOp(ElemType beta, FixedArray<ElemType*, N> pointers, ElemType alpha, ElementWiseOperator op,
FixedArray<C_unsigned_int, K> regularOpStrides, FixedMatrix<C_int, N, K> regularStrides,
FixedArray<C_unsigned_int, M> reducingOpDims, FixedMatrix<C_int, N, M> reducingStrides, CUDA_LONG numElements)
{
CUDA_LONG id = GridDim::GetLinearThreadId();
if (id >= numElements)
return;
TensorOpElement<ElemType, N, M, K, K - 1>::Compute(id, beta, pointers, alpha, op, regularOpStrides, regularStrides, reducingOpDims, reducingStrides);
}
// launch tensor op with CUDA
// All dimensions (N-ariness, number of input dimensions K and number of reduction dimensions M) are bound to template parameters now.
template<class ElemType, C_size_t N, C_int M, C_int K>
static void LaunchTensorOp(ElemType beta, array<ElemType*, N> pointerVector, ElemType alpha, ElementWiseOperator op,
const SmallVector<size_t> & regularOpDims, const array<SmallVector<ptrdiff_t>, N> & regularStrideVectors,
const SmallVector<size_t> & reducingOpDimVector, const array<SmallVector<ptrdiff_t>, N> & reducingStrideVectors)
{
// copy all parameters to CUDA-compatible data structures
FixedArray<ElemType*, N> pointers(pointerVector);
SmallVector<C_size_t> regularOpStrideVector; // kernel needs the strides for converting thread index back to multi-dimensional tensor index
C_size_t numElements = 1;
for (C_size_t k = 0; k < regularOpDims.size(); k++)
{
regularOpStrideVector.push_back(numElements);
numElements *= (C_size_t)regularOpDims[k];
}
FixedArray<C_unsigned_int, K> regularOpStrides(regularOpStrideVector);
FixedMatrix<C_int, N, K> regularStrides(regularStrideVectors);
FixedArray<C_unsigned_int, M> reducingOpDims(reducingOpDimVector);
FixedMatrix<C_int, N, M> reducingStrides(reducingStrideVectors);
CUDA_LONG NN = (CUDA_LONG)numElements;
cudaEvent_t done = nullptr;
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
GridDim grid(NN);
_launchTensorOp<ElemType, N, M, K> << <grid.m_blocksPerGrid, grid.m_threadsPerBlock, 0, t_stream >> >(beta, pointers, alpha, op, regularOpStrides, regularStrides, reducingOpDims, reducingStrides, NN);
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
}
// for linear unary ops, we need to define a functor for every function for use as a template parameter (lambda syntax doesn't work in CUDA 7)
#define DefineUnaryTensorFunctor(oper) \
struct Functor ## oper { template<class ElemType> static __device__ ElemType f(ElemType a) { return Op ## oper(a); } };
ForAllUnaryOps(DefineUnaryTensorFunctor);
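// For illustration (not part of the change): for a unary op named Sqrt -- assuming such an entry
// exists in the ForAllUnaryOps list -- DefineUnaryTensorFunctor(Sqrt) expands to
//   struct FunctorSqrt { template<class ElemType> static __device__ ElemType f(ElemType a) { return OpSqrt(a); } };
// which can then be passed as the FN template argument of the kernels below, so the per-element
// function is inlined at compile time instead of dispatched through a run-time switch.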
// the top-level kernel for linear unary ops
// Note: If we have a beta, we have 2 memory accesses, so this optimization may no longer be needed as we are memory-bound.
template<class ElemType, class FN>
__global__ void _launchUnaryTensorOp(ElemType beta, const ElemType * pa, ElemType * pb, ElemType alpha, CUDA_LONG numElements)
{
CUDA_LONG id = GridDim::GetLinearThreadId();
if (id >= numElements)
return;
ElemType a = pa[id];
ElemType val = FN::f(a);
val *= alpha;
if (beta != 0)
val += beta * pb[id];
pb[id] = val;
}
// version without beta and alpha
template<class ElemType, class FN>
__global__ void _launchUnaryTensorOp(const ElemType * pa, ElemType * pb, CUDA_LONG numElements)
{
CUDA_LONG id = GridDim::GetLinearThreadId();
if (id >= numElements)
return;
ElemType a = pa[id];
ElemType val = FN::f(a);
pb[id] = val;
}
// special case of linear unary operation
template<class ElemType>
static void LaunchUnaryTensorOp(ElemType beta, const ElemType * pa, ElemType * pb, ElemType alpha, ElementWiseOperator op, size_t regularOpDim)
{
CUDA_LONG NN = (CUDA_LONG)regularOpDim;
#define CaseLaunchUnaryTensorOp(oper) case ElementWiseOperator::op ## oper: \
if (beta == 0 && alpha == 1) \
return _launchUnaryTensorOp<ElemType,Functor ## oper> << <grid.m_blocksPerGrid, grid.m_threadsPerBlock, 0, t_stream >> >(pa, pb, NN); \
else \
return _launchUnaryTensorOp<ElemType,Functor ## oper> << <grid.m_blocksPerGrid, grid.m_threadsPerBlock, 0, t_stream >> >(beta, pa, pb, alpha, NN);
cudaEvent_t done = nullptr;
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
GridDim grid(NN);
switch (op)
{
ForAllUnaryOps(CaseLaunchUnaryTensorOp);
default: LogicError("LaunchTensorOp1: Unknown op code %d.", (int)op);
}
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
}
// -----------------------------------------------------------------------
// map runtime parameters N to template parameters
// -----------------------------------------------------------------------
// tensor operation with k+1 dimensions (-1 means scalar)
template<class ElemType, C_size_t N, C_int K>
static void TensorOpWithRegularLoop(ElemType beta, const array<ElemType*, N> & pointers, ElemType alpha, ElementWiseOperator op,
const SmallVector<size_t> & regularOpDims, const array<SmallVector<ptrdiff_t>, N> & regularStrides,
const SmallVector<size_t> & reducingOpDims, const array<SmallVector<ptrdiff_t>, N> & reducingStrides)
{
size_t dims = reducingOpDims.size();
switch (dims)
{
case 2: return LaunchTensorOp<ElemType, N, 2, K>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 1: return LaunchTensorOp<ElemType, N, 1, K>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 0: return LaunchTensorOp<ElemType, N, 0, K>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
default: LogicError("TensorOp: %d non-flattened reduction dimensions are not supported.", (C_int)dims);
}
}
// tensor operation, generalized in number of arguments
// This function now expands into different k. It also eliminates the offsets by adding them to the pointers.
template<class ElemType, C_size_t N>
static void TensorOpN(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, ElementWiseOperator op,
const array<size_t, N> & offsets,
const SmallVector<size_t> & regularOpDims, const array<SmallVector<ptrdiff_t>, N> & regularStrides,
const SmallVector<size_t> & reducingOpDims, const array<SmallVector<ptrdiff_t>, N> & reducingStrides)
{
for (C_size_t i = 0; i < N; i++) // N = a small constant, this will be unrolled
pointers[i] += offsets[i];
size_t dims = regularOpDims.size();
switch (dims)
{
case 4: return TensorOpWithRegularLoop<ElemType, N, 4>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 3: return TensorOpWithRegularLoop<ElemType, N, 3>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 2: return TensorOpWithRegularLoop<ElemType, N, 2>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 1: return TensorOpWithRegularLoop<ElemType, N, 1>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 0: return TensorOpWithRegularLoop<ElemType, N, 0>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
default: LogicError("TensorOp: %d non-flattened input dimensions are not supported.", (C_int)dims);
}
}
// -----------------------------------------------------------------------
// entry points from Matrix.cpp
// -----------------------------------------------------------------------
// perform unary operation 'op' on a giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides
// This binds the N-ariness to a template parameter N, and gets the data pointers out from the matrix objects.
template<class ElemType>
@ -4844,6 +4469,30 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (regularOpDims.size() == 1 && regularStrides[0][0] == 1 && regularStrides[1][0] == 1 && reducingOpDims.size() == 0)
return LaunchUnaryTensorOp<ElemType>(beta, a.m_pArray + offsets[0], m_pArray + offsets[1], alpha, op, regularOpDims[0]);
// special case: reducing a matrix onto a column vector; can be done with SGEMM
// Note: A minor risk is that with this, our own reduction function will rarely be used.
// That function was tested to give the same results with 'double', and nearly the same with 'float' (different summation order matters).
else if (op == ElementWiseOperator::opCopy && // we are just adding to target without any further operation
#ifdef _DEBUG
sizeof(ElemType) == sizeof(float) && // in debug don't shortcut 'double' so we have some test of our own codepath
#endif
regularOpDims.size() == 1 && regularStrides[0][0] == 1 && regularStrides[1][0] == 1 && // we are processing a column
reducingOpDims.size() == 1 && reducingStrides[0][0] >= (ptrdiff_t)regularOpDims[0]) // reducing across columns and no overlap
{
assert(reducingStrides[1][0] == 0);
auto ARows = regularOpDims[0]; // vertical steps
auto ACols = reducingOpDims[0]; // horizontal steps (reduction)
auto ALd = reducingStrides[0][0]; // horizontal step width through matrix
cublasHandle_t cuHandle = GetCublasHandle(a.GetComputeDeviceId());
CUBLAS_CALL(cublas_gemm(cuHandle, CUBLAS_OP_N, CUBLAS_OP_N, (int)/*CRows=*/ARows, /*CCols=*/1, (int)ACols, &alpha,
/*A00=*/a.m_pArray + offsets[0], (int)ALd,
/*B00=*/GetOnesVector<ElemType>(ACols, a.GetComputeDeviceId())->m_pArray, (int)/*BRows=*/ACols, &beta,
/*C00=*/m_pArray + offsets[1], (int)/*CRows=*/ARows));
return;
}
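// Why the GEMM shortcut above is valid (illustrative): for an m x k matrix A and a k x 1 vector of
// ones, (A * ones)[i] = sum_j A(i,j), i.e. a GEMM against a ones vector reduces across columns.
// A CPU sketch of the same computation (column-major, leading dimension ALd as above; pa/pc stand
// for the offset source/target pointers used in the call):
//   for (size_t i = 0; i < ARows; i++) {
//       ElemType sum = 0;
//       for (size_t j = 0; j < ACols; j++)
//           sum += pa[i + j * ALd];            // walk across the reduced (column) dimension
//       pc[i] = alpha * sum + beta * pc[i];    // same alpha/beta scaling as the cublas_gemm call
//   }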
// TODO: Add a special case for tensor bias reduction. cudnn is ~7% faster on Image/QuickE2E.
// regular case
else
return TensorOpN<ElemType, 2>(beta, array<ElemType*, 2> { a.m_pArray, m_pArray }, alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
@ -4859,6 +4508,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
a.PrepareDevice();
if (a.GetComputeDeviceId() != GetComputeDeviceId() || b.GetComputeDeviceId() != GetComputeDeviceId())
InvalidArgument("All matrices must be on the same GPU");
return TensorOpN<ElemType, 3>(beta, array<ElemType*, 3> { a.m_pArray, b.m_pArray, m_pArray }, alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
}
@ -4875,7 +4525,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return TensorOpN<ElemType, 4>(beta, array<ElemType*, 4> { a.m_pArray, b.m_pArray, c.m_pArray, m_pArray }, alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
}
// =======================================================================
// explicit instantiations business
// =======================================================================
@ -4886,10 +4535,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template class DeviceBoundNumber<double>;
template<class ElemType>
cublasHandle_t GPUMatrix<ElemType>::s_cuHandle[GPUMatrix<ElemType>::MaxGpus]={0};
cublasHandle_t GPUMatrix<ElemType>::s_cuHandle[GPUMatrix<ElemType>::MaxGpus] = { 0 };
template<class ElemType>
void* GPUMatrix<ElemType>::s_curandGenerator=NULL;
void* GPUMatrix<ElemType>::s_curandGenerator = NULL;
// We use Matrix<char> as the backing store for QuantizedMatrix
// Let's explicitly instantiate the methods we need for that purpose

View file

@ -9,7 +9,7 @@
#include "File.h"
#include "Helpers.h"
#include "CommonMatrix.h"
#include "DataTensor.h" // only for SmallVector; I was hoping to keep this out
#include "TensorShape.h" // only for SmallVector; I was hoping to keep this out
#include "DebugUtil.h"
#include "BestGpu.h" // for CPUONLY macro
#include "ConcStack.h"
@ -47,9 +47,7 @@ typedef struct CUstream_st *cudaStream_t;
void MATH_API SetStream(cudaStream_t stream);
cudaStream_t MATH_API GetStream();
namespace Microsoft {
namespace MSR {
namespace CNTK {
namespace Microsoft { namespace MSR { namespace CNTK {
// -----------------------------------------------------------------------
// DeviceBoundNumber -- This class represents a number which resides on a particular device. Use it to avoid unnecessary transfers between CPU and GPU
@ -506,7 +504,7 @@ namespace Microsoft {
}}}
// Error handling
template<typename ERRTYPE> static const char * CudaErrString(ERRTYPE x);
template<typename ERRTYPE> const char * CudaErrString(ERRTYPE x); // actual error function is defined inside .cu files
template<typename ERRTYPE> static void CudaCall(ERRTYPE retCode, const char * exprString, const char * libName, ERRTYPE successCode)
{
if (retCode != successCode)
@ -523,7 +521,9 @@ template<typename ERRTYPE> static void CudaCall(ERRTYPE retCode, const char * ex
}
}
}
#define CUDA_CALL(expr) (CudaCall((expr), #expr, "CUDA", cudaSuccess))
#define CUBLAS_CALL(expr) (CudaCall((expr), #expr, "CUBLAS", CUBLAS_STATUS_SUCCESS))
#define CUSPARSE_CALL(expr) (CudaCall((expr), #expr, "CUSPARSE", CUSPARSE_STATUS_SUCCESS))
#define CURAND_CALL(expr) (CudaCall((expr), #expr, "CURAND", CURAND_STATUS_SUCCESS))
#define CUDNN_CALL(expr) (CudaCall((expr), #expr, "cuDNN", CUDNN_STATUS_SUCCESS))

View file

@ -4,15 +4,22 @@
// </copyright>
//
#pragma once
#include "BestGpu.h"
#ifndef CPUONLY
#include <float.h>
#include <cuda_runtime.h>
#pragma push_macro("TENSOR_OPS_DECL")
#define TENSOR_OPS_DECL __device__ __host__
#include "CommonMatrix.h"
#include "GPUMatrix.h"
#include "TensorOps.h" // for exp_() etc.
#include "device_functions.h"
#include <cuda_runtime.h>
#include <assert.h>
#include <float.h>
#pragma pop_macro("TENSOR_OPS_DECL")
// REVIEW alexeyk: disable warnings properly for GCC/clang
#ifdef _MSC_VER
@ -36,38 +43,116 @@
#define IDX2C(i,j,ld) (((j)*(ld))+(i)) // 0 based indexing
// CUDA atomicAdd() only exists for 'float'. This is the 'double' version.
static __inline__ __device__ double atomicAdd(double* address, double val)
{
unsigned long long int* address_as_ull = (unsigned long long int*)address;
unsigned long long int old = *address_as_ull, assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
} while (assumed != old);
return __longlong_as_double(old);
}
// TODO: replace this with TensorOps.h LogAdd(). It differs in using ElemType throughout, while this one seems to use 'double' versions of exp() and log().
// The 'k' in the name is to avoid naming conflicts with various versions of logadd() that are defined throughout the codebase.
template<class ElemType>
static inline __device__ __host__ ElemType logaddk(ElemType x, ElemType y)
{
ElemType temp, diff, z;
if (x < y)
{
temp = x; x = y; y = temp;
}
diff = y - x;
if (diff < MINLOGEXP)
{
return (x < LSMALL) ? LZERO : x;
}
else
{
z = exp(diff);
return x + log(1.0 + z);
}
}
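// Usage sketch (illustrative): logaddk computes log(exp(x) + exp(y)) in a numerically stable way,
// so two log-probabilities lp1 = log(p1) and lp2 = log(p2) combine as logaddk(lp1, lp2) = log(p1 + p2)
// without underflow. For example, assuming the usual HTK-style constants (MINLOGEXP around -23),
// logaddk(-1000.0, -1001.0) = -1000.0 + log(1 + exp(-1.0)), about -999.69, whereas computing
// exp(-1000) directly would underflow to 0 in double precision.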
namespace Microsoft { namespace MSR { namespace CNTK {
// ---------------------------------------------------------------------------
// GridDim -- helper to choose the CUDA grid dimensions
// ---------------------------------------------------------------------------
// TODO: move the computation of 'id' here as well
template<class INT, class INT2>
static INT CeilDiv(INT a, INT2 b) // ceil(a/b)
{
return (INT)(((size_t)a + (size_t)b - 1) / (size_t)b); // these size_t casts are necessary since b may be INT_MAX (for maxGridSize[])
}
struct GridDim
{
static const CUDA_LONG maxThreadsPerBlock = 512; // use this many threads per block
static const CUDA_LONG minBlocksPerGrid = 48; // use at least that many blocks --TODO: base this on actual hardware
static const CUDA_LONG maxWarpsPerBlock = 16; // use this many warps per block
// use these for launching
// GridDim grid(NN);
// kernel<<<grid.m_blocksPerGrid, grid.m_threadsPerBlock, ...>>>(...)
int m_blocksPerGrid, m_threadsPerBlock; // (these may in the future be extended to multi-dimensional ones)
CUDA_LONG m_N;
GridDim(CUDA_LONG N) // linear grid
{
m_N = N;
if (N == 0) // CUDA will fail to launch with 0 blocks
N = 1;
m_threadsPerBlock = GridDim::maxThreadsPerBlock;
m_blocksPerGrid = (N + m_threadsPerBlock - 1) / m_threadsPerBlock;
if (m_blocksPerGrid < minBlocksPerGrid)
// get device information
const auto & props = GetDeviceProps();
CUDA_LONG numProcs = props.multiProcessorCount;
CUDA_LONG warpSize = props.warpSize;
// distribute warps evenly over processors
CUDA_LONG warpsPerProc = CeilDiv(N, numProcs * warpSize);
// if too many warps per block then reduce #warps
if (warpsPerProc > maxWarpsPerBlock)
{
// we cannot fill all blocks -> use less threads
m_threadsPerBlock = (N + minBlocksPerGrid - 1) / minBlocksPerGrid;
// round to multiples of 32 (warp size) for efficient memory access
m_threadsPerBlock = (m_threadsPerBlock + 31) / 32 * 32;
m_blocksPerGrid = (N + m_threadsPerBlock - 1) / m_threadsPerBlock;
CUDA_LONG overBy = CeilDiv(warpsPerProc, maxWarpsPerBlock); // we are over by this factor
warpsPerProc = CeilDiv(warpsPerProc, overBy);
}
// put it back together
m_threadsPerBlock = warpsPerProc * warpSize;
m_blocksPerGrid = CeilDiv(N, m_threadsPerBlock);
if (m_blocksPerGrid == 1)
m_threadsPerBlock = N; // don't launch more than necessary --TODO: Does this make a difference at all?
assert(m_blocksPerGrid * m_threadsPerBlock >= N);
}
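// Worked example (illustrative, assuming a device with 24 multiprocessors and warpSize 32):
// for N = 1,000,000 elements, warpsPerProc = CeilDiv(1000000, 24 * 32) = 1303 > maxWarpsPerBlock (16),
// so overBy = CeilDiv(1303, 16) = 82 and warpsPerProc becomes CeilDiv(1303, 82) = 16; hence
// m_threadsPerBlock = 16 * 32 = 512 and m_blocksPerGrid = CeilDiv(1000000, 512) = 1954, which
// covers 1954 * 512 = 1,000,448 >= N threads.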
static std::vector<cudaDeviceProp> CacheDeviceProps()
{
int numDevices;
CUDA_CALL(cudaGetDeviceCount(&numDevices));
std::vector<cudaDeviceProp> props(numDevices);
for (int i = 0; i < numDevices; i++)
CUDA_CALL(cudaGetDeviceProperties(&props[i], i));
#if 1 // on Linux, maxGridSize[0] gets reported as 0
for (int i = 0; i < numDevices; i++)
fprintf(stderr, "%d procs %d warps %d %d %d max grid on %s\n", (int)props[i].multiProcessorCount, (int)props[i].warpSize, (int)props[i].maxGridSize[0], (int)props[i].maxGridSize[1], (int)props[i].maxGridSize[2], props[i].name);
#endif
return props;
}
// get device properties of current device
static const cudaDeviceProp & GetDeviceProps()
{
static std::vector<cudaDeviceProp> props = CacheDeviceProps(); // thread-safe according to C++ standard
int deviceId;
cudaGetDevice(&deviceId);
return props[deviceId];
}
// compute our location on the grid
static __device__ CUDA_LONG GetLinearThreadId()
{
@ -83,9 +168,6 @@ struct GridDim
#define UNUSED_FUNCTION_ATTRIBUTE
#endif
// Predefine this for later.
static __inline__ __device__ double atomicAdd(double* address, double val) UNUSED_FUNCTION_ATTRIBUTE;
// ===========================================================================
// CUDA kernels follow, lots of them
// ===========================================================================
@ -97,18 +179,6 @@ static __inline__ __device__ double atomicAdd(double* address, double val) UNUSE
// (ElemenType *res, CUDA_LONG N), a pointer and length of the output block. Each thread computes a function
// of the inputs for one value in the output.
// This macro overloads _x() with float and double arguments, and inlines the correct library function. This simplifies templated kernel code.
// TODO: merge with similar definition in TensorOps.h
#define DEF_ELEMENT_PRIMITIVE(x) __device__ __forceinline__ float _##x(float f) { return x##f(f); } __device__ __forceinline__ double _##x(double f) { return x(f); }
DEF_ELEMENT_PRIMITIVE(exp)
DEF_ELEMENT_PRIMITIVE(log)
DEF_ELEMENT_PRIMITIVE(tanh)
DEF_ELEMENT_PRIMITIVE(sqrt)
DEF_ELEMENT_PRIMITIVE(fabs)
DEF_ELEMENT_PRIMITIVE(cos)
DEF_ELEMENT_PRIMITIVE(sin)
template<class ElemType>
__global__ void _elementWisePowerOnCuda(
const ElemType alpha,
@ -147,6 +217,7 @@ __global__ void _elementWisePowerOnCuda(
};
// Note that this code is inefficient on CUDA due to diverging code paths.
// Use Sigmoid() in TensorOps.h instead, which solves this problem.
template<class ElemType>
__global__ void _elementWiseSigmoidOnCuda(
const ElemType *a,
@ -159,12 +230,12 @@ __global__ void _elementWiseSigmoidOnCuda(
#else
if (a[id] >= 0)
{
ElemType e = _exp(-a[id]);
ElemType e = exp_(-a[id]);
res[id] = 1 / (1 + e);
}
else
{
ElemType e = _exp(a[id]);
ElemType e = exp_(a[id]);
res[id] = e / (1 + e);
}
#endif
@ -186,7 +257,7 @@ __global__ void _assignSigmoidOf(
res[id] = Microsoft::MSR::CNTK::Sigmoid(a[id]);
#else
ElemType negElem = -a[id];
ElemType e = _exp(negElem);
ElemType e = exp_(negElem);
res[id] = 1 / (e + 1);
#endif
@ -219,7 +290,7 @@ __global__ void _elementWiseTanhOnCuda(
const CUDA_LONG N)
{
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id,N);
res[id] = _tanh(a[id]);
res[id] = tanh_(a[id]);
};
// to prevent negative values caused by floating-point operations, we force inputs to be >= 0
@ -231,7 +302,7 @@ __global__ void _elementWiseSqrtOnCuda(
const CUDA_LONG N)
{
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id,N);
res[id] = _sqrt(max((ElemType)0, a[id]));
res[id] = sqrt_(max((ElemType)0, a[id]));
};
template<class ElemType>
@ -241,7 +312,7 @@ __global__ void _elementWiseExpOnCuda(
const CUDA_LONG N)
{
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id,N);
res[id] = _exp(a[id]);
res[id] = exp_(a[id]);
};
template<class ElemType>
@ -251,7 +322,7 @@ __global__ void _elementWiseLogOnCuda(
const CUDA_LONG N)
{
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id,N);
res[id] = (a[id] < EPS_IN_LOG) ? LOG_OF_EPS_IN_LOG : _log(a[id]);
res[id] = (a[id] < EPS_IN_LOG) ? LOG_OF_EPS_IN_LOG : log_(a[id]);
};
template<class ElemType>
@ -261,7 +332,7 @@ __global__ void _elementWiseAbsOnCuda(
const CUDA_LONG N)
{
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id,N);
res[id] = _fabs(a[id]);
res[id] = fabs_(a[id]);
};
template<class ElemType>
@ -271,7 +342,7 @@ __global__ void _elementWiseCosineOnCuda(
const CUDA_LONG N)
{
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id,N);
res[id] = _cos(a[id]);
res[id] = cos_(a[id]);
};
template<class ElemType>
@ -281,7 +352,7 @@ __global__ void _elementWiseNegativeSineOnCuda(
const CUDA_LONG N)
{
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id,N);
res[id] = -_sin(a[id]);
res[id] = -sin_(a[id]);
};
template<class ElemType>
@ -1210,42 +1281,60 @@ __global__ void _tensorShuffleScaleAndAddRowSparse(
ElemType* cnzValues, //target nz values
GPUSPARSE_INDEX_TYPE* cRowIndex,
GPUSPARSE_INDEX_TYPE* cColCSCIndex,
size_t D, size_t S, size_t M, size_t K, size_t T)
size_t D, size_t S, size_t M, size_t K, size_t T,
size_t nz)
{
CUDA_LONG col = blockDim.x * blockIdx.x + threadIdx.x; // input tensor of dimension (D x S x M x K x T)
if (col >= T)
CUDA_LONG N = blockDim.x * blockIdx.x + threadIdx.x; // input tensor of dimension (D x S x M x K x T)
if (N >= nz || N < aColCSCIndex[0])
return;
size_t N = D * S * M * K;
size_t col;
for (col = 0; col < T; col++)
{
if (aColCSCIndex[col + 1] > N)
break;
}
size_t na = aRowIndex[N];
int start = aColCSCIndex[col];
int end = aColCSCIndex[col + 1];
int current = start;
for (size_t nc = 0; nc < N; nc++)
// recover the 5 indices from the loop counter
size_t d = (na ) % D;
size_t s = (na / D ) % S;
size_t m = (na / D / S ) % M;
size_t k = (na / D / S / M ) % K;
// compute index for the a and b/c tensors
size_t nc = ((s * M + m) * K + k) * D + d; // output tensor of dimension (D x K x M x S): k/K and s/S swapped
int rowIdx = start;
for (size_t na_i = start; na_i < end; na_i++)
{
// recover the 5 indices from the loop counter
size_t d = (nc ) % D;
size_t s = (nc / D ) % S;
size_t m = (nc / D / S ) % M;
size_t k = (nc / D / S / M ) % K;
size_t d_i = (na_i ) % D;
size_t s_i = (na_i / D ) % S;
size_t m_i = (na_i / D / S ) % M;
size_t k_i = (na_i / D / S / M ) % K;
// compute index for the a and b/c tensors
size_t na = ((s * M + m) * K + k) * D + d; // output tensor of dimension (D x K x M x S): k/K and s/S swapped
for (size_t j = start; j < end; j++)
size_t nc_i = ((s_i * M + m_i) * K + k_i) * D + d_i; // output tensor of dimension (D x K x M x S): k/K and s/S swapped
if (nc_i < nc)
{
if (aRowIndex[j] == na)
{
cnzValues[current] = anzValues[j];
cRowIndex[current] = nc;
current++;
break;
}
rowIdx++;
}
}
cColCSCIndex[col] = start;
cColCSCIndex[col + 1] = end;
cnzValues[rowIdx] = anzValues[N];
cRowIndex[rowIdx] = nc;
if (N == nz - 1)
{
for (int i = 0; i <= T; i++)
{
cColCSCIndex[i] = aColCSCIndex[i];
}
}
}
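// Index-mapping sketch (illustrative): within one time step t, an element at coordinates (d, s, m, k)
// has input index  na = d + D*s + D*S*m + D*S*M*k   (tensor laid out as D x S x M x K)
// and output index nc = d + D*k + D*K*m + D*K*M*s   (tensor laid out as D x K x M x S, s and k swapped).
// E.g. with D=2, S=3, M=1, K=4 the element (d=0, s=1, m=0, k=0) moves from na=2 to nc=8.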
template<class ElemType>
@ -2688,25 +2777,82 @@ __global__ void _sparseCSRElemMulDense(
}
}
template<class ElemType>
__global__ void _isValid(
const GPUSPARSE_INDEX_TYPE* rowIndex,
const GPUSPARSE_INDEX_TYPE* colCSCIndex,
const int rows,
const int cols,
const int nz,
long* d_res
)
{
CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
if (id >= cols)
return;
int start = colCSCIndex[id];
int end = colCSCIndex[id + 1];
d_res[0] = 1;
if (start > end)
{
d_res[0] = -1;
d_res[1] = start;
d_res[2] = end;
}
else if (end > nz)
{
d_res[0] = -2;
d_res[1] = end;
d_res[2] = nz;
}
else
{
for (int j = start; j < end; j++) //j points to the value
{
if (rowIndex[j] > rows)
{
d_res[0] = -3;
d_res[1] = rowIndex[j];
d_res[2] = rows;
break;
}
}
}
}
template<class ElemType>
__global__ void _shiftColCSCIndexFromSliceViewToAbsolute(
GPUSPARSE_INDEX_TYPE* colCSCIndex,
const int cols
)
{
CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
if (id >= cols)
return;
colCSCIndex[id] = colCSCIndex[id] - colCSCIndex[0];
}
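// Example of the shift above (illustrative): a column-slice view whose CSC column index array is
// { 5, 8, 12 } (offsets into the parent matrix's nz arrays) becomes { 0, 3, 7 } after subtracting
// colCSCIndex[0], i.e. the offsets are rebased to the copied slice itself.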
//c = alpha * op(a) * op(b) + beta*c
// TODO: This function can be further improved by loading the kernel in shared memory
template<class ElemType>
__global__ void _dense1DConvMultSparseCSCAndWeightedAddToDense(
int m, // rowDense
int k, // colDense
int n, // colSparse
int numChannels, // input num channels
int numSteps, // convolution num steps
int horizontalSubsample,// convolution step size
bool channelwise, // pixelwise for normal multiplication and channelwise for convolution operation
ElemType alpha,
const int m, // rowDense
const int k, // colDense
const int n, // colSparse
const int numChannels, // input num channels
const int numSteps, // convolution num steps
const int horizontalSubsample,// convolution step size
const bool channelwise, // pixelwise for normal multiplication and channelwise for convolution operation
const ElemType alpha,
const ElemType* a, //dense
bool transposeA,
const bool transposeA,
const ElemType* bnzValues, //sparse nz values
const GPUSPARSE_INDEX_TYPE* rowIndex,
const GPUSPARSE_INDEX_TYPE* colCSCIndex,
ElemType beta,
const ElemType beta,
ElemType* c //dense target
)
{
@ -2828,15 +2974,15 @@ __global__ void _reshape(
int currentCol = id;
int oldColLower = (newNumRows * currentCol) / oldNumRows;
int oldColUpper = (newNumRows * (currentCol + 1)) / oldNumRows;
// initialize to the end and then scan in the right direction in the for-loop
int currentColStart = oldColumnIndex[oldNumCols];
for (int oldCol = oldColLower; oldCol <= min(oldColUpper, oldNumCols); oldCol++)
for (int oldCol = oldColLower; oldCol <= oldNumCols; oldCol++)
{
int start = oldColumnIndex[oldCol];
int end = (oldCol < oldNumCols) ? oldColumnIndex[oldCol + 1] : oldColumnIndex[oldNumCols] + 1;
bool done = false;
for (int j = start; j < end; j++) //j points to the value
{
@ -2845,11 +2991,21 @@ __global__ void _reshape(
int newCol = index / newNumRows;
int newRow = index % newNumRows;
newRowIndex[j] = newRow;
if (newCol == currentCol)
newRowIndex[j] = newRow;
if (newCol >= currentCol && currentColStart > j)
currentColStart = j;
if (newCol > currentCol)
{
done = true;
break;
}
}
if (done)
break;
}
newColumnIndex[currentCol] = currentColStart;
@ -3423,7 +3579,7 @@ __global__ void _assignNoiseContrastiveEstimation(
if (positive)
prob = -prob;
ElemType score_noise = log_num_noise_samples + prob;
ElemType z = logadd(tmp[i], score_noise);
ElemType z = logaddk(tmp[i], score_noise);
ElemType logprob = tmp[i] - z;
ElemType logprob_noise = score_noise - z;
tmp[i] = -exp(logprob);
@ -3715,40 +3871,6 @@ __global__ void _normalGradForSparseBlock(
lhsValues[index] = rhs[IDX2C(row, col, numRows)];
}
static __inline__ __device__ double atomicAdd(double* address, double val)
{
unsigned long long int* address_as_ull = (unsigned long long int*)address;
unsigned long long int old = *address_as_ull, assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
} while (assumed != old);
return __longlong_as_double(old);
}
template<class ElemType>
static __inline__ __device__ ElemType logadd(ElemType x, ElemType y)
{
ElemType temp, diff, z;
if (x < y)
{
temp = x; x = y; y = temp;
}
diff = y - x;
if (diff < MINLOGEXP)
{
return (x < LSMALL)?LZERO:x;
}
else
{
z = exp(diff);
return x + log(1.0 + z);
}
}
//This function should be called with 1024 threads per block and 1 block
//THIS IS NOT THE MOST EFFICIENT IMPLEMENTATION!!!
template<class ElemType>
@ -4513,7 +4635,7 @@ __global__ void _rcrfBackwardCompute(
fSum = LZERO;
for (int j = 0; j < iNumLab; j++)
{
fSum = logadd(fSum, alpha[IDX2C(j, t, iNumLab)]);
fSum = logaddk(fSum, alpha[IDX2C(j, t, iNumLab)]);
}
fTmp = alpha[IDX2C(id, t, iNumLab)] - fSum;
@ -4525,10 +4647,10 @@ __global__ void _rcrfBackwardCompute(
fSum = LZERO;
for (int m = 0; m < iNumLab; m++)
{
fSum = logadd(fSum, alpha[IDX2C(m, t, iNumLab)] + pair_scores[IDX2C(j, m, iNumLab)]);
fSum = logaddk(fSum, alpha[IDX2C(m, t, iNumLab)] + pair_scores[IDX2C(j, m, iNumLab)]);
}
fTmp = logadd(fTmp, beta[IDX2C(j, t + 1, iNumLab)] + alpha[IDX2C(id, t, iNumLab)] + pair_scores[IDX2C(j, id, iNumLab)] - fSum);
fTmp = logaddk(fTmp, beta[IDX2C(j, t + 1, iNumLab)] + alpha[IDX2C(id, t, iNumLab)] + pair_scores[IDX2C(j, id, iNumLab)] - fSum);
}
}
@ -4589,7 +4711,7 @@ __global__ void _rcrfBackwardCompute(
{
for (int j = 0; j < iNumLab; j++)
{
fTmp = logadd(fTmp, beta_t1[j] + alpha[id] + pair_scores[j] - zeta[j]);
fTmp = logaddk(fTmp, beta_t1[j] + alpha[id] + pair_scores[j] - zeta[j]);
}
}
@ -4630,9 +4752,9 @@ __global__ void _rcrfBackwardComputeZeta(
for (int m = 0; m < iNumLab; m++)
{
if (t == iNumPos - 1)
fSum = logadd(fSum, alpha[IDX2C(m, 0, iNumLab)]);
fSum = logaddk(fSum, alpha[IDX2C(m, 0, iNumLab)]);
else
fSum = logadd(fSum, alpha[IDX2C(m, 0, iNumLab)] + pair_scores[m]);
fSum = logaddk(fSum, alpha[IDX2C(m, 0, iNumLab)] + pair_scores[m]);
}
gzeta[id] = fSum;
@ -4684,7 +4806,7 @@ __global__ void _rcrfTransGrdComputeZeta(
else
fTmp = alpha[m];
fSum = logadd(fSum, pair_scores[m] + fTmp);
fSum = logaddk(fSum, pair_scores[m] + fTmp);
}
gzeta[id] = fSum;
@ -4787,7 +4909,7 @@ __global__ void _reductionLogAddSum(
{
ElemType lSum = LZERO;
if (tid < s){
lSum = logadd(partialLogAddSum[tid], partialLogAddSum[tid + s]);
lSum = logaddk(partialLogAddSum[tid], partialLogAddSum[tid + s]);
partialLogAddSum[tid] = lSum;
}
}
@ -4912,4 +5034,6 @@ __global__ void _maskColumnsValue(ElemType *a, const char *columnsMask, CUDA_LON
}
}
}}}
#endif // !CPUONLY

View file

@ -34,11 +34,7 @@ static
#endif
cudaStream_t t_stream;
// support for CudaCall() function template
static const char * CudaErrString(cudaError_t x) { cudaDeviceSynchronize(); return cudaGetErrorString(x); }
static const char * CudaErrString(cublasStatus_t) { cudaDeviceSynchronize(); return "(see cublas_api.h & look for cublasStatus_t or CUBLAS_STATUS_xxx)"; }
static const char * CudaErrString(cusparseStatus_t) { cudaDeviceSynchronize(); return "(see cusparse.h & look for cusparseStatus_t or CUSPARSE_STATUS_xxx)"; }
template<> const char * CudaErrString<cusparseStatus_t>(cusparseStatus_t) { cudaDeviceSynchronize(); return "(see cusparse.h & look for cusparseStatus_t or CUSPARSE_STATUS_xxx)"; }
namespace Microsoft { namespace MSR { namespace CNTK {
@ -137,14 +133,29 @@ namespace Microsoft { namespace MSR { namespace CNTK {
ChangeDeviceTo(deepCopy.m_computeDevice);
deepCopy.PrepareDevice();
Resize(deepCopy.m_numRows, deepCopy.m_numCols, deepCopy.m_elemSizeAllocated, deepCopy.m_format, true, false);
Resize(deepCopy.m_numRows, deepCopy.m_numCols, deepCopy.GetNumNZElements(), deepCopy.m_format, true, false);
m_nz = deepCopy.m_nz;
m_sliceViewOffset = 0; // reset to zero as we only start copying starting from the offset in the source matrix
m_sliceViewOffset = 0; // reset to zero as we only start copying the indices starting from the offset in the source matrix
CUDA_CALL(cudaMemcpy(BufferPointer(), deepCopy.BufferPointer(), GetSizeElemAllocated(), cudaMemcpyDeviceToDevice));
CUDA_CALL(cudaMemcpy(MajorIndexLocation(), deepCopy.MajorIndexLocation(), MajorIndexSize(), cudaMemcpyDeviceToDevice));
CUDA_CALL(cudaMemcpy(BufferPointer(), deepCopy.NzValues(), NzSize(), cudaMemcpyDeviceToDevice));
CUDA_CALL(cudaMemcpy(MajorIndexLocation(), deepCopy.MajorIndexLocationWithSliceViewOffset(), MajorIndexSize(), cudaMemcpyDeviceToDevice));
CUDA_CALL(cudaMemcpy(SecondaryIndexLocation(), deepCopy.SecondaryIndexLocation(), SecondaryIndexSize(), cudaMemcpyDeviceToDevice));
if (deepCopy.m_sliceViewOffset > 0)
{
int blocksPerGrid = (int)ceil(1.0*SecondaryIndexCount() / GridDim::maxThreadsPerBlock);
cudaEvent_t done = nullptr;
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
_shiftColCSCIndexFromSliceViewToAbsolute<ElemType> << < blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream >> > (
SecondaryIndexLocation(),
SecondaryIndexCount()
);
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
}
m_externalBuffer = false;
SetMatrixName(deepCopy.m_matrixName);
@ -1002,7 +1013,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template<class ElemType>
void GPUSparseMatrix<ElemType>::ConvolveAndWeightedAdd(ElemType alpha, const GPUMatrix<ElemType>& lhs, const bool transposeA,
const GPUSparseMatrix<ElemType>& rhs, const bool transposeB, ElemType beta, GPUMatrix<ElemType>& c, int numChannels, size_t horizontalSubsample, bool padding, bool channelwise)
const GPUSparseMatrix<ElemType>& rhs, const bool transposeB, ElemType beta, GPUMatrix<ElemType>& c, size_t numChannels, size_t horizontalSubsample, bool padding, bool channelwise)
{
if (lhs.GetComputeDeviceId() != rhs.GetComputeDeviceId() || (lhs.GetComputeDeviceId() != c.GetComputeDeviceId()))
RuntimeError("GPUSparseMatrix<ElemType>::ConvolveAndWeightedAdd: All matrices must be on the same GPU");
@ -1133,7 +1144,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
c.PrepareDevice();
cudaEvent_t done = nullptr;
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
CUDA_LONG N = (CUDA_LONG)c.GetNumCols();
CUDA_LONG N = (CUDA_LONG)c.GetNumNZElements();
int blocksPerGrid = (int)ceil(1.0*N / GridDim::maxThreadsPerBlock);
_tensorShuffleScaleAndAddRowSparse<ElemType> << <blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream >> >(
reinterpret_cast<const ElemType*>(a.BufferPointer()), // source nz values
@ -1142,7 +1153,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
reinterpret_cast<ElemType*>(c.BufferPointer()), // target nz values
c.RowLocation(),
c.ColLocation(),
D, S, M, K, T);
D, S, M, K, T,
c.GetNumNZElements());
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@ -1936,6 +1948,37 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return GPUSparseMatrix<ElemType>::InnerProductOfMatrices(b,a);
}
template<class ElemType>
bool GPUSparseMatrix<ElemType>::IsValid() const
{
if (m_format != MatrixFormat::matrixFormatSparseCSC)
NOT_IMPLEMENTED;
PrepareDevice();
long *res = new long[3];
res[0] = 1;
res[1] = 0;
res[2] = 0;
long *d_res = nullptr;
CUDA_CALL(cudaMalloc((void**)&d_res, sizeof(long) * 3));
CUDA_CALL(cudaMemcpy(d_res, res, sizeof(long) * 3, cudaMemcpyHostToDevice));
cudaEvent_t done = nullptr;
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
int blocksPerGrid = (int)ceil((1.0*SecondaryIndexSize()) / GridDim::maxThreadsPerBlock);
_isValid<ElemType> << <blocksPerGrid, GridDim::maxThreadsPerBlock >> >(MajorIndexLocation(), SecondaryIndexLocation(), GetNumRows(), GetNumCols(), GetNumElemAllocated(), d_res);
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
CUDA_CALL(cudaMemcpy(res, d_res, sizeof(long) * 3, cudaMemcpyDeviceToHost));
if (res[0] == 1)
return true;
else
return false;
}
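// Usage sketch (illustrative; someCscMatrix is a placeholder for an existing CSC-format
// GPUSparseMatrix<ElemType>): IsValid() returns false if any column range is decreasing, runs past
// the allocated nz elements, or references an out-of-range row id, so a debug check could read:
//   if (!someCscMatrix.IsValid())
//       LogicError("GPUSparseMatrix: CSC structure is inconsistent.");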
template<class ElemType>
bool GPUSparseMatrix<ElemType>::AreEqual(const GPUSparseMatrix<ElemType>& a, const GPUSparseMatrix<ElemType>& b,
const ElemType threshold)

View file

@ -73,18 +73,23 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// Special Note: for the matrix may be a read-only column slice view of another
// matrix (only supported for CSC format today) and hence the NzValues needs
// to be offset accordingly.
inline const ElemType* NzValues() const { return m_format != matrixFormatSparseCSC ? m_pArray : m_pArray + SecondaryIndexValueAt(m_sliceViewOffset); }
inline ElemType* NzValues() { return m_format != matrixFormatSparseCSC ? m_pArray : m_pArray + SecondaryIndexValueAt(m_sliceViewOffset); }
inline const ElemType* NzValues() const { return m_format != matrixFormatSparseCSC ? m_pArray : m_pArray + SecondaryIndexValueAt(0); }
inline ElemType* NzValues() { return m_format != matrixFormatSparseCSC ? m_pArray : m_pArray + SecondaryIndexValueAt(0); }
inline size_t NzSize() const { return sizeof(ElemType)*m_nz; } // actual number of element bytes in use
GPUSPARSE_INDEX_TYPE* MajorIndexLocation() const //row/col ids in CSC/CSR format, blockId2col/blockId2row in BlockCol/BlockRow format
{
return (GPUSPARSE_INDEX_TYPE*)(m_pArray + m_elemSizeAllocated);
}
}
GPUSPARSE_INDEX_TYPE* MajorIndexLocationWithSliceViewOffset() const
{
return (MajorIndexLocation() + (m_format == matrixFormatSparseCSC ? SecondaryIndexValueAt(0) : 0));
}
size_t MajorIndexCount() const
{
return MajorIndexCount(m_numRows, m_numCols, m_nz, m_format);
return MajorIndexCount(m_numRows, m_numCols, m_elemSizeAllocated, m_format);
}
size_t MajorIndexCount(const size_t numRows, const size_t numCols, const size_t numNZ, const MatrixFormat format) const
{
@ -98,7 +103,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t MajorIndexSize() const // actual number of major index bytes in use
{
return sizeof(GPUSPARSE_INDEX_TYPE)*MajorIndexCount();
}
}
GPUSPARSE_INDEX_TYPE* SecondaryIndexLocation() const //compressed index, col/row in CSC/CSR format, col2blockId/row2blockId in BlockCol/BlockRow format
{
@ -239,6 +244,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
void ConvertToSparseFormat(MatrixFormat newFormat);
void ConvertToSparseFormat(MatrixFormat newFormat, GPUSparseMatrix<ElemType>& outMatrix) const;
bool IsValid() const;
public:
GPUSparseMatrix<ElemType>& ElementInverse ();
GPUSparseMatrix<ElemType>& AssignElementInverseOf (const GPUSparseMatrix<ElemType>& a);
@ -290,7 +297,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
const bool transposeB, GPUSparseMatrix<ElemType>& c);
static void ScaleAndAdd(const ElemType alpha, const GPUSparseMatrix<ElemType>& lhs, GPUMatrix<ElemType>& c);
static void ConvolveAndWeightedAdd(ElemType alpha, const GPUMatrix<ElemType>& lhs, const bool transposeA, const GPUSparseMatrix<ElemType>& rhs,
const bool transposeB, ElemType beta, GPUMatrix<ElemType>& c, int numChannels, size_t horizontalSubsample, bool padding, bool channelwise);
const bool transposeB, ElemType beta, GPUMatrix<ElemType>& c, size_t numChannels, size_t horizontalSubsample, bool padding, bool channelwise);
static void TensorShuffleScaleAndAdd(ElemType keepWeight, const GPUSparseMatrix<ElemType>& a, size_t D, size_t S, size_t M, size_t K, size_t T, ElemType scaleFactor, const GPUSparseMatrix<ElemType>& b, GPUSparseMatrix<ElemType>& c);
void NormalGrad(GPUMatrix<ElemType>& c, const ElemType momentum);

693
Source/Math/GPUTensor.cu Normal file
View file

@ -0,0 +1,693 @@
//
// <copyright file="GPUMatrix.cu" company="Microsoft">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//
#include "stdafx.h"
#include "Basics.h"
#include "BestGpu.h"
#ifndef CPUONLY
#include "GPUTensor.h"
#include "GPUMatrix.h"
#include "GPUMatrixCUDAKernels.cuh"
#include "CommonMatrix.h"
#define TENSOR_OPS_DECL __device__ __host__
#include "TensorOps.h"
#include <cuda.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"
#include <assert.h>
#ifndef let
#define let const auto
#endif
#pragma comment (lib, "cudart.lib") // instruct linker to reference these libs
#pragma comment (lib, "cublas.lib")
#pragma warning (disable: 4267) // conversion from 'size_t' to 'unsigned int'; happens in CUDA <<<a,b>>> syntax if a and b are size_t
#pragma warning (disable: 4127) // conditional expression is constant; "if (sizeof(ElemType)==sizeof(float))" triggers this
#pragma warning (disable: 4702) // unreachable code; triggered for unknown reasons
extern bool do_sync;
#ifdef _WIN32
// thread local storage to access the current stream, initialize to default stream
__declspec (thread)
#endif
extern cudaStream_t t_stream;
namespace Microsoft { namespace MSR { namespace CNTK {
// =======================================================================
// TensorView support
// =======================================================================
// To save time, this makes extensive use of templates and macros.
// -----------------------------------------------------------------------
// simple fixed-size arrays for passing dimension information by value
// since CUDA can't just take our std::array and std::vector
// -----------------------------------------------------------------------
template<typename T, size_t N>
struct FixedArray
{
T m_data[N];
__device__ __host__ size_t size() const { return N; }
__device__ __host__ T & operator[](size_t n) { return m_data[n]; }
__device__ __host__ T operator[](size_t n) const { return m_data[n]; }
template<class VEC> FixedArray(const VEC & data) // construct from CPU-side STL array or vector
{
assert(data.size() == N);
for (size_t n = 0; n < N; n++)
{
m_data[n] = (T)data[n];
if (m_data[n] != data[n]) // overflow check
InvalidArgument("FixedArray: Dimensions out of range, too few bits.");
}
}
};
template<typename T> // specialized version for 0 elements
struct FixedArray<T, 0>
{
__device__ __host__ size_t size() const { return 0; }
template<class VEC> FixedArray(const VEC & data) { assert(data.size() == 0); UNUSED(data); }
FixedArray() { }
};
template<typename T, size_t N, size_t K> // N = which input/output; K = index depth
struct FixedMatrix
{
T m_data[N][K];
__device__ __host__ size_t getNumRows() const { return N; }
__device__ __host__ size_t getNumCols() const { return K; }
__device__ __host__ T & operator()(size_t n, size_t k) { return m_data[n][k]; }
__device__ __host__ T operator()(size_t n, size_t k) const { return m_data[n][k]; }
template<typename U> FixedMatrix(const array<SmallVector<U>, N> & data) // construct from CPU-side array of vectors
{
assert(data.size() == N);
for (size_t n = 0; n < N; n++)
{
assert(data[n].size() == K);
for (size_t k = 0; k < K; k++)
{
m_data[n][k] = (T)data[n][k];
if (m_data[n][k] != data[n][k]) // overflow check
InvalidArgument("FixedArray: Dimensions out of range, too few bits.");
}
}
}
};
template<typename T, size_t N> // specialized version for 0 elements
struct FixedMatrix<T, N, 0>
{
__device__ __host__ size_t getNumRows() const { return N; }
__device__ __host__ size_t getNumCols() const { return 0; }
template<typename U> FixedMatrix(const array<SmallVector<U>, N> & data) { assert(data.size() == N); for (size_t n = 0; n < N; n++) assert(data[n].size() == 0); UNUSED(data); }
FixedMatrix() { }
};
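// Construction sketch (illustrative; someKernel is a placeholder): the host side converts STL
// containers into these fixed-size POD types so they can be passed to a kernel by value, e.g.
//   std::vector<size_t> dims = { 13, 42 };
//   FixedArray<C_unsigned_int, 2> opStrides(dims);   // copies and range-checks each element
//   someKernel<<<grid, block>>>(opStrides, ...);     // plain struct copy into kernel parameter space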
// -----------------------------------------------------------------------
// function to actually compute a function of (N-1) inputs based on the opcode
// -----------------------------------------------------------------------
template<class ElemType>
struct TensorOps
{
static __device__ ElemType Compute(const FixedArray<ElemType*, 1> & pointers, ElementWiseOperator op)
{
#define CaseNullaryTensorOp(oper) case ElementWiseOperator::op ## oper: return Op ## oper<ElemType>()
switch (op)
{
ForAllNullaryOps(CaseNullaryTensorOp);
default: return OpConstOne<ElemType>(); // (failure--we only have one nullary op, so use the same, maybe it will eliminate the switch altogether)
}
}
static __device__ ElemType Compute(const FixedArray<ElemType*, 2> & pointers, ElementWiseOperator op)
{
ElemType a = *(pointers[0]);
#define CaseUnaryTensorOp(oper) case ElementWiseOperator::op ## oper: return Op ## oper(a)
switch (op)
{
ForAllUnaryOps(CaseUnaryTensorOp);
default: return 0; // (failure)
}
}
static __device__ ElemType Compute(const FixedArray<ElemType*, 3> & pointers, ElementWiseOperator op)
{
//const ElemType & a = *(pointers[0]); // const & for opIndex--costs quite some code bloat
ElemType a = *(pointers[0]);
ElemType b = *(pointers[1]);
#define CaseBinaryTensorOp(oper) case ElementWiseOperator::op ## oper: return Op ## oper(a,b)
switch (op)
{
ForAllBinaryOps(CaseBinaryTensorOp); // note: this costs about 6% compared to having only a single case
default: return 0; // (failure)
}
}
static __device__ ElemType Compute(const FixedArray<ElemType*, 4> & pointers, ElementWiseOperator op)
{
ElemType a = *(pointers[0]);
ElemType b = *(pointers[1]);
ElemType c = *(pointers[2]);
#define CaseTernaryTensorOp(oper) case ElementWiseOperator::op ## oper: return Op ## oper(a,b,c)
switch (op)
{
ForAllTernaryOps(CaseTernaryTensorOp);
default: return 0; // (failure)
}
}
};
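// Dispatch sketch (illustrative): each Case*TensorOp macro expands into one switch case; e.g.
// CaseBinaryTensorOp(Sum) -- assuming an op named Sum in the ForAllBinaryOps list -- becomes
//   case ElementWiseOperator::opSum: return OpSum(a, b);
// so the opcode passed in at run time selects an inlined element-wise function from TensorOps.h.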
// -----------------------------------------------------------------------
// function to compute the value for a given output location (this version performs reduction if needed)
// -----------------------------------------------------------------------
//#define ReduceElemType double
#define ReduceElemType ElemType
template<class ElemType, C_size_t N, C_int M, C_int m>
struct TensorOpReduce
{
// this version for m >= 0
static __device__ ElemType Compute(FixedArray<ElemType*, N> pointers, ElementWiseOperator op,
const FixedArray<C_unsigned_int, M> & reducingOpDims, const FixedMatrix<C_int, N, M> & reducingStrides)
{
// start with index 0
// We may use 'double' since we are memory-bound anyway.
ReduceElemType aggregate = TensorOpReduce<ElemType, N, M, m - 1>::Compute(pointers, op, reducingOpDims, reducingStrides);
// apply this index to the pointers
C_size_t dim = reducingOpDims[m];
for (C_size_t k = 1/*done with k=0 already*/; k < dim; k++)
{
// bump the pointers
for (C_size_t i = 0; i < N - 1; i++) // N-1 because output is not used here
pointers[i] += reducingStrides(i,(C_size_t)m);
ElemType val = TensorOpReduce<ElemType, N, M, m - 1>::Compute(pointers, op, reducingOpDims, reducingStrides);
aggregate += val;
}
return (ElemType)aggregate;
}
};
// this one terminates the template recursion over reduction dimensions
// The pointers are pointing to the input element.
template<class ElemType, C_size_t N, C_int M>
struct TensorOpReduce<ElemType, N, M, /*m=*/-1>
{
// this version for m = -1
// the pointers are pointing to the right location(s) to take the operation over
static __device__ ElemType Compute(FixedArray<ElemType*, N> pointers, ElementWiseOperator op,
const FixedArray<C_unsigned_int, M> & /*reducingOpDims*/, const FixedMatrix<C_int, N, M> & /*reducingStrides*/)
{
return TensorOps<ElemType>::Compute(pointers, op); // finally computing something!
}
};
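// Editor's sketch (hedged; not in the original source): what the recursion above computes,
// restated as plain nested loops for the M == 2 case:
//
//     ReduceElemType aggregate = 0;
//     for (C_size_t i1 = 0; i1 < reducingOpDims[1]; i1++)
//         for (C_size_t i0 = 0; i0 < reducingOpDims[0]; i0++)
//         {
//             auto p = pointers;                   // local copy, bumped per index combination
//             for (C_size_t i = 0; i < N - 1; i++) // inputs only; the output is not reduced over
//                 p[i] += i1 * reducingStrides(i, 1) + i0 * reducingStrides(i, 0);
//             aggregate += TensorOps<ElemType>::Compute(p, op);
//         }
//     // the caller then scales by alpha and combines with beta * output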
// -----------------------------------------------------------------------
// function to compute one constituent of the value for a given output location (this version has reduction done outside)
// -----------------------------------------------------------------------
template<class ElemType, C_size_t N, C_int M, C_int m>
struct TensorOpParallelReduce
{
// this version for m >= 0
static __device__ ElemType Compute(CUDA_LONG id, FixedArray<ElemType*, N> pointers, ElementWiseOperator op,
const FixedArray<C_unsigned_int, M> & reducingOpDims, const FixedMatrix<C_int, N, M> & reducingStrides)
{
// map id (location on grid) to index[k]
C_size_t stride = 1; // compute the stride. This seems expensive, but since we only currently support M <= 2, this is just compile-time selection between 1 and reducingOpDims[0].
for (int i = 0; i < m; i++)
stride *= reducingOpDims[(C_size_t)i];
C_size_t index = id / stride; // this dimension. For m=0, the stride is 1 and hence the division will be removed at compile time.
id = id % stride; // remaining dimensions inside this. For m=0 this value is ignored and hence not even computed.
// apply this index to the pointers
for (C_size_t i = 0; i < N - 1; i++)
pointers[i] += index * reducingStrides(i, (C_size_t)m); // now this dimension is taken care of
return TensorOpParallelReduce<ElemType, N, M, m - 1>::Compute(id, pointers, op, reducingOpDims, reducingStrides);
}
};
// this one terminates the template recursion over reduction dimensions
// The pointers are pointing to the input element.
template<class ElemType, C_size_t N, C_int M>
struct TensorOpParallelReduce<ElemType, N, M, /*m=*/-1>
{
// this version for m = -1
// the pointers are pointing to the right location(s) to take the operation over
static __device__ ElemType Compute(CUDA_LONG /*id*/, FixedArray<ElemType*, N> pointers, ElementWiseOperator op,
const FixedArray<C_unsigned_int, M> & /*reducingOpDims*/, const FixedMatrix<C_int, N, M> & /*reducingStrides*/)
{
return TensorOps<ElemType>::Compute(pointers, op); // finally computing something!
}
};
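// Worked example (editor's illustration, hedged): with M == 2 and reducingOpDims = { 4, 3 },
// a reduction index id in [0, 12) is decomposed top-down as
//     m = 1: stride = reducingOpDims[0] = 4,  index1 = id / 4,  id = id % 4
//     m = 0: stride = 1,                      index0 = id
// e.g. id = 7 -> index1 = 1, index0 = 3; each index then bumps the input pointers by
// index * reducingStrides(i, m), matching the sequential version above.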
// -----------------------------------------------------------------------
// perform loop over regular index k for N-nary operations (N counting the output)
// -----------------------------------------------------------------------
// The canonical case, vector op without reduction, is this PTX function:
// _ZN9Microsoft3MSR4CNTK15_launchTensorOpIfLi3ELi0ELi1EEEvT_NS1_10FixedArrayIPS3_XT0_EEES3_NS1_19ElementWiseOperatorENS4_IiXT2_EEENS1_11FixedMatrixIiXT0_EXT2_EEENS4_IiXT1_EEENS9_IiXT0_EXT1_EEEi
// float ^ ^ aggregate loop
// args? ^ ^ input dims
// _ZN9Microsoft3MSR4CNTK15_launchTensorOpIfLi2ELi0ELi1EEEvT_NS1_10FixedArrayIPS3_XT0_EEES3_NS1_19ElementWiseOperatorENS4_IiXT2_EEENS1_11FixedMatrixIiXT0_EXT2_EEENS4_IiXT1_EEENS9_IiXT0_EXT1_EEEi
// The 'pointers' only refer to a single element, so we will bump them in-place to perform indexing.
template<class ElemType, C_size_t N, C_int M, C_int K, bool parallelReduce, C_int k>
struct TensorOpElement
{
// template-recursive version loops over indices
static __device__ void Compute(CUDA_LONG id, ElemType beta, FixedArray<ElemType*, N> & pointers, ElemType alpha, ElementWiseOperator op,
const FixedArray<C_unsigned_int, K> & regularOpStrides, const FixedMatrix<C_int, N, K> & regularStrides,
const FixedArray<C_unsigned_int, M> & reducingOpDims, const FixedMatrix<C_int, N, M> & reducingStrides,
CUDA_LONG reductionBegin, CUDA_LONG reductionChunkSize)
{
// map id (location on grid) to index[k]
C_size_t stride = regularOpStrides[(C_size_t)k];
C_size_t index = id / stride; // this dimension
id = id % stride; // remaining dimensions inside this
// apply this index to the pointers
for (C_size_t i = 0; i < N; i++)
pointers[i] += index * regularStrides(i,(C_size_t)k); // now this dimension is taken care of
// process the previous index
TensorOpElement<ElemType, N, M, K, parallelReduce, k - 1>::Compute(id, beta, pointers, alpha, op, regularOpStrides, regularStrides, reducingOpDims, reducingStrides, reductionBegin, reductionChunkSize);
}
};
// specialization for k=0 where op stride is guaranteed to be 1
template<class ElemType, C_size_t N, C_int M, C_int K, bool parallelReduce>
struct TensorOpElement<ElemType, N, M, K, parallelReduce, /*k=*/0>
{
// template-recursive version loops over indices
static __device__ void Compute(CUDA_LONG id, ElemType beta, FixedArray<ElemType*, N> & pointers, ElemType alpha, ElementWiseOperator op,
const FixedArray<C_unsigned_int, K> & regularOpStrides, const FixedMatrix<C_int, N, K> & regularStrides,
const FixedArray<C_unsigned_int, M> & reducingOpDims, const FixedMatrix<C_int, N, M> & reducingStrides,
CUDA_LONG reductionBegin, CUDA_LONG reductionChunkSize)
{
// map id (location on grid) to index[k]
C_size_t index = id; // this dimension
// apply this index to the pointers
for (C_size_t i = 0; i < N; i++)
pointers[i] += index * regularStrides(i,0); // now this dimension is taken care of
// process the previous index
TensorOpElement<ElemType, N, M, K, parallelReduce, -1>::Compute(/*id*/0, beta, pointers, alpha, op, regularOpStrides, regularStrides, reducingOpDims, reducingStrides, reductionBegin, reductionChunkSize);
}
};
//// apply beta and alpha and save
//template<class ElemType, class PointersType>
//static __device__ void SetFinalValue(ElemType val, ElemType beta, const PointersType & pointers, ElemType alpha)
//{
// // scale
// val *= alpha;
// // combine with previous value in target matrix, then write it out
// auto * pout = pointers[pointers.size() - 1];
// if (beta != 0)
// val += beta * *pout;
// // save
// *pout = val;
//}
// specialization for k = -1 terminates the template recursion, and computes reductions in a for loop
template<class ElemType, C_size_t N, C_int M, C_int K>
struct TensorOpElement<ElemType, N, M, K, /*parallelReduce=*/false, /*k=*/-1>
{
// template-recursion-terminating version computes the actual value for this output location
// now the output pointers point to the right element (input pointers may still iterate for reduction)
static __device__ void Compute(CUDA_LONG /*id*/, ElemType beta, FixedArray<ElemType*, N> & pointers, ElemType alpha, ElementWiseOperator op,
const FixedArray<C_unsigned_int, K> & /*regularOpStrides*/, const FixedMatrix<C_int, N, K> & /*regularStrides*/,
const FixedArray<C_unsigned_int, M> & reducingOpDims, const FixedMatrix<C_int, N, M> & reducingStrides, CUDA_LONG /*reductionBegin*/, CUDA_LONG /*reductionChunkSize*/)
{
// compute the operation for this output coordinate
// This may still involve a reduction over inverse-broadcasting dimensions.
ElemType val = TensorOpReduce<ElemType, N, M, M - 1>::Compute(pointers, op, reducingOpDims, reducingStrides);
// scale
val *= alpha;
// combine with previous value in target matrix, then write it out
auto * pout = pointers[pointers.size() - 1];
if (beta != 0)
val += beta * *pout;
// save
*pout = val;
}
};
// specialization for k = -1 terminates the template recursion, and computes reductions in parallel
template<class ElemType, C_size_t N, C_int M, C_int K>
struct TensorOpElement<ElemType, N, M, K, /*parallelReduce=*/true, /*k=*/-1>
{
// template-recursion-terminating version computes the actual value for this output location
// now the output pointers point to the right element (input pointers may still iterate for reduction)
static __device__ void Compute(CUDA_LONG /*id*/, ElemType beta, FixedArray<ElemType*, N> & pointers, ElemType alpha, ElementWiseOperator op,
const FixedArray<C_unsigned_int, K> & /*regularOpStrides*/, const FixedMatrix<C_int, N, K> & /*regularStrides*/,
const FixedArray<C_unsigned_int, M> & reducingOpDims, const FixedMatrix<C_int, N, M> & reducingStrides, CUDA_LONG reductionBegin, CUDA_LONG reductionChunkSize)
{
CUDA_LONG reductionBlock = blockIdx.z; // block index --larger reductions are split into blocks
CUDA_LONG reductionBlocks = gridDim.z; // number of blocks
CUDA_LONG tid = threadIdx.x; // thread index
CUDA_LONG tids = blockDim.x; // out of how many threads --note: last block is partial
// determine our range --this is a single int mul, we can stomach it (we could alternatively pass in yet another parameter)
CUDA_LONG reductionDim = (CUDA_LONG)reducingOpDims[0];
for (C_size_t i = 1; i < reducingOpDims.size(); i++)
reductionDim *= reducingOpDims[i];
// determine the redId range that we operate on
// Each thread takes a stride tid + (multiples of tids) within this range.
reductionBegin += reductionChunkSize * reductionBlock;
CUDA_LONG reductionEnd = min(reductionBegin + reductionChunkSize, reductionDim);
// compute the operation for this input coordinate
ReduceElemType sum = 0;
for (CUDA_LONG redId = reductionBegin + tid; redId < reductionEnd; redId += tids)
{
auto val = TensorOpParallelReduce<ElemType, N, M, M - 1>::Compute(redId, pointers, op, reducingOpDims, reducingStrides);
sum += val;
}
// reduce --cf https://docs.nvidia.com/cuda/samples/6_Advanced/reduction/doc/reduction.pdf
__shared__ ReduceElemType accumulators[GridDim::maxThreadsPerBlock/*tids*/];
accumulators[tid] = sum;
__syncthreads();
static_assert(GridDim::maxThreadsPerBlock <= 512, "GridDim::maxThreadsPerBlock too large, need to add manually unrolled steps");
for (CUDA_LONG i = 256; i; i >>= 1)
{
if (tid < i && tid + i < tids) accumulators[tid] += accumulators[tid + i];
if (0 + i < tids) __syncthreads(); // sync if condition true for at least one thread
// TODO: use volatile* and then we can skip the __syncthreads() for the last 32 values
}
// now set final value to output coordinate
if (tid == 0)
{
ElemType val = (ElemType)accumulators[0];
// scale
val *= alpha;
// combine with previous value in target matrix, then write it out
auto * pout = pointers[pointers.size() - 1];
if (reductionBlocks > 1) // multiple blocks: need to use atomicAdd()
{
// in this case, outer calling code must pass beta = 1
val = atomicAdd(pout, val);
}
else
{
if (beta != 0)
val += beta * *pout;
// save
*pout = val;
}
}
}
};
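// A minimal, self-contained sketch of the same block-level tree reduction pattern
// (editor's illustration with made-up names; NOT the CNTK kernel above, which additionally
// handles partial blocks, chunked reductions, and the beta/alpha combination):
//
//     #define SKETCH_MAX_THREADS 512
//     __global__ void BlockSumSketch(const float* in, float* out, int n)
//     {
//         __shared__ float acc[SKETCH_MAX_THREADS];
//         int tid = threadIdx.x;
//         float sum = 0;
//         for (int i = blockIdx.x * blockDim.x + tid; i < n; i += blockDim.x * gridDim.x)
//             sum += in[i];                              // strided load, analogous to the redId loop above
//         acc[tid] = sum;
//         __syncthreads();
//         for (int s = blockDim.x / 2; s > 0; s >>= 1)   // assumes blockDim.x is a power of two
//         {
//             if (tid < s)
//                 acc[tid] += acc[tid + s];
//             __syncthreads();
//         }
//         if (tid == 0)
//             atomicAdd(out, acc[0]);                    // one partial sum per block
//     }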
// -----------------------------------------------------------------------
// kernel and launch --no reduction
// -----------------------------------------------------------------------
// launch tensor op with CUDA
template<class ElemType, C_size_t N, C_int M, C_int K>
__global__ void _launchTensorOp(ElemType beta, FixedArray<ElemType*, N> pointers, ElemType alpha, ElementWiseOperator op,
FixedArray<C_unsigned_int, K> regularOpStrides, FixedMatrix<C_int, N, K> regularStrides, CUDA_LONG numElements,
FixedArray<C_unsigned_int, M> reducingOpDims, FixedMatrix<C_int, N, M> reducingStrides)
{
CUDA_LONG id = GridDim::GetLinearThreadId();
if (id < numElements) // note: there are no __syncthreads() calls inside
TensorOpElement<ElemType, N, M, K, false, K - 1>::Compute(id, beta, pointers, alpha, op, regularOpStrides, regularStrides, reducingOpDims, reducingStrides, 0, 0);
}
template<class ElemType, C_size_t N, C_int K>
static void LaunchTensorOp(ElemType beta, array<ElemType*, N> pointerVector, ElemType alpha, ElementWiseOperator op,
const SmallVector<size_t> & regularOpDims, const array<SmallVector<ptrdiff_t>, N> & regularStrideVectors)
{
// copy all parameters to CUDA-compatible data structures
FixedArray<ElemType*, N> pointers(pointerVector);
SmallVector<C_size_t> regularOpStrideVector; // kernel needs the strides for converting thread index back to multi-dimensional tensor index
C_size_t numElements = 1;
for (C_size_t k = 0; k < regularOpDims.size(); k++)
{
regularOpStrideVector.push_back(numElements);
numElements *= (C_size_t)regularOpDims[k];
}
FixedArray<C_unsigned_int, K> regularOpStrides(regularOpStrideVector);
FixedMatrix<C_int, N, K> regularStrides(regularStrideVectors);
FixedArray<C_unsigned_int, /*M=*/0> reducingOpDims; // empty reduction dimensions
FixedMatrix<C_int, N, /*M=*/0> reducingStrides;
// launch the kernel
CUDA_LONG NN = (CUDA_LONG)numElements; // linear space identifying each individual input element
cudaEvent_t done = nullptr;
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
GridDim grid(NN);
_launchTensorOp<ElemType, N, /*M=*/0, K> << <grid.m_blocksPerGrid, grid.m_threadsPerBlock, 0, t_stream >> >(beta, pointers, alpha, op, regularOpStrides, regularStrides, grid.m_N, reducingOpDims, reducingStrides);
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
}
// -----------------------------------------------------------------------
// kernel and launch --with reduction
// -----------------------------------------------------------------------
template<class ElemType, C_size_t N, C_int M, C_int K>
__global__ void _launchTensorOpWithReduction(ElemType beta, FixedArray<ElemType*, N> pointers, ElemType alpha, ElementWiseOperator op,
FixedArray<C_unsigned_int, K> regularOpStrides, FixedMatrix<C_int, N, K> regularStrides, CUDA_LONG numElements,
FixedArray<C_unsigned_int, M> reducingOpDims, FixedMatrix<C_int, N, M> reducingStrides, CUDA_LONG reductionBegin, CUDA_LONG reductionChunkSize)
{
CUDA_LONG id = gridDim.x * blockIdx.y + blockIdx.x; // input dimensions are Y dimension of blocks in this case, so we can use thread dim for shared-memory/parallelization
if (id < numElements) // note: we have __syncthreads() calls inside, but 'id' depends only on the block index, so the branch is uniform within each block and this is OK
TensorOpElement<ElemType, N, M, K, true, K - 1>::Compute(id, beta, pointers, alpha, op, regularOpStrides, regularStrides, reducingOpDims, reducingStrides, reductionBegin, reductionChunkSize);
}
// All dimensions (N-ariness, number of input dimensions K and number of reduction dimensions M) are bound to template parameters now.
template<class ElemType, C_size_t N, C_int M, C_int K>
static void LaunchTensorOpWithReduction(ElemType beta, array<ElemType*, N> pointerVector, ElemType alpha, ElementWiseOperator op,
const SmallVector<size_t> & regularOpDims, const array<SmallVector<ptrdiff_t>, N> & regularStrideVectors,
const SmallVector<size_t> & reducingOpDimVector, const array<SmallVector<ptrdiff_t>, N> & reducingStrideVectors)
{
// copy all parameters to CUDA-compatible data structures
FixedArray<ElemType*, N> pointers(pointerVector);
SmallVector<C_size_t> regularOpStrideVector; // kernel needs the strides for converting thread index back to multi-dimensional tensor index
C_size_t numElements = 1;
for (C_size_t k = 0; k < regularOpDims.size(); k++)
{
regularOpStrideVector.push_back(numElements);
numElements *= (C_size_t)regularOpDims[k];
}
FixedArray<C_unsigned_int, K> regularOpStrides(regularOpStrideVector);
FixedMatrix<C_int, N, K> regularStrides(regularStrideVectors);
FixedArray<C_unsigned_int, M> reducingOpDims(reducingOpDimVector);
FixedMatrix<C_int, N, M> reducingStrides(reducingStrideVectors);
// launch the kernel
CUDA_LONG NN = (CUDA_LONG)numElements; // linear space identifying each individual input element
cudaEvent_t done = nullptr;
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
// do some optimization for reductions
// Cases:
// - #output elements >= GPU procs --> use one proc per element, do reduction in inner loop
// - reduction dimension fits into a single kernel --> launch it that way
// - reduction dimension requires multiple kernels --> use atomic add, to avoid temp mem alloc
// - PlusNode: reducing to a bias for small matrices
// - ScaleNode: big elementwise product reduced to a scalar (dot product)
// - E.g. 3072 GPU procs:
// If >= 3072 reduced output values must be computed, just loop inside.
// If less, and reduction per value does not fit into a single proc,
// then we break it into procs, say, 24.
// This way we will need 24 atomicAdd()s of 3072/24 = 128 values.
// If reduction is along stride=1, then we'd have 24 atomicAdd()s of 32 coalesced writes.
// Does not sound scary at all.
// Precondition: a matrix cannot participate in both the reduction and the regular operation at the same time.
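// Worked example (editor's addition; the numbers are hypothetical): suppose NN = 10 output values,
// reductionDim = 4096, props.multiProcessorCount = 24, and maxThreadsPerBlock = 512. Then
//     numReductionChunks  = CeilDiv(24, 10)   = 3
//     reductionChunkSize  = CeilDiv(4096, 3)  = 1366
//     numThreadsX         = min(1366, 512)    = 512
// with numBlocksX * numBlocksY = 10 and numBlocksZ = 3, i.e. 30 blocks in total; each block's
// 512 threads stride over its ~1366-element chunk, and the 3 chunks per output value are
// combined via atomicAdd(), which is why those launches pass beta = 1.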
C_size_t reductionDim = 1; // number of elements to reduce over
for (C_size_t k = 0; k < reducingOpDimVector.size(); k++)
reductionDim *= (C_size_t)reducingOpDimVector[k];
let & props = GridDim::GetDeviceProps();
GridDim grid(NN);
if (reductionDim > 1 && grid.m_blocksPerGrid < props.multiProcessorCount /* && NN == 10 && reductionDim <= GridDim::maxThreadsPerBlock*/)
{
// we are reducing and are underutilizing the multiprocs we have: get more parallelism by doing reduction in parallel
// Change of strategy: All NN elements get their own block. Reduction gets split over blocks as well.
// By how much do we underutilize?
// We increase #blocks by that factor by breaking reduction into that many chunks.
let numReductionChunks = CeilDiv(props.multiProcessorCount, NN);
// NN may be too large for a single dimension
let blockXOverBy = CeilDiv(NN, props.maxGridSize[0]);
let numBlocksX = CeilDiv(NN, blockXOverBy);
let numBlocksY = CeilDiv(NN, numBlocksX);
let numBlocksZ = numReductionChunks;
// Block dim is now:
// - X, Y: such that X*Y covers NN
// - Z: reduction chunks
// reduction goes into thread dim X
let reductionChunkSize = CeilDiv(reductionDim, numReductionChunks);
let numThreadsX = min(reductionChunkSize, GridDim::maxThreadsPerBlock); // any that's over will be done by looping inside the kernel
if (beta == 1 || numBlocksZ == 1)
{
_launchTensorOpWithReduction<ElemType, N, M, K> << <dim3(numBlocksX, numBlocksY, numBlocksZ), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream >> >(/*beta=*/1, pointers, alpha, op, regularOpStrides, regularStrides, NN, reducingOpDims, reducingStrides, 0, reductionChunkSize);
}
else
{
// We need more than one chunk, we will use atomicAdd().
// First reset/pre-multiply input; then do the remaining chunks using atomicAdd().
_launchTensorOpWithReduction<ElemType, N, M, K> << <dim3(numBlocksX, numBlocksY, 1), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream >> >(beta, pointers, alpha, op, regularOpStrides, regularStrides, NN, reducingOpDims, reducingStrides, 0, reductionChunkSize);
_launchTensorOpWithReduction<ElemType, N, M, K> << <dim3(numBlocksX, numBlocksY, numBlocksZ - 1), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream >> >(/*beta=*/1, pointers, alpha, op, regularOpStrides, regularStrides, NN, reducingOpDims, reducingStrides, reductionChunkSize, reductionChunkSize);
}
}
else
{
// we got enough elements to generate: do one element per thread, and reduction inside
_launchTensorOp<ElemType, N, M, K> << <grid.m_blocksPerGrid, grid.m_threadsPerBlock, 0, t_stream >> >(beta, pointers, alpha, op, regularOpStrides, regularStrides, grid.m_N, reducingOpDims, reducingStrides);
}
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
}
// -----------------------------------------------------------------------
// kernel and launch --linear unary
// -----------------------------------------------------------------------
// for linear unary ops, we need to define a functor for every function for use as a template parameter (lambda syntax doesn't work in CUDA 7)
#define DefineUnaryTensorFunctor(oper) \
struct Functor ## oper { template<class ElemType> static __device__ ElemType f(ElemType a) { return Op ## oper(a); } };
ForAllUnaryOps(DefineUnaryTensorFunctor);
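// For illustration (editor's note, hedged): DefineUnaryTensorFunctor(Exp) expands to
//     struct FunctorExp { template<class ElemType> static __device__ ElemType f(ElemType a) { return OpExp(a); } };
// which is then passed as the FN template argument of _launchUnaryTensorOp below, so the
// functor call is resolved at compile time instead of dispatching on 'op' per element.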
// the top-level kernel for linear unary ops
// Note: If we have a beta, we have 2 memory accesses, so this optimization may no longer be needed as we are memory-bound.
template<class ElemType, class FN>
__global__ void _launchUnaryTensorOp(ElemType beta, const ElemType * pa, ElemType * pb, ElemType alpha, CUDA_LONG numElements)
{
CUDA_LONG id = GridDim::GetLinearThreadId();
if (id >= numElements)
return;
ElemType a = pa[id];
ElemType val = FN::f(a);
val *= alpha;
if (beta != 0)
val += beta * pb[id];
pb[id] = val;
}
// version without beta and alpha
template<class ElemType, class FN>
__global__ void _launchUnaryTensorOp(const ElemType * pa, ElemType * pb, CUDA_LONG numElements)
{
CUDA_LONG id = GridDim::GetLinearThreadId();
if (id >= numElements)
return;
ElemType a = pa[id];
ElemType val = FN::f(a);
pb[id] = val;
}
// special case of linear unary operation
template<class ElemType>
void LaunchUnaryTensorOp(ElemType beta, const ElemType * pa, ElemType * pb, ElemType alpha, ElementWiseOperator op, size_t regularOpDim)
{
//////if (op == 1)fprintf(stderr, "LaunchUnaryTensorOp: %d", (int)__LINE__);
CUDA_LONG NN = (CUDA_LONG)regularOpDim;
#define CaseLaunchUnaryTensorOp(oper) case ElementWiseOperator::op ## oper: \
if (beta == 0 && alpha == 1) \
return _launchUnaryTensorOp<ElemType,Functor ## oper> << <grid.m_blocksPerGrid, grid.m_threadsPerBlock, 0, t_stream >> >(pa, pb, NN); \
else \
return _launchUnaryTensorOp<ElemType,Functor ## oper> << <grid.m_blocksPerGrid, grid.m_threadsPerBlock, 0, t_stream >> >(beta, pa, pb, alpha, NN);
cudaEvent_t done = nullptr;
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
GridDim grid(NN);
switch (op)
{
ForAllUnaryOps(CaseLaunchUnaryTensorOp);
default: LogicError("LaunchUnaryTensorOp: Unknown op code %d.", (int)op);
}
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
}
// -----------------------------------------------------------------------
// map runtime parameters N to template parameters
// -----------------------------------------------------------------------
// tensor operation with k+1 dimensions (-1 means scalar)
template<class ElemType, C_size_t N, C_int K>
static void TensorOpWithRegularLoop(ElemType beta, const array<ElemType*, N> & pointers, ElemType alpha, ElementWiseOperator op,
const SmallVector<size_t> & regularOpDims, const array<SmallVector<ptrdiff_t>, N> & regularStrides,
const SmallVector<size_t> & reducingOpDims, const array<SmallVector<ptrdiff_t>, N> & reducingStrides)
{
size_t dims = reducingOpDims.size();
switch (dims)
{
case 2: return LaunchTensorOpWithReduction<ElemType, N, 2, K>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 1: return LaunchTensorOpWithReduction<ElemType, N, 1, K>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 0: return LaunchTensorOp<ElemType, N, K>(beta, pointers, alpha, op, regularOpDims, regularStrides);
default: LogicError("TensorOp: %d non-flattened reduction dimensions are not supported.", (C_int)dims);
}
}
// tensor operation, generalized in number of arguments
// This function now expands into different k. It also eliminates the offsets by adding them to the pointers.
template<class ElemType, C_size_t N>
void TensorOpN(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, ElementWiseOperator op,
const array<size_t, N> & offsets,
const SmallVector<size_t> & regularOpDims, const array<SmallVector<ptrdiff_t>, N> & regularStrides,
const SmallVector<size_t> & reducingOpDims, const array<SmallVector<ptrdiff_t>, N> & reducingStrides)
{
for (C_size_t i = 0; i < N; i++) // N = a small constant, this will be unrolled
pointers[i] += offsets[i];
size_t dims = regularOpDims.size();
switch (dims)
{
case 4: return TensorOpWithRegularLoop<ElemType, N, 4>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 3: return TensorOpWithRegularLoop<ElemType, N, 3>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 2: return TensorOpWithRegularLoop<ElemType, N, 2>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 1: return TensorOpWithRegularLoop<ElemType, N, 1>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 0: return TensorOpWithRegularLoop<ElemType, N, 0>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
default: LogicError("TensorOp: %d non-flattened input dimensions are not supported.", (C_int)dims);
}
}
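// Hedged host-side usage sketch (editor's illustration; the device pointers are made up, and
// this is not a verbatim copy of how GPUMatrix::TensorOp prepares its arguments):
// elementwise c = a + b over 1000 dense elements, no reduction:
//
//     array<float*, 3> pointers = { aDev, bDev, cDev };   // inputs first, output last
//     array<size_t, 3> offsets  = { 0, 0, 0 };
//     SmallVector<size_t> regularOpDims;  regularOpDims.push_back(1000);   // one flattened loop dim
//     array<SmallVector<ptrdiff_t>, 3> regularStrides;
//     for (size_t i = 0; i < 3; i++) regularStrides[i].push_back(1);       // dense, stride 1
//     SmallVector<size_t> reducingOpDims;                                  // empty: nothing to reduce
//     array<SmallVector<ptrdiff_t>, 3> reducingStrides;                    // all empty
//     TensorOpN<float, 3>(/*beta=*/0, pointers, /*alpha=*/1, ElementWiseOperator::opSum, offsets,
//                         regularOpDims, regularStrides, reducingOpDims, reducingStrides);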
//------------------------------------------------------------------------
// explicit instantiations--these are being called from GPUMatrix.cu
//------------------------------------------------------------------------
template void TensorOpN<float, 2>(float beta, array<float*, 2> pointers, float alpha, ElementWiseOperator op,
const array<size_t, 2> & offsets,
const SmallVector<size_t> & regularOpDims, const array<SmallVector<ptrdiff_t>, 2> & regularStrides,
const SmallVector<size_t> & reducingOpDims, const array<SmallVector<ptrdiff_t>, 2> & reducingStrides);
template void TensorOpN<float, 3>(float beta, array<float*, 3> pointers, float alpha, ElementWiseOperator op,
const array<size_t, 3> & offsets,
const SmallVector<size_t> & regularOpDims, const array<SmallVector<ptrdiff_t>, 3> & regularStrides,
const SmallVector<size_t> & reducingOpDims, const array<SmallVector<ptrdiff_t>, 3> & reducingStrides);
template void TensorOpN<float, 4>(float beta, array<float*, 4> pointers, float alpha, ElementWiseOperator op,
const array<size_t, 4> & offsets,
const SmallVector<size_t> & regularOpDims, const array<SmallVector<ptrdiff_t>, 4> & regularStrides,
const SmallVector<size_t> & reducingOpDims, const array<SmallVector<ptrdiff_t>, 4> & reducingStrides);
template void TensorOpN<double, 2>(double beta, array<double*, 2> pointers, double alpha, ElementWiseOperator op,
const array<size_t, 2> & offsets,
const SmallVector<size_t> & regularOpDims, const array<SmallVector<ptrdiff_t>, 2> & regularStrides,
const SmallVector<size_t> & reducingOpDims, const array<SmallVector<ptrdiff_t>, 2> & reducingStrides);
template void TensorOpN<double, 3>(double beta, array<double*, 3> pointers, double alpha, ElementWiseOperator op,
const array<size_t, 3> & offsets,
const SmallVector<size_t> & regularOpDims, const array<SmallVector<ptrdiff_t>, 3> & regularStrides,
const SmallVector<size_t> & reducingOpDims, const array<SmallVector<ptrdiff_t>, 3> & reducingStrides);
template void TensorOpN<double, 4>(double beta, array<double*, 4> pointers, double alpha, ElementWiseOperator op,
const array<size_t, 4> & offsets,
const SmallVector<size_t> & regularOpDims, const array<SmallVector<ptrdiff_t>, 4> & regularStrides,
const SmallVector<size_t> & reducingOpDims, const array<SmallVector<ptrdiff_t>, 4> & reducingStrides);
template void LaunchUnaryTensorOp(float beta, const float * pa, float * pb, float alpha, ElementWiseOperator op, size_t regularOpDim);
template void LaunchUnaryTensorOp(double beta, const double * pa, double * pb, double alpha, ElementWiseOperator op, size_t regularOpDim);
}}}
#endif // CPUONLY

Source/Math/GPUTensor.h (new file, 30 lines added)
Просмотреть файл

@ -0,0 +1,30 @@
//
// <copyright file="GPUTensor.h" company="Microsoft">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//
#pragma once
#include "CommonMatrix.h"
#include "TensorShape.h" // only for SmallVector; I was hoping to keep this out
#include "GPUMatrixCUDAKernels.cuh"
#include <array>
namespace Microsoft { namespace MSR { namespace CNTK {
// GPUMatrix::TensorOp() interfaces with actual tensor code through these two functions, which are independent of the GPUMatrix class
#define C_size_t CUDA_LONG
#define C_int CUDA_LONG
#define C_unsigned_int CUDA_LONG
template<class ElemType, C_size_t N>
void TensorOpN(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, ElementWiseOperator op,
const array<size_t, N> & offsets,
const SmallVector<size_t> & regularOpDims, const array<SmallVector<ptrdiff_t>, N> & regularStrides,
const SmallVector<size_t> & reducingOpDims, const array<SmallVector<ptrdiff_t>, N> & reducingStrides);
template<class ElemType>
void LaunchUnaryTensorOp(ElemType beta, const ElemType * pa, ElemType * pb, ElemType alpha, ElementWiseOperator op, size_t regularOpDim);
}}}

Просмотреть файл

@ -156,7 +156,7 @@
</ProjectReference>
</ItemDefinitionGroup>
<ItemGroup>
<ClInclude Include="..\Common\Include\DataTensor.h" />
<ClInclude Include="..\Common\Include\TensorShape.h" />
<ClInclude Include="..\Common\Include\File.h" />
<ClInclude Include="..\Common\Include\fileutil.h" />
<ClInclude Include="..\Common\Include\DebugUtil.h" />

Просмотреть файл

@ -1,9 +1,7 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<ClCompile Include="dllmain.cpp" />
<ClCompile Include="Matrix.cpp" />
<ClCompile Include="stdafx.cpp" />
<ClCompile Include="..\Common\File.cpp">
<Filter>Common</Filter>
</ClCompile>
@ -25,22 +23,31 @@
<ClCompile Include="MatrixQuantizerCPU.cpp">
<Filter>CPU\1bitSGD</Filter>
</ClCompile>
<ClCompile Include="MatrixQuantizer.cpp" />
<ClCompile Include="QuantizedMatrix.cpp" />
<ClCompile Include="CUDAPageLockedMemAllocator.cpp">
<Filter>GPU\1bitSGD</Filter>
</ClCompile>
<ClCompile Include="ConvolutionEngine.cpp" />
<ClCompile Include="TensorView.cpp">
<Filter>Tensors</Filter>
</ClCompile>
<ClCompile Include="dllmain.cpp">
<Filter>Misc</Filter>
</ClCompile>
<ClCompile Include="ConvolutionEngine.cpp">
<Filter>Convolution</Filter>
</ClCompile>
<ClCompile Include="stdafx.cpp">
<Filter>Misc</Filter>
</ClCompile>
<ClCompile Include="QuantizedMatrix.cpp">
<Filter>1bitSGD</Filter>
</ClCompile>
<ClCompile Include="MatrixQuantizer.cpp">
<Filter>1bitSGD</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="CommonMatrix.h" />
<ClInclude Include="Helpers.h" />
<ClInclude Include="Matrix.h" />
<ClInclude Include="stdafx.h" />
<ClInclude Include="targetver.h" />
<ClInclude Include="..\Common\Include\File.h">
<Filter>Common\Include</Filter>
</ClInclude>
@ -59,23 +66,40 @@
<ClInclude Include="MatrixQuantizerCPU.h">
<Filter>CPU\1bitSGD</Filter>
</ClInclude>
<ClInclude Include="MatrixQuantizer.h" />
<ClInclude Include="QuantizedMatrix.h" />
<ClInclude Include="MemAllocator.h" />
<ClInclude Include="CUDAPageLockedMemAllocator.h">
<Filter>GPU\1bitSGD</Filter>
</ClInclude>
<ClInclude Include="..\Common\Include\DebugUtil.h" />
<ClInclude Include="ConvolutionEngine.h" />
<ClInclude Include="TensorView.h">
<Filter>Tensors</Filter>
</ClInclude>
<ClInclude Include="TensorOps.h">
<Filter>Tensors</Filter>
</ClInclude>
<ClInclude Include="..\Common\Include\DataTensor.h">
<ClInclude Include="..\Common\Include\TensorShape.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="Helpers.h">
<Filter>Misc</Filter>
</ClInclude>
<ClInclude Include="..\Common\Include\DebugUtil.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="ConvolutionEngine.h">
<Filter>Convolution</Filter>
</ClInclude>
<ClInclude Include="stdafx.h">
<Filter>Misc</Filter>
</ClInclude>
<ClInclude Include="targetver.h">
<Filter>Misc</Filter>
</ClInclude>
<ClInclude Include="QuantizedMatrix.h">
<Filter>1bitSGD</Filter>
</ClInclude>
<ClInclude Include="MatrixQuantizer.h">
<Filter>1bitSGD</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<None Include="GPUMatrix.h">
@ -113,5 +137,14 @@
<Filter Include="Tensors">
<UniqueIdentifier>{70fb07cf-603e-4444-bc10-f0add4920fd2}</UniqueIdentifier>
</Filter>
<Filter Include="Misc">
<UniqueIdentifier>{62b92193-92d0-4e5b-8c3e-67ffd01a98c0}</UniqueIdentifier>
</Filter>
<Filter Include="Convolution">
<UniqueIdentifier>{3a49e94d-14ee-4ca1-a56e-a1472206a076}</UniqueIdentifier>
</Filter>
<Filter Include="1bitSGD">
<UniqueIdentifier>{546cacbd-253e-485b-8c8c-8b9ee0e2f631}</UniqueIdentifier>
</Filter>
</ItemGroup>
</Project>

Просмотреть файл

@ -157,7 +157,9 @@ if exist "$(CuDnnDll)" (xcopy /Y "$(CuDnnDll)" $(OutputPath))
<ClInclude Include="cudalatticeops.h" />
<ClInclude Include="cudalib.h" />
<ClInclude Include="CuDnnConvolutionEngine.h" />
<ClInclude Include="GPUTensor.h" />
<ClInclude Include="latticefunctionskernels.h" />
<ClInclude Include="TensorOps.h" />
<ClInclude Include="ValueQuantizer.h" />
<None Include="GPUWatcher.h">
<FileType>CppHeader</FileType>
@ -171,6 +173,10 @@ if exist "$(CuDnnDll)" (xcopy /Y "$(CuDnnDll)" $(OutputPath))
<ClInclude Include="targetver.h" />
</ItemGroup>
<ItemGroup>
<CudaCompile Include="GPUTensor.cu">
<InterleaveSourceInPTX Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</InterleaveSourceInPTX>
<Keep Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</Keep>
</CudaCompile>
<CudaCompile Include="cudalatticeops.cu">
<FileType>CppCode</FileType>
</CudaCompile>
@ -202,7 +208,7 @@ if exist "$(CuDnnDll)" (xcopy /Y "$(CuDnnDll)" $(OutputPath))
<CudaCompile Include="GPUMatrix.cu">
<FileType>CppCode</FileType>
<Keep Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</Keep>
<InterleaveSourceInPTX Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</InterleaveSourceInPTX>
<InterleaveSourceInPTX Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</InterleaveSourceInPTX>
</CudaCompile>
<CudaCompile Include="GPUMatrixCUDAKernels.cuh">
<ExcludedFromBuild>true</ExcludedFromBuild>

Просмотреть файл

@ -22,25 +22,28 @@
<CudaCompile Include="GPUMatrixCUDAKernels.cuh">
<Filter>GPU</Filter>
</CudaCompile>
<CudaCompile Include="GPUTensor.cu">
<Filter>GPU\Tensors</Filter>
</CudaCompile>
</ItemGroup>
<ItemGroup>
<ClCompile Include="stdafx.cpp" />
<ClCompile Include="cudalattice.cpp">
<Filter>GPU\SequenceTraining</Filter>
</ClCompile>
<ClCompile Include="cudalib.cpp">
<Filter>GPU\SequenceTraining</Filter>
</ClCompile>
<ClCompile Include="..\Common\DebugUtil.cpp" />
<ClCompile Include="..\Common\DebugUtil.cpp">
<Filter>Misc</Filter>
</ClCompile>
<ClCompile Include="stdafx.cpp">
<Filter>Misc</Filter>
</ClCompile>
<ClCompile Include="CuDnnConvolutionEngine.cpp">
<Filter>GPU</Filter>
<Filter>GPU\Convolution</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="CommonMatrix.h" />
<ClInclude Include="Helpers.h" />
<ClInclude Include="stdafx.h" />
<ClInclude Include="targetver.h" />
<ClInclude Include="..\Common\Include\File.h">
<Filter>Common\Include</Filter>
</ClInclude>
@ -80,8 +83,26 @@
<ClInclude Include="latticefunctionskernels.h">
<Filter>GPU\SequenceTraining</Filter>
</ClInclude>
<ClInclude Include="GPUTensor.h">
<Filter>GPU\Tensors</Filter>
</ClInclude>
<ClInclude Include="Helpers.h">
<Filter>Misc</Filter>
</ClInclude>
<ClInclude Include="stdafx.h">
<Filter>Misc</Filter>
</ClInclude>
<ClInclude Include="targetver.h">
<Filter>Misc</Filter>
</ClInclude>
<ClInclude Include="CommonMatrix.h">
<Filter>from Math</Filter>
</ClInclude>
<ClInclude Include="CuDnnConvolutionEngine.h">
<Filter>GPU</Filter>
<Filter>GPU\Convolution</Filter>
</ClInclude>
<ClInclude Include="TensorOps.h">
<Filter>from Math</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
@ -105,14 +126,23 @@
<Filter Include="GPU">
<UniqueIdentifier>{cc9a219d-d8ab-484a-b253-fd2a29ad7c7c}</UniqueIdentifier>
</Filter>
<Filter Include="Include">
<UniqueIdentifier>{3c982109-64b1-469a-8d85-2abdf12d636a}</UniqueIdentifier>
</Filter>
<Filter Include="GPU\1bitSGD">
<UniqueIdentifier>{3415233d-9ef7-41c6-abbb-cec1b4f8d14c}</UniqueIdentifier>
</Filter>
<Filter Include="GPU\SequenceTraining">
<UniqueIdentifier>{6a3569b1-6c9e-47b3-870f-bb581349e75e}</UniqueIdentifier>
</Filter>
<Filter Include="Misc">
<UniqueIdentifier>{3c982109-64b1-469a-8d85-2abdf12d636a}</UniqueIdentifier>
</Filter>
<Filter Include="GPU\Tensors">
<UniqueIdentifier>{16214e65-2d24-4e4c-a0dd-c37e505bda32}</UniqueIdentifier>
</Filter>
<Filter Include="from Math">
<UniqueIdentifier>{b1b59e2e-5c54-4e40-ad0a-1523ddeb63ba}</UniqueIdentifier>
</Filter>
<Filter Include="GPU\Convolution">
<UniqueIdentifier>{3155488f-128f-494e-858d-459b4cc9fab7}</UniqueIdentifier>
</Filter>
</ItemGroup>
</Project>

Просмотреть файл

@ -3152,6 +3152,21 @@ namespace Microsoft { namespace MSR { namespace CNTK {
);
}
template<class ElemType>
bool Matrix<ElemType>::IsValid() const
{
if (m_currentDataLocation == CurrentDataLocation::GPU && GetMatrixType() == MatrixType::SPARSE)
{
return this->m_GPUSparseMatrix->IsValid();
}
else
{
NOT_IMPLEMENTED;
}
return false;
}
template<class ElemType>
bool Matrix<ElemType>::IsEqualTo(const Matrix<ElemType>& a, const ElemType threshold /*= 1e-8*/) const
{
@ -4321,7 +4336,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
/// <param name="c">Resulting matrix, user is responsible for allocating this</param>
template<class ElemType>
void Matrix<ElemType>::ConvolveAndWeightedAdd(ElemType alpha, const Matrix<ElemType>& a, const bool transposeA, const Matrix<ElemType>& b, const bool transposeB,
ElemType beta, Matrix<ElemType>& c, int numChannels, size_t horizontalSubsample, bool padding, bool channelwise)
ElemType beta, Matrix<ElemType>& c, size_t numChannels, size_t horizontalSubsample, bool padding, bool channelwise)
{
DecideAndMoveToRightDevice(a, b, c);

Просмотреть файл

@ -13,7 +13,7 @@
#include "Basics.h"
#include "File.h"
#include "CommonMatrix.h"
#include "DataTensor.h" // only for SmallVector; I was hoping to keep this out
#include "TensorShape.h" // only for SmallVector; I was hoping to keep this out
#include <limits.h>
#include <memory> // for shared_ptr
#include <array>
@ -348,7 +348,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Matrix<ElemType>& AssignPositiveAndShiftedNegSample(const Matrix<ElemType>& a, const size_t posNumber, const size_t negNumber, const size_t shiftNumber);
Matrix<ElemType>& AddFoldedPositiveAndShiftedNegSample(const Matrix<ElemType>& a, const size_t posNumber, const size_t negNumber, const size_t shiftNumber);
bool IsValid() const;
bool IsEqualTo(const Matrix<ElemType>& a, const ElemType threshold = 1e-8) const;
static void VectorSum(const Matrix<ElemType>& a, Matrix<ElemType>& c, const bool isColWise);
@ -437,7 +438,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
static void Multiply(const Matrix<ElemType>& a, const bool transposeA, const Matrix<ElemType>& b, const bool transposeB, Matrix<ElemType>& c);
static void Multiply(const Matrix<ElemType>& a, const Matrix<ElemType>& b, Matrix<ElemType>& c);
static void Multiply1x1AndWeightedAdd(ElemType alpha, const Matrix<ElemType>& a, const Matrix<ElemType>& b, ElemType beta, Matrix<ElemType>& c);
static void ConvolveAndWeightedAdd(ElemType alpha, const Matrix<ElemType>& a, const bool transposeA, const Matrix<ElemType>& b, const bool transposeB, ElemType beta, Matrix<ElemType>& c, int numChannels, size_t horizontalSubsample, bool padding, bool channelwise);
static void ConvolveAndWeightedAdd(ElemType alpha, const Matrix<ElemType>& a, const bool transposeA, const Matrix<ElemType>& b, const bool transposeB, ElemType beta, Matrix<ElemType>& c, size_t numChannels, size_t horizontalSubsample, bool padding, bool channelwise);
static void ScaleAndAdd(ElemType alpha, const Matrix<ElemType>& a, Matrix<ElemType>& c);
static void ScaleAndAdd(ElemType alpha, const Matrix<ElemType>& a, ElemType beta, Matrix<ElemType>& c);

Просмотреть файл

@ -13,7 +13,7 @@
#include "GPUSparseMatrix.h"
#include "MatrixQuantizerGPU.h"
#include "CuDnnConvolutionEngine.h"
#include "DataTensor.h"
#include "TensorShape.h"
#pragma warning (disable: 4100) // unreferenced formal parameter, which is OK since all functions in here are dummies; disabling this allows to copy-paste prototypes here when we add new functions
#pragma warning (disable: 4702) // unreachable code, which we get from the NOT_IMPLEMENTED macro which is OK
@ -368,10 +368,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template<class ElemType> void GPUSparseMatrix<ElemType>::ConvertToSparseFormat(MatrixFormat newFormat) {}
template<class ElemType> void GPUSparseMatrix<ElemType>::ConvertToSparseFormat(MatrixFormat newFormat, GPUSparseMatrix<ElemType>& outMatrix) const {}
template<class ElemType> void GPUSparseMatrix<ElemType>::ConvolveAndWeightedAdd(ElemType alpha, const GPUMatrix<ElemType>& lhs, const bool transposeA, const GPUSparseMatrix<ElemType>& rhs, const bool transposeB, ElemType beta, GPUMatrix<ElemType>& c, int numChannels, size_t horizontalSubsample, bool padding, bool channelwise) { };
template<class ElemType> void GPUSparseMatrix<ElemType>::ConvolveAndWeightedAdd(ElemType alpha, const GPUMatrix<ElemType>& lhs, const bool transposeA, const GPUSparseMatrix<ElemType>& rhs, const bool transposeB, ElemType beta, GPUMatrix<ElemType>& c, size_t numChannels, size_t horizontalSubsample, bool padding, bool channelwise) { };
template<class ElemType> void GPUSparseMatrix<ElemType>::TensorShuffleScaleAndAdd(ElemType keepWeight, const GPUSparseMatrix<ElemType>& a, size_t D, size_t S, size_t M, size_t K, size_t T, ElemType scaleFactor, const GPUSparseMatrix<ElemType>& b, GPUSparseMatrix<ElemType>& c) { }
template<class ElemType> void GPUSparseMatrix<ElemType>::Reshape(const size_t numRows, const size_t numCols) { }
template<class ElemType> bool GPUSparseMatrix<ElemType>::IsValid() const { return true; }
template<class ElemType> template <class OutType, class InType>
void GPUSparseMatrix<ElemType>::CopyBuffer(OutType * outBuffer, const InType * inBuffer, const size_t size){}

Просмотреть файл

@ -25,18 +25,22 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// -----------------------------------------------------------------------
// unified overloads for float/double math functions
//
// Declare float and double versions of the functions f we need as f_(),
// e.g. exp_ -> exp(double), expf(float).
// Declare float and double versions of the functions x we need as x_().
// This macro overloads x_() with float and double arguments, and inlines the correct library function,
// e.g. exp_ -> exp(double), expf(float). This simplifies templated kernel code.
// -----------------------------------------------------------------------
#pragma push_macro("OverloadUnaryMathFns")
#define OverloadUnaryMathFns(func) \
DECL float func ## _(float arg) { return func ## f(arg); } \
DECL double func ## _(double arg) { return func(arg); }
#define OverloadUnaryMathFns(x) DECL float x ## _(float f) { return x ## f(f); } DECL double x ## _(double f) { return x(f); }
OverloadUnaryMathFns(exp);
OverloadUnaryMathFns(log);
OverloadUnaryMathFns(tanh);
OverloadUnaryMathFns(sqrt);
OverloadUnaryMathFns(fabs);
OverloadUnaryMathFns(cos);
OverloadUnaryMathFns(sin);
OverloadUnaryMathFns(fabs); OverloadUnaryMathFns(sqrt);
OverloadUnaryMathFns(exp); OverloadUnaryMathFns(log);
OverloadUnaryMathFns(tanh); OverloadUnaryMathFns(cos); OverloadUnaryMathFns(sin);
#pragma push_macro("OverloadUnaryMathFns")
// -----------------------------------------------------------------------
@ -46,6 +50,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template<class ElemType>
DECL ElemType Sigmoid(ElemType z)
{
#if 1 // BUGBUG: Numerically bad. But if I don't use this, results change.
ElemType negElem = -z;
ElemType e = exp_(negElem);
return 1 / (e + 1);
#else
#if 1 // Efficient implementation that avoids two divergent CUDA code paths that both compute exp() [jdroppo]. This version compiles to PTX without branches.
ElemType q = exp_(-fabs_(z));
ElemType numer;
@ -62,6 +72,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
ElemType v = exp_(z);
return v / (1 + v);
}
#endif
#endif
}
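// Editor's note (hedged): the branch-free path above computes, with q = exp_(-fabs_(z)),
//     Sigmoid(z) = 1 / (1 + q)   for z >= 0   (since q = e^-z)
//     Sigmoid(z) = q / (1 + q)   for z <  0   (since q = e^z and e^z / (1 + e^z) = 1 / (1 + e^-z))
// so the single exp_() never receives a positive argument and cannot overflow; the
// currently-enabled branch evaluates exp_(-z) directly, whose argument can be large and
// positive for strongly negative z.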
@ -85,7 +96,25 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return sqrt_(z > 0 ? z : 0);
}
// TODO: call this LogAdd() for consistency
template<class ElemType>
DECL ElemType ClippedLog(ElemType z)
{
return z < EPS_IN_LOG ? LOG_OF_EPS_IN_LOG : log_(z);
}
template<class ElemType>
DECL ElemType ClippedQuotient(ElemType a, ElemType b)
{
if (fabs(b) < EPS_IN_INVERSE) // clip the denominator
{
if (b > 0)
b = EPS_IN_INVERSE;
else
b = -EPS_IN_INVERSE;
}
return a / b;
}
template<typename ElemType>
DECL ElemType LogAdd(ElemType x, ElemType y)
{
@ -105,37 +134,59 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
}
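// Editor's note (hedged): the LogAdd body is elided by the diff hunk boundary above; it
// conventionally computes the numerically stable log-sum-exp,
//     LogAdd(x, y) = max(x, y) + log_(1 + exp_(min(x, y) - max(x, y)))
// so the exp_() argument is always <= 0 and cannot overflow.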
template<class ElemType> DECL ElemType Sqr(ElemType z) { return z * z; }
// IndexElement reindexes a tensor along one dimension.
// For the indexed dimension, the tensor op is prepared by setting 'a' to be broadcasting along the indexed dimension.
// I.e. pa = &a points to the first element (as if index == 0).
// This function then must now adjust the address:
// pa <- pa + stride * index
// The stride is passed in as third parameter.
//template<class ElemType> DECL ElemType IndexElement(const ElemType & a, ElemType b, int stride) { const ElemType * pa = &a; return pa[stride * (ptrdiff_t)b]; }
// -----------------------------------------------------------------------
// ElementWiseOperator implementations
//
// Define a static function for every ElementWiseOperator (CommonMatrix.h).
// -----------------------------------------------------------------------
#pragma push_macro("DefNullaryOp")
#define DefNullaryOp(op, expr) template<class ElemType> DECL ElemType Op ## op() { return expr; }
DefNullaryOp(ConstOne, 1);
#pragma pop_macro("DefNullaryOp")
#pragma push_macro("DefUnaryOp")
#define DefUnaryOp(op, expr) template<class ElemType> DECL ElemType Op ## op(ElemType a) { return expr; }
DefUnaryOp(Copy, a);
DefUnaryOp(Negate, -a); DefUnaryOp(Not, !a);
DefUnaryOp(Abs, fabs_(a));
DefUnaryOp(Sigmoid, Sigmoid(a)); DefUnaryOp(SigmoidDerivative, SigmoidDerivative(a)); DefUnaryOp(Tanh, tanh_(a)); DefUnaryOp(Sqrt, Sqrt(a)); DefUnaryOp(Exp, exp_(a)); DefUnaryOp(Log, log_(a)); DefUnaryOp(LinearRectifierDerivative, LinearRectifierDerivative(a)); DefUnaryOp(Cosine, cos_(a)); DefUnaryOp(NegativeSine, -sin_(a));
DefUnaryOp(Sigmoid, Sigmoid(a)); DefUnaryOp(Tanh, tanh_(a)); DefUnaryOp(Sqrt, Sqrt(a)); DefUnaryOp(Exp, exp_(a)); DefUnaryOp(Log, ClippedLog(a)); DefUnaryOp(LinearRectifier, a > 0 ? a : 0); DefUnaryOp(Cosine, cos_(a));
#pragma pop_macro("DefUnaryOp")
// parameterized unary ops
//DefUnaryOp(SaturateBetaAlpha); DefUnaryOp(SumAlpha); DefUnaryOp(SubDifferenceToAlpha); DefUnaryOp(SubDifferenceFromAlpha);
#pragma push_macro("DefBinaryOp")
#define DefBinaryOp(op, expr) template<class ElemType> DECL ElemType Op ## op(ElemType a, ElemType b) { return expr; }
//#define DefBinaryOp(op, expr) template<class ElemType> DECL ElemType Op ## op(const ElemType & a, ElemType b, int i = 0) { UNUSED(i); return expr; }
DefBinaryOp(Sum, a + b); DefBinaryOp(Difference, a - b); DefBinaryOp(ElementwiseProduct, a * b); DefBinaryOp(ElementwiseQuotient, a / b);
DefBinaryOp(Sum, a + b); DefBinaryOp(Difference, a - b); DefBinaryOp(ElementwiseProduct, a * b); DefBinaryOp(ElementwiseQuotient, ClippedQuotient(a, b));
DefBinaryOp(LogSum, LogAdd(a, b)); DefBinaryOp(Max, a > b ? a : b); DefBinaryOp(Min, a < b ? a : b);
DefBinaryOp(EQ, a == b); DefBinaryOp(NE, a != b); DefBinaryOp(GT, a > b); DefBinaryOp(LT, a < b); DefBinaryOp(GE, a >= b); DefBinaryOp(LE, a <= b);
DefBinaryOp(And, (float)((!!a) && (!!b))); DefBinaryOp(Or, (float)((!!a) || (!!b))); DefBinaryOp(Xor, (float)((!!a) ^ (!!b)));
DefBinaryOp(MaskNegative, b >= 0 ? a : 0);
DefBinaryOp(ElementwiseProductWithSigmoidDerivativeFromOutput, a * (b * (1 - b))); // b = output
DefBinaryOp(ElementwiseProductWithTanhDerivativeFromOutput, a * (1 - b * b));
DefBinaryOp(ElementwiseProductWithLinearRectifierDerivativeFromOutput, b > 0 ? a : 0);
DefBinaryOp(ElementwiseProductWithLogDerivativeFromOutput, a * exp_(-b));
DefBinaryOp(ElementwiseProductWithCosDerivative, a * -sin_(b)); // note: b = input for cos()
//DefBinaryOp(Index, IndexElement(a, b, i)); // note: this one uses the third argument
#pragma pop_macro("DefBinaryOp")
#pragma push_macro("DefTernaryOp")
#define DefTernaryOp(op, expr) template<class ElemType> DECL ElemType Op ## op(ElemType a, ElemType b, ElemType c) { return expr; }
DefTernaryOp(Cond, a ? b : c);
DefTernaryOp(Cond, a ? b : c); DefTernaryOp(Clip, a < b ? b : (a > c ? c : a));
#pragma pop_macro("DefTernaryOp")
}}}

Просмотреть файл

@ -223,6 +223,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
offsets[i] = shapes[i].GetOffset();
}
// enforce that in case of broadcasting, the output must not be an input
template<class ElemType>
static bool CheckDifferentObject(const TensorView<ElemType> & a, const TensorView<ElemType> & b)
{
if (&a == &b)
LogicError("Do{U,Bi,Ter}naryOpOf: When inverse broadcasting, output must not be an input.");
return true;
}
template<class ElemType>
void TensorView<ElemType>::DoUnaryOpOf(ElemType beta, const TensorView & a, ElemType alpha, ElementWiseOperator op)
{
@ -235,6 +244,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
SmallVector<size_t> regularOpDims, reducingOpDims;
PrepareTensorOperands<ElemType,2>(array<TensorShape, 2> { a.GetShape(), GetShape() }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
// output cannot be input when reducing
if (reducingOpDims.size() > 0)
CheckDifferentObject(a, *this);
// now perform the operation
GetSOB().TensorOp(beta, a.GetSOB(), alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
}
@ -250,6 +263,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
SmallVector<size_t> regularOpDims, reducingOpDims;
PrepareTensorOperands<ElemType, 3>(array<TensorShape, 3> { a.GetShape(), b.GetShape(), GetShape() }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
// output cannot be input when reducing
if (reducingOpDims.size() > 0)
CheckDifferentObject(a, *this) && CheckDifferentObject(b, *this);
GetSOB().TensorOp(beta, a.GetSOB(), b.GetSOB(), alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
}
@ -264,6 +281,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
SmallVector<size_t> regularOpDims, reducingOpDims;
PrepareTensorOperands<ElemType, 4>(array<TensorShape, 4> { a.GetShape(), b.GetShape(), c.GetShape(), GetShape() }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
// output cannot be input when reducing
if (reducingOpDims.size() > 0)
CheckDifferentObject(a, *this) && CheckDifferentObject(b, *this) && CheckDifferentObject(c, *this);
GetSOB().TensorOp(beta, a.GetSOB(), b.GetSOB(), c.GetSOB(), alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
}

Просмотреть файл

@ -10,7 +10,7 @@
#include "Basics.h"
#include "Matrix.h"
#include "DataTensor.h"
#include "TensorShape.h"
#pragma warning (push)
#pragma warning (disable: 4251) // needs to have dll-interface to be used by clients of... caused by TensorView::m_shape which is only private. We use the same compiler everywhere.
@ -48,7 +48,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// c.AssignDiffOf(c,a) means c -= a,
// and c.AddElementwiseProductOf(a, b, 1) means c += a .* b.
// All operators support elementwise in-place operations, i.e. a, b, and c
// may all reference the same underlying SOB.
// may all reference the same underlying SOB, with one exception:
// The output cannot be in-place and inverse-broadcasting at the same time.
// E.g. with c=[10] and a=[10 x 20], c.AssignDiffOf(c,a) will fail.
// In that case, you can use c.AddCopyOf(a,-1).
// Aliasing is not detected, so don't pass distinct TensorView objects that
// reference overlapping but not identical slices.
// If beta == 0, c is not read out, i.e. it can be uninitialized or contain NaNs.
// -------------------------------------------------------------------
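// Editor's usage sketch (hedged; construction of the views is omitted and the shapes are made up):
//     // c has shape [10], a has shape [10 x 20], so the op inverse-broadcasts (reduces) over dim 2
//     c.AssignDiffOf(c, a);     // fails: c is both the output and an input while reducing
//     c.AddCopyOf(a, -1.0f);    // OK: same effect (c -= a summed over dim 2), as suggested above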
@ -59,7 +64,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
void Add ## oper ## Of( const TensorView & a, ElemType alpha = 1.0f) { DoUnaryOpOf(1.0f, a, alpha, ElementWiseOperator::op ## oper); }
ForAllUnaryOps(DeclareUnaryTensorOp);
ForAllParameterizedUnaryOps(DeclareUnaryTensorOp);
#pragma pop_macro("DeclareUnaryTensorOp")
#pragma push_macro("DeclareBinaryTensorOp")
@ -82,12 +86,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
static void Test();
private:
void DoUnaryOpOf(ElemType beta, const TensorView & a, ElemType alpha, ElementWiseOperator op);
void DoBinaryOpOf(ElemType beta, const TensorView & a, const TensorView & b, ElemType alpha, ElementWiseOperator op);
void DoTernaryOpOf(ElemType beta, const TensorView & a, const TensorView & b, const TensorView & c, ElemType alpha, ElementWiseOperator op);
private:
// -------------------------------------------------------------------
// accessors
// -------------------------------------------------------------------

Просмотреть файл

@ -2593,6 +2593,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// the total number of epochs to run.
m_maxEpochs = configSGD(L"maxEpochs");
// Note: Momentum is best specified in an MB-size-agnostic fashion.
// Because per-sample momentum is a number very close to 1, it is more convenient to use a logarithmic specification.
// We use 'momentumAsTimeConstant' to specify the time constant of the low-pass filter that momentum really is.
// To convert a typical per-MB momentum value of 'm' used with a MB size of 'N', use momentumAsTimeConstant = -N/ln(m).
// For the common configuration of momentum 0.9 at MB size of 256, that is momentumAsTimeConstant = 2429.8.
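// Worked example (editor's addition): momentumPerMB = 0.9 at minibatch size 256 gives
//     momentumAsTimeConstant = -256 / ln(0.9) = -256 / (-0.10536) = 2429.8
// and the equivalent per-sample momentum is exp(-1/2429.8) = 0.9^(1/256) = 0.99959.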
floatargvector momentumPerMB = configSGD(L"momentumPerMB", ConfigRecordType::Array(floatargvector()));
floatargvector momentumPerSample = configSGD(L"momentumPerSample", ConfigRecordType::Array(floatargvector()));
floatargvector momentumAsTimeConstant = configSGD(L"momentumAsTimeConstant", ConfigRecordType::Array(floatargvector()));

Просмотреть файл

@ -156,7 +156,7 @@
<ClInclude Include="..\Common\Include\BestGpu.h" />
<ClInclude Include="..\Common\Include\Config.h" />
<ClInclude Include="..\Common\Include\DataReader.h" />
<ClInclude Include="..\Common\Include\DataTensor.h" />
<ClInclude Include="..\Common\Include\TensorShape.h" />
<ClInclude Include="..\Common\Include\DataWriter.h" />
<ClInclude Include="..\Common\Include\File.h" />
<ClInclude Include="..\Common\Include\fileutil.h" />

Просмотреть файл

@ -141,7 +141,7 @@
<ClInclude Include="..\Common\Include\Sequences.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="..\Common\Include\DataTensor.h">
<ClInclude Include="..\Common\Include\TensorShape.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="..\Common\Include\Config.h">
@ -195,4 +195,4 @@
<UniqueIdentifier>{ae1eea3c-d77f-46ec-bf4f-1cd093a295e8}</UniqueIdentifier>
</Filter>
</ItemGroup>
</Project>
</Project>

Просмотреть файл

@ -6,7 +6,7 @@ ndlMnistMacros = [
ImageH = 28
LabelDim = 10
features = ImageInput(ImageW, ImageH, 1, tag="feature")
features = ImageInput(ImageW, ImageH, 1, imageLayout="legacy", tag="feature")
featScale = Const(0.00390625)
featScaled = Scale(featScale, features)
labels = Input(LabelDim, tag="label")
@ -28,7 +28,7 @@ DNN=[
pool1H = 2
pool1hStride = 2
pool1vStride = 2
pool1 = MaxPooling(conv1_act, pool1W, pool1H, pool1hStride, pool1vStride)
pool1 = MaxPooling(conv1_act, pool1W, pool1H, pool1hStride, pool1vStride, imageLayout="legacy")
# conv2
kW2 = 5
@ -45,7 +45,7 @@ DNN=[
pool2H = 2
pool2hStride = 2
pool2vStride = 2
pool2 = AveragePooling(conv2_act, pool2W, pool2H, pool2hStride, pool2vStride)
pool2 = AveragePooling(conv2_act, pool2W, pool2H, pool2hStride, pool2vStride, imageLayout="legacy")
h1Dim = 128
# DNNSigmoidLayer and DNNLayer are defined in Macros.ndl

Просмотреть файл

@ -1,3 +1,4 @@
# Sigmoid non-linearity
DNNSigmoidLayer(inDim, outDim, x, parmScale) = [
W = Parameter(outDim, inDim, init="uniform", initValueScale=parmScale)
b = Parameter(outDim, 1, init="uniform", initValueScale=parmScale)
@ -6,6 +7,7 @@ DNNSigmoidLayer(inDim, outDim, x, parmScale) = [
y = Sigmoid(z)
]
# no non-linearity, as input for SoftMax
DNNLayer(inDim, outDim, x, parmScale) = [
W = Parameter(outDim, inDim, init="uniform", initValueScale=parmScale)
b = Parameter(outDim, 1, init="uniform", initValueScale=parmScale)
@ -13,10 +15,11 @@ DNNLayer(inDim, outDim, x, parmScale) = [
z = Plus(t, b)
]
# ReLU non-linearity
ConvReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue) = [
convW = Parameter(outMap, inWCount, init="uniform", initValueScale=wScale)
conv = Convolution(convW, inp, kW, kH, outMap, hStride, vStride, zeroPadding=false)
convB = Parameter(outMap, 1, init="fixedValue", value=bValue)
convB = ImageParameter(1, 1, outMap, imageLayout="legacy", init="fixedValue", value=bValue)
convPlusB = Plus(conv, convB);
act = RectifiedLinear(convPlusB);
]

Просмотреть файл

@ -1,7 +1,10 @@
#precision = "double"
precision = "float"
command = train:test
deviceId = $DeviceId$
useCuDnn = true # can be overridden by the command line
ndlMacros = "$ConfigDir$/Macros.ndl"
parallelTrain = false
@ -13,8 +16,94 @@ train = [
#deviceId = $DeviceId$
traceLevel = 1
NDLNetworkBuilder = [
networkDescription = "$ConfigDir$/Convolution.ndl"
#NDLNetworkBuilder = [
# networkDescription = "$ConfigDir$/Convolution.ndl"
#]
BrainScriptNetworkBuilder = [
useCuDnn = $useCuDnn$
// HACK to enforce the same evaluation order of LearnableParameters as for NDL, so as to get the same randomization
// Nodes are evaluated in sorting order.
A1 = conv1_act; A2 = conv2_act; A3 = h1 ; A5 = ol
// macros
ConvReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue) = [ // ReLU non-linearity
convW = Parameter(outMap, inWCount, init="uniform", initValueScale=wScale, initOnCPUOnly=false)
conv = Convolution(convW, inp, kW, kH, outMap, hStride, vStride, zeroPadding=false, imageLayout=if useCuDnn then "cudnn" else "legacy")
convB = if useCuDnn
then ParameterTensor((1 : 1 : outMap : 1/*col dim*/), init="fixedValue", value=bValue)
else Parameter(outMap, 1, init="fixedValue", value=bValue)
convPlusB = Plus(conv, convB);
out = RectifiedLinear(convPlusB);
]
DNNSigmoidLayer(inDim, outDim, x, parmScale) = [ // Sigmoid non-linearity
W = Parameter(outDim, inDim, init="uniform", initValueScale=parmScale, initOnCPUOnly=false)
b = Parameter(outDim, 1, init="uniform", initValueScale=parmScale, initOnCPUOnly=false)
t = Times(W, x)
z = Plus(t, b)
out = Sigmoid(z)
]
DNNLayer(inDim, outDim, x, parmScale) = [ //no non-linearity, as input for SoftMax
W = Parameter(outDim, inDim, init="uniform", initValueScale=parmScale, initOnCPUOnly=false)
b = Parameter(outDim, 1, init="uniform", initValueScale=parmScale, initOnCPUOnly=false)
t = Times(W, x)
out = Plus(t, b)
]
imageW = 28
imageH = 28
labelDim = 10
features = ImageInput(imageW, imageH, 1, imageLayout=if useCuDnn then "cudnn" else "legacy", tag="feature")
featScale = Constant(0.00390625)
featScaled = Scale(featScale, features)
labels = Input(labelDim, tag="label")
# conv1
kW1 = 5
kH1 = 5
cMap1 = 16
hStride1 = 1
vStride1 = 1
# weight[cMap1, kW1 * kH1 * inputChannels]
conv1_act = ConvReLULayer(featScaled, cMap1, 25, kW1, kH1, hStride1, vStride1, 10, 1).out
# pool1
pool1W = 2
pool1H = 2
pool1hStride = 2
pool1vStride = 2
pool1 = MaxPooling(conv1_act, pool1W, pool1H, pool1hStride, pool1vStride, imageLayout=if useCuDnn then "cudnn" else "legacy")
# conv2
kW2 = 5
kH2 = 5
cMap2 = 32
hStride2 = 1
vStride2 = 1
# weight[cMap2, kW2 * kH2 * cMap1]
# ConvReLULayer is defined in Macros.ndl
conv2_act = ConvReLULayer(pool1, cMap2, 400, kW2, kH2, hStride2, vStride2, 10, 1).out
# pool2
pool2W = 2
pool2H = 2
pool2hStride = 2
pool2vStride = 2
pool2 = AveragePooling(conv2_act, pool2W, pool2H, pool2hStride, pool2vStride, imageLayout=if useCuDnn then "cudnn" else "legacy")
h1Dim = 128
# DNNSigmoidLayer and DNNLayer are defined in Macros.ndl
h1 = DNNSigmoidLayer(512, h1Dim, pool2, 1).out
ol = DNNLayer(h1Dim, labelDim, h1, 1).out
ce = CrossEntropyWithSoftmax(labels, ol, tag="criterion")
err = ErrorPrediction(labels, ol, tag="eval")
outputNodes = ol
]
SGD = [

View file

@ -66,8 +66,8 @@ speechTrain = [
C(c) = DiagTimes(WeightParam(cellDim, 1), Stabilize(c)) // cell-to-hidden
// LSTM cell
dh = PastValue(outputDim, 1, output); // hidden state(t-1)
dc = PastValue(cellDim, 1, ct); // cell(t-1)
dh = PastValue(outputDim, output); // hidden state(t-1)
dc = PastValue(cellDim, ct); // cell(t-1)
// note: the W(inputx) here are all different, they all come with their own set of weights; same for H(dh), C(dc), and B()
it = Sigmoid(W(inputx) + B() + H(dh) + C(dc)) // input gate(t)
@ -95,8 +95,8 @@ speechTrain = [
numLSTMs = 3 // number of hidden LSTM model layers
// features
features = Input(featDim, 1, tag='feature')
labels = Input(labelDim, 1, tag='label')
features = Input(featDim, tag='feature')
labels = Input(labelDim, tag='label')
feashift = RowSlice(featDim - baseFeatDim, baseFeatDim, features); # shift 5 frames right (x_{t+5} -> x_{t} ) // TODO why 5? Where do I see this?
featNorm = MeanVarNorm(feashift)

View file

@ -74,8 +74,8 @@ speechTrain = new TrainAction [
C(c) = DiagTimes(WeightParam(cellDim, 1), Stabilize(c)) // cell-to-hidden
// LSTM cell
dh = PastValue(outputDim, 1, output); // hidden state(t-1)
dc = PastValue(cellDim, 1, ct); // cell(t-1)
dh = PastValue(outputDim, output); // hidden state(t-1)
dc = PastValue(cellDim, ct); // cell(t-1)
// note: the W(inputx) here are all different, they all come with their own set of weights; same for H(dh), C(dc), and B()
it = Sigmoid(W(inputx) + B() + H(dh) + C(dc)) // input gate(t)

View file

@ -27,6 +27,8 @@ Using parallel sequences (difference to above: nbruttsineachrecurrentiter=4). No
COMMAND: currentDirectory=$(SolutionDir)Tests\EndToEndTests\Speech\Data configFile=$(SolutionDir)Tests\EndToEndTests\Speech\LSTM\cntk.config stderr=$(SolutionDir)Tests\EndToEndTests\Speech\RunDir\LSTM\FullUtterance\models\cntkSpeech.dnn.log RunDir=$(SolutionDir)Tests\EndToEndTests\Speech\RunDir\LSTM\FullUtterance NdlDir=$(SolutionDir)Tests\EndToEndTests\Speech\LSTM DataDir=. DeviceId=auto Truncated=false speechTrain=[reader=[nbruttsineachrecurrentiter=4]] speechTrain=[SGD=[epochSize=2560]] speechTrain=[SGD=[learningRatesPerMB=0.125]] speechTrain=[SGD=[maxEpochs=2]] speechTrain=[SGD=[numMBsToShowResult=1]] makeMode=false
Linux: bin/cntk currentDirectory=Tests/EndToEndTests/Speech/Data configFile=../LSTM/cntk.config stderr=../RunDir/LSTM/Truncated/models/cntkSpeech.dnn.log RunDir=../RunDir/LSTM/Truncated NdlDir=../LSTM DataDir=. DeviceId=auto Truncated=false 'speechTrain=[reader=[nbruttsineachrecurrentiter=4]]' 'speechTrain=[SGD=[epochSize=2560]]' 'speechTrain=[SGD=[learningRatesPerMB=0.125]]' 'speechTrain=[SGD=[maxEpochs=2]]' 'speechTrain=[SGD=[numMBsToShowResult=1]]' makeMode=false
Using full BrainScript configuration
COMMAND: --cd $(SolutionDir)Tests\EndToEndTests\Speech\Data -f $(SolutionDir)Tests\EndToEndTests\Speech\LSTM\lstm.bs -D stderr='$(SolutionDir)Tests\EndToEndTests\Speech\RunDir\LSTM\FullUtterance\models\cntkSpeech.dnn.log' -D RunDir='$(SolutionDir)Tests\EndToEndTests\Speech\RunDir\LSTM\FullUtterance' -D NdlDir='$(SolutionDir)Tests\EndToEndTests\Speech\LSTM' -D DataDir='.' -D DeviceId='Auto' -D Truncated=false -D speechTrain=[reader=[nbruttsineachrecurrentiter=1];SGD=[epochSize=2560;maxEpochs=2;numMBsToShowResult=1]] -D makeMode=false
@ -46,7 +48,7 @@ COMMAND: currentDirectory=$(SolutionDir)ExampleSetups\Image\MNIST configFil
--- Image/QuickE2E:
COMMAND: configFile=$(SolutionDir)Tests\EndToEndTests\Image\QuickE2E\cntk.config RunDir=$(SolutionDir)Tests\EndToEndTests\Image\_run DataDir=$(SolutionDir)Tests\EndToEndTests\Image\Data ConfigDir=$(SolutionDir)Tests\EndToEndTests\Image\QuickE2E stderr=$(SolutionDir)Tests\EndToEndTests\RunDir\Image\QuickE2E\models\cntkImage.dnn.log DeviceId=0 makeMode=false
COMMAND: configFile=$(SolutionDir)Tests\EndToEndTests\Image\QuickE2E\cntk.config RunDir=$(SolutionDir)Tests\EndToEndTests\Image\_run DataDir=$(SolutionDir)Tests\EndToEndTests\Image\Data ConfigDir=$(SolutionDir)Tests\EndToEndTests\Image\QuickE2E stderr=$(SolutionDir)Tests\EndToEndTests\RunDir\Image\QuickE2E\models\cntkImage.dnn.log DeviceId=0 useCuDnn=false makeMode=false
Simple test
-----------

View file

@ -24,14 +24,18 @@ namespace Microsoft { namespace MSR { namespace CNTK { namespace Test
static bool IsCuDnnSupported()
{
fprintf(stderr, "ConvolutionEngineTests.cpp %d\n", __LINE__);
try
{
return ConvFact::Create(0, ConvFact::EngineType::CuDnn) != nullptr;
// TODO: Will this ever return nullptr?
return ConvFact::Create(0, ConvFact::EngineType::CuDnn, ImageLayoutKind::CHW) != nullptr;
}
catch (std::runtime_error)
{
fprintf(stderr, "ConvolutionEngineTests.cpp %d\n", __LINE__);
return false;
}
fprintf(stderr, "ConvolutionEngineTests.cpp %d\n", __LINE__);
}
BOOST_AUTO_TEST_SUITE(ConvolutionSuite)
@ -55,7 +59,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { namespace Test
for (int deviceId : { 0 })
{
auto fact = ConvFact::Create(deviceId);
// BUGBUG: These will fail depending on whether we built with cuDNN or not. Without cuDNN we should use HWC
auto fact = ConvFact::Create(deviceId, ConvFact::EngineType::Auto, ImageLayoutKind::CHW);
auto tt = typeid(fact).name();
UNUSED(tt);
auto eng = fact->CreateConvEngine(deviceId, 0);
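A hypothetical standalone sketch (not part of this commit) of the layout choice described in the BUGBUG note above: cuDNN expects CHW, while the legacy non-cuDNN path uses HWC. In CNTK the enum is ImageLayoutKind from the Math library; it is redeclared here only so the sketch compiles on its own, and ChooseLayout is an assumed helper name.

// Sketch: pick the image layout to pass to the convolution engine factory.
#include <cstdio>

enum class ImageLayoutKind { CHW, HWC }; // stand-in for the CNTK enum

// GPU + cuDNN build -> CHW; otherwise fall back to the legacy HWC layout.
static ImageLayoutKind ChooseLayout(int deviceId, bool builtWithCuDnn)
{
    return (deviceId >= 0 && builtWithCuDnn) ? ImageLayoutKind::CHW : ImageLayoutKind::HWC;
}

int main()
{
    std::printf("%s\n", ChooseLayout(0, true) == ImageLayoutKind::CHW ? "CHW" : "HWC");
    return 0;
}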
@ -128,14 +133,22 @@ namespace Microsoft { namespace MSR { namespace CNTK { namespace Test
for (int deviceId : { -1, 0 })
{
auto fact = ConvFact::Create(deviceId);
fprintf(stderr, "ConvolutionEngineTests.cpp %d\n", __LINE__);
auto fact = ConvFact::Create(deviceId, ConvFact::EngineType::Auto, deviceId >= 0 ? ImageLayoutKind::CHW : ImageLayoutKind::HWC);
fprintf(stderr, "ConvolutionEngineTests.cpp %d\n", __LINE__);
auto eng = fact->CreateConvEngine(deviceId, 0);
fprintf(stderr, "ConvolutionEngineTests.cpp %d\n", __LINE__);
auto inT = fact->CreateTensor(inW, inH, cmapIn, n);
fprintf(stderr, "ConvolutionEngineTests.cpp %d\n", __LINE__);
auto filtT = fact->CreateFilter(kW, kH, cmapIn, cmapOut);
fprintf(stderr, "ConvolutionEngineTests.cpp %d\n", __LINE__);
auto outT = fact->CreateTensor(outW, outH, cmapOut, n);
fprintf(stderr, "ConvolutionEngineTests.cpp %d\n", __LINE__);
auto convT = fact->CreateConvDescriptor(*inT, *filtT, sW, sH, pad);
fprintf(stderr, "ConvolutionEngineTests.cpp %d\n", __LINE__);
// Input in NCHW format.
fprintf(stderr, "ConvolutionEngineTests.cpp %d\n", __LINE__);
SingleMatrix in(inW * inH * cmapIn, n, vec(inW * inH * cmapIn * n, 1.0f).data(), matrixFlagNormal, deviceId);
// Create cmapOut filters, each kW x kH x cmapIn (NCHW format).
SingleMatrix filt(cmapOut, kW * kH * cmapIn, vec(kW * kH * cmapIn * cmapOut, 1.0f).data(), matrixFlagNormal, deviceId);
@ -143,7 +156,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { namespace Test
SingleMatrix out(outW * outH * cmapOut, n, deviceId);
SingleMatrix temp(deviceId);
fprintf(stderr, "ConvolutionEngineTests.cpp %d\n", __LINE__);
eng->Forward(*inT, in, *filtT, filt, *convT, *outT, out, temp);
fprintf(stderr, "ConvolutionEngineTests.cpp %d\n", __LINE__);
// Output is in NCHW format.
float expBuf[] = {
@ -175,7 +190,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { namespace Test
for (int deviceId : { 0 })
{
auto fact = ConvFact::Create(deviceId);
auto fact = ConvFact::Create(deviceId, ConvFact::EngineType::Auto, ImageLayoutKind::CHW);
auto eng = fact->CreateConvEngine(deviceId, 0);
auto srcGradT = fact->CreateTensor(outW, outH, cmapOut, n);
auto filtT = fact->CreateFilter(kW, kH, cmapIn, cmapOut);
@ -231,7 +246,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { namespace Test
for (int deviceId : { 0 })
{
auto fact = ConvFact::Create(deviceId);
auto fact = ConvFact::Create(deviceId, ConvFact::EngineType::Auto, ImageLayoutKind::CHW);
auto eng = fact->CreateConvEngine(deviceId, 0);
auto srcGradT = fact->CreateTensor(outW, outH, cmapOut, n);
auto filtT = fact->CreateFilter(kW, kH, cmapIn, cmapOut);
@ -296,7 +311,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { namespace Test
for (int deviceId : { 0 })
{
auto fact = ConvFact::Create(deviceId);
auto fact = ConvFact::Create(deviceId, ConvFact::EngineType::Auto, ImageLayoutKind::CHW);
auto eng = fact->CreatePoolEngine(deviceId);
auto inT = fact->CreateTensor(inW, inH, cmap, n);
auto outT = fact->CreateTensor(outW, outH, cmap, n);
@ -346,7 +361,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { namespace Test
for (int deviceId : { 0 })
{
auto fact = ConvFact::Create(deviceId);
auto fact = ConvFact::Create(deviceId, ConvFact::EngineType::Auto, ImageLayoutKind::CHW);
auto eng = fact->CreatePoolEngine(deviceId);
auto inT = fact->CreateTensor(inW, inH, cmap, n);
auto outT = fact->CreateTensor(outW, outH, cmap, n);
@ -406,7 +421,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { namespace Test
for (int deviceId : { 0 })
{
auto fact = ConvFact::Create(deviceId);
auto fact = ConvFact::Create(deviceId, ConvFact::EngineType::Auto, ImageLayoutKind::CHW);
auto eng = fact->CreatePoolEngine(deviceId);
auto inT = fact->CreateTensor(inW, inH, cmap, n);
auto outT = fact->CreateTensor(outW, outH, cmap, n);
@ -456,7 +471,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { namespace Test
for (int deviceId : { 0 })
{
auto fact = ConvFact::Create(deviceId);
auto fact = ConvFact::Create(deviceId, ConvFact::EngineType::Auto, ImageLayoutKind::CHW);
auto eng = fact->CreatePoolEngine(deviceId);
auto inT = fact->CreateTensor(inW, inH, cmap, n);
auto outT = fact->CreateTensor(outW, outH, cmap, n);

View file

@ -535,6 +535,19 @@ namespace Microsoft
BOOST_CHECK(m1.IsEqualTo(m2));
}
#if 0 // Temporarily disabling
BOOST_FIXTURE_TEST_CASE(GPUMatrixLargeInequality, RandomSeedFixture)
{
const int rows = 33553921;
const int cols = 1;
auto m0 = GPUMatrix<float>::Zeros(rows, cols, c_deviceIdZero);
auto m1 = GPUMatrix<float>::Ones(rows, cols, c_deviceIdZero);
BOOST_CHECK(!m1.IsEqualTo(m0, c_epsilonFloatE5));
}
#endif
BOOST_AUTO_TEST_SUITE_END()
}
}

View file

@ -493,34 +493,22 @@ BOOST_FIXTURE_TEST_CASE(GPUSSparseMatrix1DConvolutionRandomInit, RandomSeedFixtu
}
}
#if 0 // Temporarily disabling
BOOST_FIXTURE_TEST_CASE(GPUSSparseMatrixLargeIsEqual, RandomSeedFixture)
{
const int rows = 33553921;
const int cols = 1;
Matrix<float> m0 = Matrix<float>::Zeros(rows, cols, c_deviceIdZero);
Matrix<float> m1 = Matrix<float>::Ones(rows, cols, c_deviceIdZero);
BOOST_CHECK(!m1.IsEqualTo(m0, c_epsilonFloatE5));
}
BOOST_FIXTURE_TEST_CASE(GPUSSparseMatrix1DConvolutionBackprop, RandomSeedFixture)
{
const int inChannels = 2;// 50;
const int inWidth = 4;// 10;
const int inChannels = 50;
const int inWidth = 10;
const int inHeight = 1;
const int batchSize = 3;// 20;
const int kernelWidth = 2;// 3;
const int batchSize = 20;
const int kernelWidth = 3;
const int kernelHeight = inHeight;
const int horizontalSubsample = 1;
const int verticalSubsample = 1;
const bool zeroPadding = false;
const int outChannels = 2;// 3;
const int outWidth = zeroPadding ? inWidth : (inWidth >= kernelWidth ? 1 + (inWidth - kernelWidth) / horizontalSubsample : 0);
const int outChannels = 3;
const int outWidth = zeroPadding ? (inWidth / horizontalSubsample) : (inWidth >= kernelWidth ? 1 + (inWidth - kernelWidth) / horizontalSubsample : 0);
const int outHeight = inHeight;
const float randomInitLowerBound = 1.0f;
const float randomInitUpperBound = 5.0f;
const float randomInitLowerBound = -1.0f;
const float randomInitUpperBound = 1.0f;
Matrix<float> outputGradientSubBatch = Matrix<float>::RandomUniform(outChannels, batchSize*outWidth, randomInitLowerBound, randomInitUpperBound, IncrementCounter(), c_deviceIdZero);
Matrix<float> inputSubBatch = Matrix<float>::RandomUniform(inChannels*inWidth, batchSize, randomInitLowerBound, randomInitUpperBound, IncrementCounter(), c_deviceIdZero);
Matrix<float> tempMatrix(1, 1, c_deviceIdZero);
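A minimal standalone check (not taken from this commit) of the output-width arithmetic used just above, with the test's updated values (inWidth = 10, kernelWidth = 3, stride 1, no padding), giving outWidth = 1 + (10 - 3) / 1 = 8. Variable names mirror the test, but the program itself is illustrative only.

// Sketch: 1-D convolution output width, valid (no-padding) case vs. padded case.
#include <cstdio>

int main()
{
    const int  inWidth = 10, kernelWidth = 3, horizontalSubsample = 1;
    const bool zeroPadding = false;
    const int  outWidth = zeroPadding
        ? (inWidth / horizontalSubsample)
        : (inWidth >= kernelWidth ? 1 + (inWidth - kernelWidth) / horizontalSubsample : 0);
    std::printf("outWidth = %d\n", outWidth); // prints 8
    return 0;
}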
@ -550,30 +538,8 @@ BOOST_FIXTURE_TEST_CASE(GPUSSparseMatrix1DConvolutionBackprop, RandomSeedFixture
Matrix<float>::ConvolveAndWeightedAdd(1, outputGradientSubBatchReordered, true, inputSubBatchSparseReordered, false, 1, inputGradientValues2, batchSize, horizontalSubsample, zeroPadding, false);
inputGradientValues2.Reshape(outChannels, inChannels*kernelWidth);
const int dim = outChannels*inChannels*kernelWidth;
float* base = inputGradientValues1.CopyToArray();
float baseA[dim];
fprintf(stderr, "[BASE]");
for (int i = 0; i < dim; i++)
{
baseA[i] = base[i];
fprintf(stderr, "%f ", baseA[i]);
}
fprintf(stderr, "\n");
float* exp = inputGradientValues2.CopyToArray();
float expA[dim];
fprintf(stderr, "[EXP]");
for (int i = 0; i < dim; i++)
{
expA[i] = exp[i];
fprintf(stderr, "%f ", expA[i]);
}
fprintf(stderr, "\n");
BOOST_CHECK(inputGradientValues2.IsEqualTo(inputGradientValues1, c_epsilonFloatE5));
BOOST_CHECK(inputGradientValues2.IsEqualTo(inputGradientValues1, c_epsilonFloatE2));
}
#endif
BOOST_FIXTURE_TEST_CASE(GPUSSparseMatrixReshape, RandomSeedFixture)
{
@ -595,10 +561,10 @@ BOOST_FIXTURE_TEST_CASE(GPUSSparseMatrixReshape, RandomSeedFixture)
BOOST_CHECK(denseMatrixC.IsEqualTo(denseMatrixB, c_epsilonFloatE5));
BOOST_CHECK(!denseMatrixC.IsEqualTo(denseMatrixA, c_epsilonFloatE5));
}
#if 0
BOOST_FIXTURE_TEST_CASE(GPUSSparseTensorShuffleScaleAndAdd, RandomSeedFixture)
{
size_t D = 10, S = 10, M = 10, K = 10, T = 10;
size_t D = 13, S = 11, M = 7, K = 15, T = 8;
GPUMatrix<float> denseMatrixA = GPUMatrix<float>::RandomUniform(D * S * M * K, T, c_deviceIdZero, -1, 1, IncrementCounter());
GPUMatrix<float> denseMatrixB(D*S*M*K, T, c_deviceIdZero);
GPUMatrix<float> denseMatrixC(D*S*M*K, T, c_deviceIdZero);
@ -612,7 +578,7 @@ BOOST_FIXTURE_TEST_CASE(GPUSSparseTensorShuffleScaleAndAdd, RandomSeedFixture)
BOOST_CHECK(denseMatrixC.IsEqualTo(denseMatrixB, c_epsilonFloatE5));
}
#endif
BOOST_AUTO_TEST_SUITE_END()
} } } }