commit 6c2ee1aa51

Merge branch 'master' into qiwye/multiverso

Conflicts:
	Source/ComputationNetworkLib/ComputationNetworkLib.vcxproj
	Source/SGDLib/SGDLib.vcxproj
	Source/SGDLib/SGDLib.vcxproj.filters
Makefile
@@ -240,6 +240,7 @@ MATH_SRC =\
 ifdef CUDA_PATH
 MATH_SRC +=\
 	$(SOURCEDIR)/Math/GPUMatrix.cu \
+	$(SOURCEDIR)/Math/GPUTensor.cu \
 	$(SOURCEDIR)/Math/GPUSparseMatrix.cu \
 	$(SOURCEDIR)/Math/GPUWatcher.cu \
 	$(SOURCEDIR)/Math/MatrixQuantizerGPU.cu \
@@ -35,27 +35,32 @@ using namespace std;
 ;

 wstring computationNodes = // TODO: use actual TypeName() here? would first need to make it a wide string; we should also extract those two methods into the base macro
-L"LearnableParameter(rows, cols, needGradient = true, init = 'uniform'/*|fixedValue|gaussian|fromFile*/, initValueScale = 1, value = 0, initFromFilePath = '', initOnCPUOnly=true, randomSeed=-1, tag='') = new ComputationNode [ operation = 'LearnableParameter' /*plus the function args*/ ]\n"
+L"LearnableParameter(rows, cols, needGradient = true, init = 'uniform'/*|fixedValue|gaussian|fromFile*/, initValueScale = 1, value = 0, initFromFilePath = '', initOnCPUOnly=true, randomSeed=-1, tag='') = new ComputationNode [ operation = 'LearnableParameter' ; shape = new TensorShape [ dims = (rows : cols) ] /*plus the function args*/ ]\n"
 L"Parameter = LearnableParameter // deprecated \n"
+L"ParameterTensor(dims, needGradient = true, init = 'uniform'/*|fixedValue|gaussian|fromFile*/, initValueScale = 1, value = 0, initFromFilePath = '', initOnCPUOnly=true, randomSeed=-1, tag='') = new ComputationNode [ operation = 'LearnableParameter' ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ]\n"
+// ^^ already works; vv untested
-L"Input(rows, cols, tag='feature') = new ComputationNode [ operation = 'InputValue' ; isImage = false /*plus the function args*/ ]\n" // note: naming a little inconsistent // TODO: re-test after flag change
-L"SparseInput(rows, cols, tag='feature') = new ComputationNode [ operation = 'SparseInputValue' ; isImage = false /*plus the function args*/ ]\n"
-L"ImageInput(imageWidth, imageHeight, imageChannels, numImages, tag='feature') = new ComputationNode [ operation = 'InputValue' ; isImage = true /*plus the function args*/ ]\n"
-L"SparseImageInput(imageWidth, imageHeight, imageChannels, numImages, tag='feature') = new ComputationNode [ operation = 'SparseInputValue' ; isImage = true /*plus the function args*/ ]\n"
+L"Input(dims, tag='feature') = new ComputationNode [ operation = 'InputValue' ; shape = new TensorShape [ /*dims*/ ] ; isImage = false /*plus the function args*/ ]\n" // note: naming a little inconsistent // TODO: re-test after flag change
+L"SparseInput(dims, tag='feature') = new ComputationNode [ operation = 'SparseInputValue' ; shape = new TensorShape [ /*dims*/ ] ; isImage = false /*plus the function args*/ ]\n"
+L"ImageInput(imageWidth, imageHeight, imageChannels, imageLayout='CHW', tag='feature') = new ComputationNode [ operation = 'InputValue' ; isImage = true /*plus the function args*/ ]\n"
+L"SparseImageInput(imageWidth, imageHeight, imageChannels, imageLayout='CHW', tag='feature') = new ComputationNode [ operation = 'SparseInputValue' ; isImage = true /*plus the function args*/ ]\n"
 L"Constant(val, rows = 1, cols = 1, tag='') = Parameter(rows, cols, needGradient = false, init = 'fixedValue', value = val) \n"
-L"PastValue(rows, cols, input, timeStep = 1, defaultHiddenActivation = 0.1, tag='') = new ComputationNode [ operation = 'PastValue' ; inputs = input /*plus the function args*/ ]\n"
-L"FutureValue(rows, cols, input, timeStep = 1, defaultHiddenActivation = 0.1, tag='') = new ComputationNode [ operation = 'FutureValue' ; inputs = input /*plus the function args*/ ]\n"
+L"PastValue(dims, input, timeStep = 1, defaultHiddenActivation = 0.1, tag='') = new ComputationNode [ operation = 'PastValue' ; inputs = input ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ]\n"
+L"FutureValue(dims, input, timeStep = 1, defaultHiddenActivation = 0.1, tag='') = new ComputationNode [ operation = 'FutureValue' ; inputs = input ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ]\n"
+// TODO: ^^ DelayedValues no longer need to know their dimension. That is inferred in Validation.
 L"RowSlice(startIndex, numRows, input, needGradient = false, tag='') = new ComputationNode [ operation = 'RowSlice' ; inputs = input /*plus the function args*/ ]\n"
 L"RowRepeat(input, numRepeats, needGradient = false, tag='') = new ComputationNode [ operation = 'RowRepeat' ; inputs = input /*plus the function args*/ ]\n"
 L"RowStack(inputs, tag='') = new ComputationNode [ operation = 'RowStack' /*plus the function args*/ ]\n"
-L"Reshape(input, numRows, imageWidth = 0, imageHeight = 0, imageChannels = 0, tag='') = new ComputationNode [ operation = 'Reshape' ; inputs = input /*plus the function args*/ ]\n"
+L"Reshape(input, numRows, imageWidth = 0, imageHeight = 0, imageChannels = 0, tag='') = new ComputationNode [ operation = 'DeprecatedReshape' ; inputs = input /*plus the function args*/ ]\n"
+L"NewReshape(input, dims, beginDim=0, endDim=0, tag='') = new ComputationNode [ operation = 'Reshape' ; inputs = input ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ]\n"
+L"ReshapeDimension(x, dim, tensorShape) = NewReshape(x, tensorShape, beginDim=dim, endDim=dim + 1) \n"
+L"FlattenDimensions(x, dim, num) = NewReshape(x, 0, beginDim=dim, endDim=dim + num) \n"
+L"SplitDimension(x, dim, N) = ReshapeDimension(x, dim, 0:N) \n"
 L"Logistic(label, probability, tag='') = new ComputationNode [ operation = 'Logistic' ; inputs = (label : probability) /*plus the function args*/ ]\n"
 L"WeightedLogistic(label, probability, instanceWeight, tag='') = new ComputationNode [ operation = 'Logistic' ; inputs = (label : probability : instanceWeight) /*plus the function args*/ ]\n"
 L"ReconcileMBLayout(dataInput, layoutInput, tag='') = new ComputationNode [ operation = 'ReconcileMBLayout' ; inputs = (dataInput : layoutInput) /*plus the function args*/ ]\n"
-L"Convolution(weightNode, inputValueNode, kernelWidth, kernelHeight, outputChannels, horizontalSubsample, verticalSubsample, zeroPadding = false, maxTempMemSizeInSamples = 0, tag='') = new ComputationNode [ operation = 'Convolution' ; inputs = (weightNode : inputValueNode) /*plus the function args*/ ]\n"
-L"MaxPooling(input, windowWidth, windowHeight, horizontalSubsample, verticalSubsample, tag='') = new ComputationNode [ operation = 'MaxPooling' ; inputs = input /*plus the function args*/ ]\n"
-L"AveragePooling(input, windowWidth, windowHeight, horizontalSubsample, verticalSubsample, tag='') = new ComputationNode [ operation = 'AveragePoolingNode' ; inputs = input /*plus the function args*/ ]\n"
+L"Convolution(weightNode, inputValueNode, kernelWidth, kernelHeight, outputChannels, horizontalSubsample, verticalSubsample, zeroPadding = false, maxTempMemSizeInSamples = 0, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'Convolution' ; inputs = (weightNode : inputValueNode) /*plus the function args*/ ]\n"
+L"MaxPooling(input, windowWidth, windowHeight, horizontalSubsample, verticalSubsample, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'MaxPooling' ; inputs = input /*plus the function args*/ ]\n"
+L"AveragePooling(input, windowWidth, windowHeight, horizontalSubsample, verticalSubsample, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'AveragePooling' ; inputs = input /*plus the function args*/ ]\n"
 // TODO: define DelayedValue, with negative delay for future; cannot do this yet, need to be able to say something like delay = -(^.delay)
 // aliases
 L"ColumnwiseCrossProduct = KhatriRaoProduct // deprecated \n" // TODO: should it be deprecated? It is described as easier to understand in the CNTKBook.
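For illustration, a hedged BrainScript sketch of what the dims-based definitions above make expressible; the dimensions and variable names here are invented for the example, not taken from the commit:

    # sketch only: uses the functions declared in the hunk above
    W = ParameterTensor(128:256, init = 'gaussian')   # tensor-shaped learnable parameter
    x = Input(256)                                    # input declared by dims instead of rows/cols
    h = PastValue(256, x, timeStep = 1)               # delay node; shape now carried by TensorShape
    z = NewReshape(x, 16:16)                          # reshape to an arbitrary tensor shape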
@@ -903,12 +903,12 @@ void DoTrain(const ConfigRecordType & config)
 };
 }
 // legacy test mode for BrainScript. Will go away once we fully integrate with BS.
-else if (config.Exists(L"ExperimentalNetworkBuilder"))
+else if (config.Exists(L"BrainScriptNetworkBuilder") || config.Exists(L"ExperimentalNetworkBuilder"/*legacy*/))
 {
 // We interface with outer old CNTK config by taking the inner part, which we get as a string, as BrainScript.
 // We prepend a few standard definitions, and also definition of deviceId and precision, which all objects will pull out again when they are being constructed.
 // BUGBUG: We are not getting TextLocations right in this way! Do we need to inject location markers into the source? Moot once we fully switch to BS
-wstring sourceCode = config(L"ExperimentalNetworkBuilder");
+wstring sourceCode = config.Exists(L"BrainScriptNetworkBuilder") ? config(L"BrainScriptNetworkBuilder") : config(L"ExperimentalNetworkBuilder");
 let expr = BS::ParseConfigDictFromString(standardFunctions + computationNodes + commonMacros
 + msra::strfun::wstrprintf(L"deviceId = %d ; precision = '%ls' ; network = new ComputationNetwork ", (int)deviceId, ElemTypeName<ElemType>()) // TODO: check if typeid needs postprocessing
 + sourceCode, vector<wstring>()); // source code has the form [ ... ]
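For context, a hedged sketch of the outer CNTK config that this change now accepts; the key name comes from the hunk above, while the body is a placeholder rather than a working network:

    # sketch only: the new key is preferred; ExperimentalNetworkBuilder remains as a legacy alias
    BrainScriptNetworkBuilder = [
        # BrainScript source; DoTrain prepends deviceId, precision, and 'network = new ComputationNetwork'
        ...
    ]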
@@ -158,7 +158,7 @@
 <ClInclude Include="..\Common\Include\Basics.h" />
 <ClInclude Include="..\Common\Include\BestGpu.h" />
 <ClInclude Include="..\Common\Include\DataReader.h" />
-<ClInclude Include="..\Common\Include\DataTensor.h" />
+<ClInclude Include="..\Common\Include\TensorShape.h" />
 <ClInclude Include="..\Common\Include\DataWriter.h" />
 <ClInclude Include="..\Common\Include\File.h" />
 <ClInclude Include="..\Common\Include\fileutil.h" />
@@ -133,7 +133,7 @@
 <ClInclude Include="..\Common\Include\Sequences.h">
   <Filter>Common\Include</Filter>
 </ClInclude>
-<ClInclude Include="..\Common\Include\DataTensor.h">
+<ClInclude Include="..\Common\Include\TensorShape.h">
   <Filter>Common\Include</Filter>
 </ClInclude>
 <ClInclude Include="..\Common\Include\ProgressTracing.h">
@@ -154,6 +154,8 @@ bool CheckFunction(std::string& p_nodeType, bool* allowUndeterminedVariable)
     ret = true;
 else if (EqualInsensitive(nodeType, OperationNameOf(LearnableParameter), L"Parameter"))
     ret = true;
+else if (EqualInsensitive(nodeType, L"ImageParameter"))
+    ret = true;
 //else if (EqualInsensitive(nodeType, OperationNameOf(SparseLearnableParameter), L"SparseParameter"))
 //    ret = true;
 else if (EqualInsensitive(nodeType, L"Constant", L"Const"))
@@ -30,29 +30,29 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 case SIMPLENET:
     net = BuildSimpleDNN(); break;
 case SIMPLERNN:
-    net = BuildSimpleRNN(1); break;
+    net = BuildSimpleRNN(); break;
 case LSTM:
-    net = BuildLSTMNetworkFromDescription(1); break;
+    net = BuildLSTMNetworkFromDescription(); break;
 case CLASSLSTM:
-    net = BuildCLASSLSTMNetworkFromDescription(1); break;
+    net = BuildCLASSLSTMNetworkFromDescription(); break;
 case NCELSTM:
-    net = BuildNCELSTMNetworkFromDescription(1); break;
+    net = BuildNCELSTMNetworkFromDescription(); break;
 case CLASSLM:
-    net = BuildClassEntropyNetwork(1); break;
+    net = BuildClassEntropyNetwork(); break;
 case LBLM:
-    net = BuildLogBilinearNetworkFromDescription(1); break;
+    net = BuildLogBilinearNetworkFromDescription(); break;
 case NPLM:
-    net = BuildNeuralProbNetworkFromDescription(1); break;
+    net = BuildNeuralProbNetworkFromDescription(); break;
 case CLSTM:
-    net = BuildConditionalLSTMNetworkFromDescription(1); break;
+    net = BuildConditionalLSTMNetworkFromDescription(); break;
 case RCRF:
-    net = BuildSeqTrnLSTMNetworkFromDescription(1); break;
+    net = BuildSeqTrnLSTMNetworkFromDescription(); break;
 case LSTMENCODER:
-    net = BuildLSTMEncoderNetworkFromDescription(1); break;
+    net = BuildLSTMEncoderNetworkFromDescription(); break;
 case UNIDIRECTIONALLSTM:
-    net = BuildUnidirectionalLSTMNetworksFromDescription(1); break;
+    net = BuildUnidirectionalLSTMNetworksFromDescription(); break;
 case BIDIRECTIONALLSTM:
-    net = BuildBiDirectionalLSTMNetworksFromDescription(1); break;
+    net = BuildBiDirectionalLSTMNetworksFromDescription(); break;
 default:
     LogicError("BuildNetworkFromDescription: invalid m_rnnType %d", (int)m_rnnType);
 }
@@ -75,11 +75,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 switch (m_rnnType)
 {
 case ALIGNMENTSIMILARITYGENERATOR:
-    net = BuildAlignmentDecoderNetworkFromDescription(encoderNet, 1);
+    net = BuildAlignmentDecoderNetworkFromDescription(encoderNet);
     net->CompileNetwork();
     return net;
 case ALIGNMENTSIMILARITYGFORWARDDECODER:
-    net = BuildAlignmentForwardDecoderNetworkFromDescription(encoderNet, 1);
+    net = BuildAlignmentForwardDecoderNetworkFromDescription(encoderNet);
     net->CompileNetwork();
     return net;
 }
@@ -95,12 +95,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 {
 unsigned long randomSeed = 1;

-size_t mbSize = 3; //this is not the actual minibatch size. only used in the validataion process
-
 size_t numHiddenLayers = m_layerSizes.size() - 2;
 ComputationNodePtr input, w, b, output, label, prior, scaledLogLikelihood;

-input = builder.Input(m_layerSizes[0], mbSize, L"features");
+input = builder.CreateInputNode(L"features", m_layerSizes[0]);
 m_net->FeatureNodes().push_back(input);

 if (m_applyMeanVarNorm)
@@ -114,9 +112,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {

 if (numHiddenLayers > 0)
 {
-    w = builder.Parameter(m_layerSizes[1], m_layerSizes[0], L"W0");
+    w = builder.CreateLearnableParameter(L"W0", m_layerSizes[1], m_layerSizes[0]);
     m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
-    b = builder.Parameter(m_layerSizes[1], 1, L"B0");
+    b = builder.CreateLearnableParameter(L"B0", m_layerSizes[1], 1);
     output = ApplyNonlinearFunction(builder.Plus(builder.Times(w, input, L"W0*features"), b, L"W0*features+B0"), 0, L"H1");

     if (m_addDropoutNodes)
@@ -133,9 +131,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 wstring nameOfPlus = nameOfTimes + L"+" + nameOfB;
 wstring nameOfH = msra::strfun::wstrprintf(L"H%d", i + 1);

-w = builder.Parameter(m_layerSizes[i + 1], m_layerSizes[i], nameOfW);
+w = builder.CreateLearnableParameter(nameOfW, m_layerSizes[i + 1], m_layerSizes[i]);
 m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
-b = builder.Parameter(m_layerSizes[i + 1], 1, nameOfB);
+b = builder.CreateLearnableParameter(nameOfB, m_layerSizes[i + 1], 1);
 output = ApplyNonlinearFunction(builder.Plus(builder.Times(w, input, nameOfTimes), b, nameOfPlus), i, nameOfH);

 if (m_addDropoutNodes)
@@ -151,13 +149,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 wstring nameOfTimes = nameOfW + L"*" + nameOfPrevH;
 wstring nameOfPlus = nameOfTimes + L"+" + nameOfB;

-w = builder.Parameter(m_layerSizes[numHiddenLayers + 1], m_layerSizes[numHiddenLayers], nameOfW);
+w = builder.CreateLearnableParameter(nameOfW, m_layerSizes[numHiddenLayers + 1], m_layerSizes[numHiddenLayers]);
 m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
-b = builder.Parameter(m_layerSizes[numHiddenLayers + 1], 1, nameOfB);
+b = builder.CreateLearnableParameter(nameOfB, m_layerSizes[numHiddenLayers + 1], 1);
 output = builder.Plus(builder.Times(w, input, nameOfTimes), b, nameOfPlus);
 m_net->RenameNode(output, L"HLast");

-label = builder.Input(m_layerSizes[numHiddenLayers + 1], mbSize, L"labels");
+label = builder.CreateInputNode(L"labels", m_layerSizes[numHiddenLayers + 1]);

 AddTrainAndEvalCriterionNodes(output, label);

@@ -188,7 +186,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {

 // Note: while ComputationNode and CompuationNetwork are (supposed to be) independent of ElemType, it is OK to keep this class dependent.
 template<class ElemType>
-ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildSimpleRNN(size_t mbSize)
+ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildSimpleRNN()
 {
     ComputationNetworkBuilder<ElemType> builder(*m_net);
     if (m_net->GetTotalNumberOfNodes() < 1) //not built yet
@@ -201,7 +199,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {

 ComputationNodePtr input, w, b, u, pastValue, output, label, prior;

-input = builder.CreateSparseInputNode(L"features", m_layerSizes[0], mbSize);
+input = builder.CreateSparseInputNode(L"features", m_layerSizes[0]);
 m_net->FeatureNodes().push_back(input);

 if (m_applyMeanVarNorm)
@@ -225,7 +223,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 w = builder.CreateLearnableParameter(L"W0", m_layerSizes[1], m_layerSizes[1]);
 m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);

-pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[1], mbSize, 1);
+pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[1], 1);
 /// unless there is a good algorithm to detect loops, use this explicit setup
 output = ApplyNonlinearFunction(
     builder.Plus(
@@ -255,7 +253,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 w = builder.CreateLearnableParameter(msra::strfun::wstrprintf (L"W%d", i), m_layerSizes[i+1], m_layerSizes[i+1]);
 m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);

-pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, (size_t)m_layerSizes[i+1], mbSize, 1);
+pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, (size_t)m_layerSizes[i+1], 1);
 /// unless there is a good algorithm to detect loops, use this explicit setup
 output = ApplyNonlinearFunction(
     builder.Plus(
@@ -279,7 +277,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
 /*m_net->MatrixL2Reg(w , L"L1w");*/

-label = builder.CreateInputNode(L"labels", m_layerSizes[numHiddenLayers+1], mbSize);
+label = builder.CreateInputNode(L"labels", m_layerSizes[numHiddenLayers+1]);
 AddTrainAndEvalCriterionNodes(input, label, w, L"criterion", L"eval");

 output = builder.Times(w, input, L"outputs");
@@ -294,7 +292,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 }

 template<class ElemType>
-ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildClassEntropyNetwork(size_t mbSize)
+ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildClassEntropyNetwork()
 {
     ComputationNetworkBuilder<ElemType> builder(*m_net);

@@ -312,7 +310,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 if (m_vocabSize != m_layerSizes[numHiddenLayers + 1])
     RuntimeError("BuildClassEntropyNetwork : vocabulary size should be the same as the output layer size");

-input = builder.CreateSparseInputNode(L"features", m_layerSizes[0], mbSize);
+input = builder.CreateSparseInputNode(L"features", m_layerSizes[0]);
 m_net->FeatureNodes().push_back(input);

 if (m_applyMeanVarNorm)
@@ -335,7 +333,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 w = builder.CreateLearnableParameter(L"W0", m_layerSizes[1], m_layerSizes[1]);
 m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);

-pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[1], mbSize, 1);
+pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[1], 1);
 /// unless there is a good algorithm to detect loops, use this explicit setup
 output = ApplyNonlinearFunction(
     builder.Plus(
@@ -364,7 +362,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 w = builder.CreateLearnableParameter(msra::strfun::wstrprintf (L"W%d", i), m_layerSizes[i+1], m_layerSizes[i+1]);
 m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);

-pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, (size_t)m_layerSizes[i+1], mbSize, 1);
+pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, (size_t)m_layerSizes[i+1], 1);
 /// unless there is a good algorithm to detect loops, use this explicit setup
 output = ApplyNonlinearFunction(
     builder.Plus(
@@ -391,7 +389,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);

 /// the label is a dense matrix. each element is the word index
-label = builder.CreateInputNode(L"labels", 4, mbSize);
+label = builder.CreateInputNode(L"labels", 4);

 clsweight = builder.CreateLearnableParameter(L"WeightForClassPostProb", m_nbrCls, m_layerSizes[numHiddenLayers]);
 m_net->InitLearnableParameters(clsweight, m_uniformInit, randomSeed++, m_initValueScale);
@@ -412,7 +410,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 }

 template<class ElemType>
-ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildConditionalLSTMNetworkFromDescription(size_t mbSize)
+ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildConditionalLSTMNetworkFromDescription()
 {
     ComputationNetworkBuilder<ElemType> builder(*m_net);
     if (m_net->GetTotalNumberOfNodes() < 1) //not built yet
@@ -428,7 +426,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 ComputationNodePtr clslogpostprob;
 ComputationNodePtr clsweight;

-input = builder.CreateSparseInputNode(L"features", m_layerSizes[0], mbSize);
+input = builder.CreateSparseInputNode(L"features", m_layerSizes[0]);
 m_net->FeatureNodes().push_back(input);

 if (m_applyMeanVarNorm)
@@ -461,13 +459,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 if (numHiddenLayers > 0)
 {
     // output = (ComputationNodePtr)BuildLSTMNodeComponent(randomSeed, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
-    output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
+    output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
     /// previously used function. now uses LSTMNode which is correct and fast
     input = output;
     for (int i = 1 + offset; i < numHiddenLayers; i++)
     {
         // output = (ComputationNodePtr)BuildLSTMNodeComponent(randomSeed, i, m_layerSizes[i], m_layerSizes[i + 1], input);
-        output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, i, m_layerSizes[i], m_layerSizes[i + 1], input);
+        output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, i, m_layerSizes[i], m_layerSizes[i + 1], input);

         if (m_addDropoutNodes)
             input = builder.Dropout(output);
@@ -477,7 +475,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 }

 /// serve as a global bias term
-gt = builder.CreateInputNode(L"binaryFeature", m_auxFeatDim, 1);
+gt = builder.CreateInputNode(L"binaryFeature", m_auxFeatDim);
 m_net->FeatureNodes().push_back(gt);
 e = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"AuxTrans%d", 0),
     m_layerSizes[numHiddenLayers], m_auxFeatDim);
@@ -493,7 +491,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);

 /// the label is a dense matrix. each element is the word index
-label = builder.CreateInputNode(L"labels", 4, mbSize);
+label = builder.CreateInputNode(L"labels", 4);

 clsweight = builder.CreateLearnableParameter(L"WeightForClassPostProb", m_nbrCls, m_layerSizes[numHiddenLayers]);
 m_net->InitLearnableParameters(clsweight, m_uniformInit, randomSeed++, m_initValueScale);
@@ -518,7 +516,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 the aligment node takes a variable length input and relates each element to a variable length output
 */
 template<class ElemType>
-ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildAlignmentForwardDecoderNetworkFromDescription(ComputationNetwork* encoderNet, size_t mbSize)
+ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildAlignmentForwardDecoderNetworkFromDescription(ComputationNetwork* encoderNet)
 {
     ComputationNetworkBuilder<ElemType> builder(*m_net);
     if (m_net->GetTotalNumberOfNodes() < 1) //not built yet
@@ -535,7 +533,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 ComputationNodePtr clsweight;
 ComputationNodePtr columnStride, rowStride;

-input = builder.CreateSparseInputNode(L"features", m_layerSizes[0], mbSize);
+input = builder.CreateSparseInputNode(L"features", m_layerSizes[0]);
 m_net->FeatureNodes().push_back(input);

 if (m_lookupTableOrder > 0)
@@ -577,9 +575,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 w = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"W%d", i), m_layerSizes[i], m_layerSizes[i]);
 m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);

-pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, (size_t)m_layerSizes[i], mbSize, 1);
+pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, (size_t)m_layerSizes[i], 1);
 // output = (ComputationNodePtr)BuildLSTMNodeComponent(randomSeed, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
-// output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
+// output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);

 /// alignment node to get weights from source to target
 /// this aligment node computes weights of the current hidden state after special encoder ending symbol to all
@@ -607,7 +605,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 for (; i < numHiddenLayers; i++)
 {
     //output = (ComputationNodePtr)BuildLSTMNodeComponent(randomSeed, i, m_layerSizes[i], m_layerSizes[i + 1], input);
-    output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, i, m_layerSizes[i], m_layerSizes[i + 1], input);
+    output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, i, m_layerSizes[i], m_layerSizes[i + 1], input);

     if (m_addDropoutNodes)
         input = builder.Dropout(output);
@@ -625,7 +623,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);

 /// the label is a dense matrix. each element is the word index
-label = builder.CreateInputNode(L"labels", 4, mbSize);
+label = builder.CreateInputNode(L"labels", 4);

 clsweight = builder.CreateLearnableParameter(L"WeightForClassPostProb", m_nbrCls, m_layerSizes[numHiddenLayers]);
 m_net->InitLearnableParameters(clsweight, m_uniformInit, randomSeed++, m_initValueScale);
@@ -645,7 +643,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 }

 template<class ElemType>
-ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildAlignmentDecoderNetworkFromDescription(ComputationNetwork* encoderNet, size_t mbSize)
+ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildAlignmentDecoderNetworkFromDescription(ComputationNetwork* encoderNet)
 {
     ComputationNetworkBuilder<ElemType> builder(*m_net);
     if (m_net->GetTotalNumberOfNodes() < 1) //not built yet
@@ -662,7 +660,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 ComputationNodePtr clsweight;
 ComputationNodePtr columnStride, rowStride;

-input = builder.CreateSparseInputNode(L"features", m_layerSizes[0], mbSize);
+input = builder.CreateSparseInputNode(L"features", m_layerSizes[0]);
 m_net->FeatureNodes().push_back(input);

 if (m_lookupTableOrder > 0)
@@ -704,9 +702,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 w = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"W%d", i), m_layerSizes[i], m_layerSizes[i]);
 m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);

-pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, (size_t)m_layerSizes[i], mbSize, 1);
+pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, (size_t)m_layerSizes[i], 1);
 // output = (ComputationNodePtr)BuildLSTMNodeComponent(randomSeed, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
-// output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
+// output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);

 /// alignment node to get weights from source to target
 /// this aligment node computes weights of the current hidden state after special encoder ending symbol to all
@@ -734,7 +732,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 for (; i < numHiddenLayers; i++)
 {
     //output = (ComputationNodePtr)BuildLSTMNodeComponent(randomSeed, i, m_layerSizes[i], m_layerSizes[i + 1], input);
-    output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, i, m_layerSizes[i], m_layerSizes[i + 1], input);
+    output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, i, m_layerSizes[i], m_layerSizes[i + 1], input);

     if (m_addDropoutNodes)
         input = builder.Dropout(output);
@@ -752,7 +750,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);

 /// the label is a dense matrix. each element is the word index
-label = builder.CreateInputNode(L"labels", 4, mbSize);
+label = builder.CreateInputNode(L"labels", 4);

 clsweight = builder.CreateLearnableParameter(L"WeightForClassPostProb", m_nbrCls, m_layerSizes[numHiddenLayers]);
 m_net->InitLearnableParameters(clsweight, m_uniformInit, randomSeed++, m_initValueScale);
@@ -775,7 +773,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 }

 template<class ElemType>
-ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildLogBilinearNetworkFromDescription(size_t mbSize)
+ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildLogBilinearNetworkFromDescription()
 {
     ComputationNetworkBuilder<ElemType> builder(*m_net);
     if (m_net->GetTotalNumberOfNodes() < 1) //not built yet
@@ -793,8 +791,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 ComputationNodePtr ot=nullptr, it=nullptr, ft=nullptr, gt=nullptr, ct=nullptr, ht=nullptr;
 ComputationNodePtr pastValueXI, pastValueXII, pastValueXIII, pastValueXIV;

-// input = builder.CreateSparseInputNode(L"features", m_layerSizes[0], mbSize);
-input = builder.CreateInputNode(L"features", m_layerSizes[0], mbSize);
+// input = builder.CreateSparseInputNode(L"features", m_layerSizes[0]);
+input = builder.CreateInputNode(L"features", m_layerSizes[0]);
 featin = input;
 m_net->FeatureNodes().push_back(input);

@@ -827,7 +825,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 while (ik <= m_maOrder)
 {
     pastValueXI =
-        builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[0], mbSize, ik, msra::strfun::wstrprintf(L"pastValue%d", ik));
+        builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[0], ik, msra::strfun::wstrprintf(L"pastValue%d", ik));
     pastValueXI->SetParameterUpdateRequired(false);
     pastValueXI->AttachInputs(input);
     //TODO: to figure out sparse matrix size
@@ -855,7 +853,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 {
     w = builder.CreateLearnableParameter(msra::strfun::wstrprintf (L"R%d", i+1), m_layerSizes[i+1], m_layerSizes[i+1]);
     m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
-    pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[i+1], mbSize, 1);
+    pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[i+1], 1);
     output = builder.Plus(builder.Times(w, pastValue), input);

     pastValue->AttachInputs(output);
@@ -875,7 +873,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 w = builder.CreateLearnableParameter(msra::strfun::wstrprintf (L"W%d", numHiddenLayers), m_layerSizes[numHiddenLayers+1], m_layerSizes[numHiddenLayers]);
 m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);

-label = builder.CreateInputNode(L"labels", m_layerSizes[numHiddenLayers+1], mbSize);
+label = builder.CreateInputNode(L"labels", m_layerSizes[numHiddenLayers+1]);
 AddTrainAndEvalCriterionNodes(input, label, w);

 output = builder.Times(w, input, L"outputs");
@@ -892,7 +890,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 }

 template<class ElemType>
-ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildNeuralProbNetworkFromDescription(size_t mbSize)
+ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildNeuralProbNetworkFromDescription()
 {
     ComputationNetworkBuilder<ElemType> builder(*m_net);
     if (m_net->GetTotalNumberOfNodes() < 1) //not built yet
@@ -910,7 +908,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 ComputationNodePtr ot = nullptr, it = nullptr, ft = nullptr, gt = nullptr, ct = nullptr, ht = nullptr;
 ComputationNodePtr pastValueXI, pastValueXII, pastValueXIII, pastValueXIV;

-input = builder.CreateSparseInputNode(L"features", m_layerSizes[0], mbSize);
+input = builder.CreateSparseInputNode(L"features", m_layerSizes[0]);
 m_net->FeatureNodes().push_back(input);

 if (m_applyMeanVarNorm)
@@ -927,10 +925,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 {
     bi = builder.CreateLearnableParameter(L"bi0", m_layerSizes[1], 1);

-    pastValueXI = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[0], mbSize, 1);
-    pastValueXII = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[0], mbSize, 2);
-    pastValueXIII = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[0], mbSize, 3);
-    pastValueXIV = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[0], mbSize, 4);
+    pastValueXI = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[0], 1);
+    pastValueXII = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[0], 2);
+    pastValueXIII = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[0], 3);
+    pastValueXIV = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[0], 4);
     pastValueXI->AttachInputs(input);
     pastValueXII->AttachInputs(input);
     pastValueXIII->AttachInputs(input);
@@ -996,7 +994,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 w = builder.CreateLearnableParameter(msra::strfun::wstrprintf (L"W%d", i), m_layerSizes[i+1], m_layerSizes[i+1]);
 m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
 std::list<ComputationNodeBasePtr> recurrent_loop;
-pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[i+1], mbSize, 1);
+pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[i+1], 1);
 output = SimpleNetworkBuilder<ElemType>::ApplyNonlinearFunction(builder.Plus(builder.Times(u, input), builder.Times(w, pastValue)), i);
 pastValue->AttachInputs(output);
 recur_idx++;
@@ -1017,7 +1015,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 w = builder.CreateLearnableParameter(msra::strfun::wstrprintf (L"W%d", numHiddenLayers), m_layerSizes[numHiddenLayers+1], m_layerSizes[numHiddenLayers]);
 m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
 // b = builder.CreateLearnableParameter(msra::strfun::wstrprintf (L"B%d", numHiddenLayers), m_layerSizes[numHiddenLayers+1], 1);
-label = builder.CreateSparseInputNode(L"labels", m_layerSizes[numHiddenLayers+1], mbSize);
+label = builder.CreateSparseInputNode(L"labels", m_layerSizes[numHiddenLayers+1]);
 AddTrainAndEvalCriterionNodes(input, label, w);

 output = builder.Times(w, input);
@@ -1034,7 +1032,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 }

 template<class ElemType>
-shared_ptr<ComputationNode<ElemType>> /*ComputationNodePtr*/ SimpleNetworkBuilder<ElemType>::BuildDirectConnect(unsigned long &randomSeed, size_t /*mbSize*/, size_t iLayer, size_t inputDim, size_t outputDim, ComputationNodePtr input, ComputationNodePtr toNode)
+shared_ptr<ComputationNode<ElemType>> /*ComputationNodePtr*/ SimpleNetworkBuilder<ElemType>::BuildDirectConnect(unsigned long &randomSeed, size_t iLayer, size_t inputDim, size_t outputDim, ComputationNodePtr input, ComputationNodePtr toNode)
 {
     ComputationNetworkBuilder<ElemType> builder(*m_net);

@@ -1050,7 +1048,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {

 ComputationNodePtr scalar = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"SV%d", i), 1, 1);
 scalar->Value().SetValue((ElemType)0.01);
-#if 1// change once we no longer see a perf hit to #ifndef ENABLE_TENSORVIEW
+#ifndef ENABLE_BROADCASTING_ELEMENTTIMES
 ComputationNodePtr scaled = builder.Scale(scalar, directOutput, msra::strfun::wstrprintf(L"S%d", i));
 #else
 ComputationNodePtr scaled = builder.ElementTimes(scalar, directOutput, msra::strfun::wstrprintf(L"S%d", i));
@@ -1065,7 +1063,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {


 template<class ElemType>
-shared_ptr<ComputationNode<ElemType>> /*ComputationNodePtr*/ SimpleNetworkBuilder<ElemType>::BuildLSTMComponent(unsigned long &randomSeed, size_t mbSize, size_t iLayer, size_t inputDim, size_t outputDim, ComputationNodePtr inputObs)
+shared_ptr<ComputationNode<ElemType>> /*ComputationNodePtr*/ SimpleNetworkBuilder<ElemType>::BuildLSTMComponent(unsigned long &randomSeed, size_t iLayer, size_t inputDim, size_t outputDim, ComputationNodePtr inputObs)
 {
     ComputationNetworkBuilder<ElemType> builder(*m_net);

@@ -1121,17 +1119,17 @@ namespace Microsoft { namespace MSR { namespace CNTK {

 size_t layer1 = outputDim;

-pastValueHI = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, mbSize, 1);
-pastValueHF = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, mbSize, 1);
-pastValueHO = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, mbSize, 1);
-pastValueHC = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, mbSize, 1);
-pastValueCI = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, mbSize, 1);
-pastValueCF = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, mbSize, 1);
-pastValueCC = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, mbSize, 1);
+pastValueHI = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, 1);
+pastValueHF = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, 1);
+pastValueHO = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, 1);
+pastValueHC = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, 1);
+pastValueCI = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, 1);
+pastValueCF = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, 1);
+pastValueCC = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, 1);

 if(m_constInputGateValue)
 {
-    //it = builder.CreateLearnableParameter(msra::strfun::wstrprintf (L"CONSTIT%d", iLayer), outputDim, mbSize);
+    //it = builder.CreateLearnableParameter(msra::strfun::wstrprintf (L"CONSTIT%d", iLayer), outputDim);
     //it->SetParameterUpdateRequired(false);
     //it->Value().SetValue(m_constInputGateValue);
     it = nullptr;
@@ -1241,7 +1239,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 }

 template<class ElemType>
-ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildSeqTrnLSTMNetworkFromDescription(size_t mbSize)
+ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildSeqTrnLSTMNetworkFromDescription()
 {
     ComputationNetworkBuilder<ElemType> builder(*m_net);
     if (m_net->GetTotalNumberOfNodes() < 1) //not built yet
@@ -1261,7 +1259,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 ComputationNodePtr outputFromEachLayer[MAX_DEPTH] = { nullptr };
 ComputationNodePtr trans;

-input = builder.CreateInputNode(L"features", m_layerSizes[0], mbSize);
+input = builder.CreateInputNode(L"features", m_layerSizes[0]);
 m_net->FeatureNodes().push_back(input);

 if (m_applyMeanVarNorm)
@@ -1297,7 +1295,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 {
     if (m_recurrentLayers.size() > 0 && m_recurrentLayers[recur_idx] == i+1)
     {
-        output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, i, m_layerSizes[i] * (offset ? m_lookupTableOrder : 1), m_layerSizes[i + 1], input);
+        output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, i, m_layerSizes[i] * (offset ? m_lookupTableOrder : 1), m_layerSizes[i + 1], input);
         input = output;

         recur_idx++;
@@ -1326,7 +1324,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 trans->Value().SetValue((ElemType)1.0 / m_layerSizes[numHiddenLayers + 1]);
 // m_net->InitLearnableParameters(trans, m_uniformInit, randomSeed++, m_initValueScale);
 trans->SetParameterUpdateRequired(true);
-label = builder.CreateInputNode(L"labels", m_layerSizes[numHiddenLayers + 1], mbSize);
+label = builder.CreateInputNode(L"labels", m_layerSizes[numHiddenLayers + 1]);
 AddTrainAndEvalCriterionNodes(output, label, nullptr, L"CRFTrainCriterion", L"CRFEvalCriterion", nullptr, trans);

 input = output;
@@ -1340,7 +1338,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 }

 template<class ElemType>
-ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildCLASSLSTMNetworkFromDescription(size_t mbSize)
+ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildCLASSLSTMNetworkFromDescription()
 {
     ComputationNetworkBuilder<ElemType> builder(*m_net);
     if (m_net->GetTotalNumberOfNodes() < 1) //not built yet
@@ -1356,7 +1354,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 ComputationNodePtr clslogpostprob;
 ComputationNodePtr clsweight;

-input = builder.CreateSparseInputNode(L"features", m_layerSizes[0], mbSize);
+input = builder.CreateSparseInputNode(L"features", m_layerSizes[0]);
 m_net->FeatureNodes().push_back(input);

 if (m_applyMeanVarNorm)
@@ -1389,13 +1387,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 if (numHiddenLayers > 0)
 {
     // output = (ComputationNodePtr)BuildLSTMNodeComponent(randomSeed, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
-    output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
+    output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
     /// previously used function. now uses LSTMNode which is correct and fast
     input = output;
     for (int i = 1 + offset; i <numHiddenLayers; i++)
     {
         // output = (ComputationNodePtr)BuildLSTMNodeComponent(randomSeed, i, m_layerSizes[i], m_layerSizes[i + 1], input);
-        output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, i, m_layerSizes[i], m_layerSizes[i + 1], input);
+        output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, i, m_layerSizes[i], m_layerSizes[i + 1], input);

         if (m_addDropoutNodes)
             input = builder.Dropout(output);
@@ -1411,7 +1409,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);

 /// the label is a dense matrix. each element is the word index
-label = builder.CreateInputNode(L"labels", 4, mbSize);
+label = builder.CreateInputNode(L"labels", 4);

 clsweight = builder.CreateLearnableParameter(L"WeightForClassPostProb", m_nbrCls, m_layerSizes[numHiddenLayers]);
 m_net->InitLearnableParameters(clsweight, m_uniformInit, randomSeed++, m_initValueScale);
@@ -1482,7 +1480,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 #endif

 template<class ElemType>
-ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildLSTMNetworkFromDescription(size_t mbSize)
+ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildLSTMNetworkFromDescription()
 {
     ComputationNetworkBuilder<ElemType> builder(*m_net);
     if (m_net->GetTotalNumberOfNodes() < 1) //not built yet
@@ -1502,9 +1500,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 ComputationNodePtr outputFromEachLayer[MAX_DEPTH] = { nullptr };

 if (m_sparse_input)
-    input = builder.CreateSparseInputNode(L"features", m_layerSizes[0], mbSize);
+    input = builder.CreateSparseInputNode(L"features", m_layerSizes[0]);
 else
-    input = builder.CreateInputNode(L"features", m_layerSizes[0], mbSize);
+    input = builder.CreateInputNode(L"features", m_layerSizes[0]);

 m_net->FeatureNodes().push_back(input);

@@ -1542,7 +1540,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 {

     //output = (ComputationNodePtr)BuildLSTMNodeComponent(randomSeed, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
-    output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
+    output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
     /// previously used function. now uses LSTMNode which is correct and fast
     input = output;
     outputFromEachLayer[offset + 1] = input;
@@ -1553,7 +1551,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 {

     //output = (ComputationNodePtr)BuildLSTMNodeComponent(randomSeed, i, m_layerSizes[i], m_layerSizes[i + 1], input);
-    output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, i, m_layerSizes[i], m_layerSizes[i + 1], input);
+    output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, i, m_layerSizes[i], m_layerSizes[i + 1], input);
     // previously used function, now uses LSTMnode, which is fast and correct

     recur_idx++;
@@ -1580,7 +1578,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 #ifdef DEBUG_DECODER
 w->Value().SetValue((ElemType)0.01);
 #endif
-label = builder.CreateInputNode(L"labels", m_layerSizes[numHiddenLayers + 1], mbSize);
+label = builder.CreateInputNode(L"labels", m_layerSizes[numHiddenLayers + 1]);
 AddTrainAndEvalCriterionNodes(input, label, w);

 output = builder.Times(w, input, L"outputs");
@@ -1615,7 +1613,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 K. Yao, G. Zweig, "Sequence-to-sequence neural net models for grapheme-to-phoneme conversion, submitted to Interspeech 2015
 */
 template<class ElemType>
-ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildLSTMEncoderNetworkFromDescription(size_t mbSize)
+ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildLSTMEncoderNetworkFromDescription()
 {

     ComputationNetworkBuilder<ElemType> builder(*m_net);
@@ -1631,9 +1629,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 ComputationNodePtr input, w, b, u, e, pastValue, output, label, prior;

 if (m_sparse_input)
-    input = builder.CreateSparseInputNode(L"features", m_layerSizes[0], mbSize);
+    input = builder.CreateSparseInputNode(L"features", m_layerSizes[0]);
 else
-    input = builder.CreateInputNode(L"features", m_layerSizes[0], mbSize);
+    input = builder.CreateInputNode(L"features", m_layerSizes[0]);

 m_net->FeatureNodes().push_back(input);

@@ -1669,14 +1667,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 if (numHiddenLayers > 0)
 {
     //output = (ComputationNodePtr)BuildLSTMNodeComponent(randomSeed, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
-    output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
+    output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
     input = output;
     i++;

     for (; i<numHiddenLayers; i++)
     {
         //output = (ComputationNodePtr)BuildLSTMNodeComponent(randomSeed, i, m_layerSizes[i], m_layerSizes[i + 1], input);
-        output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, i, m_layerSizes[i], m_layerSizes[i + 1], input);
+        output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, i, m_layerSizes[i], m_layerSizes[i + 1], input);

         if (m_addDropoutNodes)
             input = builder.Dropout(output);
@@ -1705,7 +1703,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 K. Yao, G. Zweig, "Sequence-to-sequence neural net models for grapheme-to-phoneme conversion" submitted to Interspeech 2015
 */
 template<class ElemType>
-ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildUnidirectionalLSTMNetworksFromDescription(size_t mbSize)
+ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildUnidirectionalLSTMNetworksFromDescription()
 {
     ComputationNetworkBuilder<ElemType> builder(*m_net);
     if (m_net->GetTotalNumberOfNodes() < 1) //not built yet
@@ -1726,11 +1724,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 map<wstring, size_t> featDim;

 assert(m_streamSizes.size() > 0);
-inputbackward = builder.CreateInputNode(L"featurepastValueedTarget", m_streamSizes[0], mbSize);
+inputbackward = builder.CreateInputNode(L"featurepastValueedTarget", m_streamSizes[0]);
 m_net->FeatureNodes().push_back(inputbackward);
 featDim[L"featurepastValueedTarget"] = m_streamSizes[0];

-inputletter = builder.CreateInputNode(L"ltrForward", m_streamSizes[1], mbSize);
+inputletter = builder.CreateInputNode(L"ltrForward", m_streamSizes[1]);
 m_net->FeatureNodes().push_back(inputletter);
 featDim[L"ltrForward"] = m_streamSizes[1];

@@ -1777,7 +1775,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 switch (m_rnnType){
 case UNIDIRECTIONALLSTM:
     //output = (ComputationNodePtr)BuildLSTMNodeComponent(randomSeed, layerIdx, dims, m_layerSizes[layerIdx + 1], input);
-    output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, layerIdx, dims, m_layerSizes[layerIdx + 1], input);
+    output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, layerIdx, dims, m_layerSizes[layerIdx + 1], input);
     break;
 default:
     LogicError("This is for unidorectional LSTM model. Check rnntype to see whether it is UNIDIRECTIONALLSTMWITHPASTPREDICTION or TRANSDUCER");
@@ -1797,7 +1795,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 input = output;

 /// here uses "labels", so only one label from multiple stream inputs are used.
-label = builder.CreateInputNode(L"labels", m_layerSizes[numHiddenLayers + 1], mbSize);
+label = builder.CreateInputNode(L"labels", m_layerSizes[numHiddenLayers + 1]);

 AddTrainAndEvalCriterionNodes(input, label, w);

@@ -1819,7 +1817,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 }

 template<class ElemType>
-shared_ptr<ComputationNode<ElemType>> /*ComputationNodePtr*/ SimpleNetworkBuilder<ElemType>::BuildLSTMComponentWithMultiInputs(ULONG &randomSeed, size_t mbSize, size_t iLayer, const vector<size_t>& inputDim, size_t outputDim, const vector<ComputationNodePtr>& inputObs, bool inputWeightSparse)
+shared_ptr<ComputationNode<ElemType>> /*ComputationNodePtr*/ SimpleNetworkBuilder<ElemType>::BuildLSTMComponentWithMultiInputs(ULONG &randomSeed, size_t iLayer, const vector<size_t>& inputDim, size_t outputDim, const vector<ComputationNodePtr>& inputObs, bool inputWeightSparse)
 {
     ComputationNetworkBuilder<ElemType> builder(*m_net);

@@ -1896,17 +1894,17 @@ namespace Microsoft { namespace MSR { namespace CNTK {

 size_t layer1 = outputDim;

-pastValueHI = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, mbSize, 1);
-pastValueHF = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, mbSize, 1);
-pastValueHO = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, mbSize, 1);
-pastValueHC = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, mbSize, 1);
-pastValueCI = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, mbSize, 1);
-pastValueCF = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, mbSize, 1);
-pastValueCC = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, mbSize, 1);
+pastValueHI = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, 1);
+pastValueHF = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, 1);
+pastValueHO = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, 1);
+pastValueHC = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, 1);
+pastValueCI = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, 1);
+pastValueCF = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, 1);
+pastValueCC = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, 1);

 if (m_constInputGateValue)
 {
-    //it = builder.CreateLearnableParameter(msra::strfun::wstrprintf (L"CONSTIT%d", iLayer), outputDim, mbSize);
+    //it = builder.CreateLearnableParameter(msra::strfun::wstrprintf (L"CONSTIT%d", iLayer), outputDim);
     //it->SetParameterUpdateRequired(false);
     //it->Value().SetValue(m_constInputGateValue);
     it = nullptr;
@@ -2026,7 +2024,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
K. Yao, G. Zweig, "Sequence-to-sequence neural net models for grapheme-to-phoneme conversion", submitted to Interspeech 2015
*/
template<class ElemType>
ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildBiDirectionalLSTMNetworksFromDescription(size_t mbSize)
ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildBiDirectionalLSTMNetworksFromDescription()
{
ComputationNetworkBuilder<ElemType> builder(*m_net);
if (m_net->GetTotalNumberOfNodes() < 1) //not built yet
@@ -2049,10 +2047,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {

size_t ltrSrcIdx = 1;
/// create projections to use pastValue predictions
inputprediction = builder.CreateInputNode(L"featurepastValueedTarget", m_streamSizes[0], mbSize);
inputprediction = builder.CreateInputNode(L"featurepastValueedTarget", m_streamSizes[0]);
m_net->FeatureNodes().push_back(inputprediction);

inputletter = builder.CreateInputNode(L"ltrForward", m_streamSizes[1], mbSize);
inputletter = builder.CreateInputNode(L"ltrForward", m_streamSizes[1]);
m_net->FeatureNodes().push_back(inputletter);
featDim[L"ltrForward"] = m_streamSizes[1];
@@ -2100,12 +2098,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
/// forward direction
//forwardOutput = (ComputationNodePtr)BuildLSTMNodeComponent(randomSeed, layerIdx + 100, streamdims[0] + streamdims[1], m_layerSizes[layerIdx + 1], forwardInput);
forwardOutput = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, layerIdx + 100, streamdims[0] + streamdims[1], m_layerSizes[layerIdx + 1], forwardInput);
forwardOutput = (ComputationNodePtr)BuildLSTMComponent(randomSeed, layerIdx + 100, streamdims[0] + streamdims[1], m_layerSizes[layerIdx + 1], forwardInput);
forwardInput = forwardOutput;

backwardInput = (ComputationNodePtr)builder.TimeReverse(ltrSource);
//backwardOutput = (ComputationNodePtr)BuildLSTMNodeComponent(randomSeed, layerIdx + 200, ltrDim, m_layerSizes[layerIdx + 1], backwardInput);
backwardOutput = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, layerIdx + 200, ltrDim, m_layerSizes[layerIdx + 1], backwardInput);
backwardOutput = (ComputationNodePtr)BuildLSTMComponent(randomSeed, layerIdx + 200, ltrDim, m_layerSizes[layerIdx + 1], backwardInput);
backwardInput = backwardOutput;

layerIdx++;
@@ -2113,11 +2111,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
while (layerIdx < numHiddenLayers - 1)
{
//forwardOutput = (ComputationNodePtr)BuildLSTMNodeComponent(randomSeed, layerIdx + 100, m_layerSizes[layerIdx], m_layerSizes[layerIdx + 1], forwardInput);
forwardOutput = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, layerIdx + 100, m_layerSizes[layerIdx], m_layerSizes[layerIdx + 1], forwardInput);
forwardOutput = (ComputationNodePtr)BuildLSTMComponent(randomSeed, layerIdx + 100, m_layerSizes[layerIdx], m_layerSizes[layerIdx + 1], forwardInput);
forwardInput = forwardOutput;

//backwardOutput = (ComputationNodePtr)BuildLSTMNodeComponent(randomSeed, layerIdx + 200, m_layerSizes[layerIdx], m_layerSizes[layerIdx + 1], backwardInput);
backwardOutput = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, layerIdx + 200, m_layerSizes[layerIdx], m_layerSizes[layerIdx + 1], backwardInput);
backwardOutput = (ComputationNodePtr)BuildLSTMComponent(randomSeed, layerIdx + 200, m_layerSizes[layerIdx], m_layerSizes[layerIdx + 1], backwardInput);
backwardInput = backwardOutput;

layerIdx++;
@@ -2137,7 +2135,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
forwardInput = (ComputationNodePtr)builder.Parallel(streams[0], streams[1], L"Parallel1");

// output = (ComputationNodePtr)BuildLSTMNodeComponent(randomSeed, layerIdx, streamdims[0] + streamdims[1], m_layerSizes[layerIdx + 1], forwardInput);
output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, layerIdx, streamdims[0] + streamdims[1], m_layerSizes[layerIdx + 1], forwardInput);
output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, layerIdx, streamdims[0] + streamdims[1], m_layerSizes[layerIdx + 1], forwardInput);

input = output;
layerIdx++;
@@ -2150,7 +2148,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
input = output;

/// here uses "labels", so only one label from the multiple stream inputs is used.
label = builder.CreateInputNode(L"labels", m_layerSizes[numHiddenLayers + 1], mbSize);
label = builder.CreateInputNode(L"labels", m_layerSizes[numHiddenLayers + 1]);

AddTrainAndEvalCriterionNodes(input, label);
@@ -2174,7 +2172,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}

template<class ElemType>
ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildNCELSTMNetworkFromDescription(size_t mbSize)
ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildNCELSTMNetworkFromDescription()
{
ComputationNetworkBuilder<ElemType> builder(*m_net);
if (m_net->GetTotalNumberOfNodes() < 1) //not built yet
@@ -2190,7 +2188,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
ComputationNodePtr bias;
ComputationNodePtr outputFromEachLayer[MAX_DEPTH] = { nullptr };

input = builder.CreateSparseInputNode(L"features", m_layerSizes[0], mbSize);
input = builder.CreateSparseInputNode(L"features", m_layerSizes[0]);
m_net->FeatureNodes().push_back(input);

if (m_applyMeanVarNorm)
@@ -2222,7 +2220,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
int offset = m_lookupTableOrder > 0 ? 1 : 0;
if (numHiddenLayers > 0)
{
output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
input = output;
outputFromEachLayer[offset + 1] = input;
@@ -2230,7 +2228,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
if (m_recurrentLayers.size() > 0 && m_recurrentLayers[recur_idx] == i)
{
output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, i, m_layerSizes[i], m_layerSizes[i + 1], input);
output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, i, m_layerSizes[i], m_layerSizes[i + 1], input);

recur_idx++;
}
@@ -2254,7 +2252,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
for (size_t i = offset; i < m_layerSizes.size(); i++)
{
/// add a direct connection from each layer's output to the layer before the output layer
output = BuildDirectConnect(randomSeed, mbSize, i, (i > 1) ? m_layerSizes[i] : ((offset == 0) ? m_layerSizes[i] : m_layerSizes[i] * m_lookupTableOrder), m_layerSizes[numHiddenLayers], outputFromEachLayer[i], input);
output = BuildDirectConnect(randomSeed, i, (i > 1) ? m_layerSizes[i] : ((offset == 0) ? m_layerSizes[i] : m_layerSizes[i] * m_lookupTableOrder), m_layerSizes[numHiddenLayers], outputFromEachLayer[i], input);
if (output != nullptr)
input = output;
}
@@ -2266,7 +2264,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);

/// the label is a dense matrix. each element is the word index
label = builder.CreateInputNode(L"labels", 2 * (this->nce_noises + 1), mbSize);
label = builder.CreateInputNode(L"labels", 2 * (this->nce_noises + 1));

bias = builder.CreateLearnableParameter(L"BiasVector", 1, m_layerSizes[m_layerSizes.size() - 1]);
bias->Value().SetValue((ElemType)-std::log(m_layerSizes[m_layerSizes.size() - 1]));
@@ -2301,7 +2299,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {

ComputationNodePtr input, w, b, output, label, prior, scaledLogLikelihood;
shared_ptr<PreComputedNode<ElemType>> pcNodePtr;
size_t mbSize = 3; //this is not the actual minibatch size. only used in the validation process

File fstream(dbnModelFileName, FileOptions::fileOptionsBinary | FileOptions::fileOptionsRead);
@@ -2336,7 +2333,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Matrix<ElemType> A = ReadMatrixFromDbnFile(fstream, std::string("b"));
if (i == 0)
{
input = builder.Input(wts.GetNumCols(), mbSize, L"features");
input = builder.CreateInputNode(L"features", wts.GetNumCols());
m_net->FeatureNodes().push_back(input);

size_t frameDim = globalMean.GetNumRows();
@@ -2381,10 +2378,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
wstring nameOfPlus = nameOfTimes + L"+" + nameOfB;
wstring nameOfH = msra::strfun::wstrprintf(L"H%d", i + 1);

w = builder.Parameter(wts.GetNumRows(), wts.GetNumCols(), nameOfW);
w = builder.CreateLearnableParameter(nameOfW, wts.GetNumRows(), wts.GetNumCols());
w->Value().SetValue(wts);

b = builder.Parameter(bias.GetNumRows(), 1, nameOfB);
b = builder.CreateLearnableParameter(nameOfB, bias.GetNumRows(), 1);
b->Value().SetValue(bias);

if (layerType == "perceptron")
@@ -2412,7 +2409,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
RuntimeError("Error reading DBN file - did not find expected tag ENET\n");
//size_t outputLayerSize = m_layerSizes[m_layerSizes.size()-1];

label = builder.Input(m_outputLayerSize, mbSize, L"labels");
label = builder.CreateInputNode(L"labels", m_outputLayerSize);

if (layerType == "perceptron") // complete network
{
@@ -2446,9 +2443,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
wstring nameOfPlus = nameOfTimes + L"+" + nameOfB;
wstring nameOfH = msra::strfun::wstrprintf(L"H%d", i + 1);

w = builder.Parameter(outputLayerSize, penultimateSize, nameOfW);
w = builder.CreateLearnableParameter(nameOfW, outputLayerSize, penultimateSize);
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
b = builder.Parameter(outputLayerSize, 1, nameOfB);
b = builder.CreateLearnableParameter(nameOfB, outputLayerSize, 1);
output = builder.Plus(builder.Times(w, input, nameOfTimes), b, nameOfPlus);
m_net->RenameNode(output, L"HLast");
@@ -256,41 +256,41 @@ namespace Microsoft { namespace MSR { namespace CNTK {

ComputationNetworkPtr BuildSimpleDNN();

ComputationNetworkPtr BuildSimpleRNN(size_t mbSize = 1);
ComputationNetworkPtr BuildSimpleRNN();

ComputationNetworkPtr BuildClassEntropyNetwork(size_t mbSize = 1);
ComputationNetworkPtr BuildClassEntropyNetwork();

ComputationNodePtr BuildLSTMComponent(unsigned long &randomSeed, size_t mbSize, size_t iLayer, size_t inputDim, size_t outputDim, ComputationNodePtr input);
ComputationNodePtr BuildLSTMComponent(unsigned long &randomSeed, size_t iLayer, size_t inputDim, size_t outputDim, ComputationNodePtr input);

ComputationNodePtr BuildLSTMNodeComponent(ULONG &randomSeed, size_t iLayer, size_t inputDim, size_t outputDim, ComputationNodePtr input);

ComputationNodePtr BuildLSTMComponentWithMultiInputs(ULONG &randomSeed, size_t mbSize, size_t iLayer, const vector<size_t>& inputDim, size_t outputDim, const vector<ComputationNodePtr>& inputObs, bool inputWeightSparse = false);
ComputationNodePtr BuildLSTMComponentWithMultiInputs(ULONG &randomSeed, size_t iLayer, const vector<size_t>& inputDim, size_t outputDim, const vector<ComputationNodePtr>& inputObs, bool inputWeightSparse = false);

ComputationNodePtr BuildDirectConnect(unsigned long &randomSeed, size_t mbSize, size_t iLayer, size_t inputDim, size_t outputDim, ComputationNodePtr input, ComputationNodePtr toNode);
ComputationNodePtr BuildDirectConnect(unsigned long &randomSeed, size_t iLayer, size_t inputDim, size_t outputDim, ComputationNodePtr input, ComputationNodePtr toNode);

ComputationNetworkPtr BuildLogBilinearNetworkFromDescription(size_t mbSize = 1);
ComputationNetworkPtr BuildLogBilinearNetworkFromDescription();

ComputationNetworkPtr BuildNeuralProbNetworkFromDescription(size_t mbSize = 1);
ComputationNetworkPtr BuildNeuralProbNetworkFromDescription();

ComputationNetworkPtr BuildLSTMNetworkFromDescription(size_t mbSize = 1);
ComputationNetworkPtr BuildLSTMNetworkFromDescription();

ComputationNetworkPtr BuildSeqTrnLSTMNetworkFromDescription(size_t mbSize = 1);
ComputationNetworkPtr BuildSeqTrnLSTMNetworkFromDescription();

ComputationNetworkPtr BuildLSTMEncoderNetworkFromDescription(size_t mbSize = 1);
ComputationNetworkPtr BuildLSTMEncoderNetworkFromDescription();

ComputationNetworkPtr BuildUnidirectionalLSTMNetworksFromDescription(size_t mbSize = 1);
ComputationNetworkPtr BuildUnidirectionalLSTMNetworksFromDescription();

ComputationNetworkPtr BuildBiDirectionalLSTMNetworksFromDescription(size_t mbSize = 1);
ComputationNetworkPtr BuildBiDirectionalLSTMNetworksFromDescription();

ComputationNetworkPtr BuildCLASSLSTMNetworkFromDescription(size_t mbSize = 1);
ComputationNetworkPtr BuildCLASSLSTMNetworkFromDescription();

ComputationNetworkPtr BuildConditionalLSTMNetworkFromDescription(size_t mbSize = 1);
ComputationNetworkPtr BuildConditionalLSTMNetworkFromDescription();

ComputationNetworkPtr BuildNCELSTMNetworkFromDescription(size_t mbSize = 1);
ComputationNetworkPtr BuildNCELSTMNetworkFromDescription();

ComputationNetworkPtr BuildAlignmentForwardDecoderNetworkFromDescription(ComputationNetwork* encoderNet, size_t mbSize = 1);
ComputationNetworkPtr BuildAlignmentForwardDecoderNetworkFromDescription(ComputationNetwork* encoderNet);

ComputationNetworkPtr BuildAlignmentDecoderNetworkFromDescription(ComputationNetwork* encoderNet, size_t mbSize = 1);
ComputationNetworkPtr BuildAlignmentDecoderNetworkFromDescription(ComputationNetwork* encoderNet);

//layer is 0 based
ComputationNodePtr ApplyNonlinearFunction(ComputationNodePtr input, const size_t layer, const std::wstring nodeName = L"");
@@ -15,9 +15,12 @@
#include "ConvolutionalNodes.h"
#include "NonlinearityNodes.h"
#include "ReshapingNodes.h"
#include "TensorShape.h"

namespace Microsoft { namespace MSR { namespace CNTK {

using namespace std;

template<class ElemType>
void SynchronousNodeEvaluator<ElemType>::Evaluate(NDLNode<ElemType>* node, const wstring& baseName, const NDLPass pass)
{
@@ -58,48 +61,34 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
}

if (OperationNameOf(InputValue) == cnNodeType)
if (OperationNameOf(InputValue) == cnNodeType || OperationNameOf(SparseInputValue) == cnNodeType)
{
if (parameter.size() < 1 || parameter.size() > 2)
RuntimeError("%ls should have 1 or 2 parameters[rows, [cols=1]].", cnNodeType.c_str());
bool isSparse = (OperationNameOf(SparseInputValue) == cnNodeType);
if (parameter.size() < 1)
RuntimeError("%ls should have 1 or more parameters (tensor dimensions, e.g. [vecdim] or [rows, cols]).", cnNodeType.c_str());

if (pass == ndlPassInitial)
{
// evaluate only scalar parameters
vector<void*> params = EvaluateParameters(node, baseName, 0, parameter.size(), pass);
size_t rows = ((NDLNode<ElemType>*)params[0])->GetScalar();
size_t cols = params.size() > 1 ? ((NDLNode<ElemType>*)params[1])->GetScalar() : 1;
size_t i = 0;
auto tensorShape = ProcessTensorShapeParameters(node, params, i, /*isImage=*/false, cnNodeType);

// first look for this node already existing in the network
// BUGBUG: How does this set the dimensions then?
if (m_net->NodeNameExists(name))
nodePtr = dynamic_pointer_cast<ComputationNode<ElemType>>(m_net->GetNodeFromName(name));
else if (isSparse)
nodePtr = builder.CreateSparseInputNode(name, tensorShape);
else
nodePtr = builder.CreateInputNode(name, rows, cols);
nodePtr = builder.CreateInputNode (name, tensorShape);
}
}
else if (OperationNameOf(SparseInputValue) == cnNodeType)
else if (cnNodeType == L"ImageInput" || cnNodeType == L"SparseImageInput")
{
if (parameter.size() < 1 || parameter.size() > 2)
RuntimeError("%ls should have 1 or 2 parameters[rows, [cols=1]].", cnNodeType.c_str());

if (pass == ndlPassInitial)
{
// evaluate only scalar parameters
vector<void*> params = EvaluateParameters(node, baseName, 0, parameter.size(), pass);
size_t rows = ((NDLNode<ElemType>*)params[0])->GetScalar();
size_t cols = params.size() > 1 ? ((NDLNode<ElemType>*)params[1])->GetScalar() : 1;

// first look for this node already existing in the network
if (m_net->NodeNameExists(name))
nodePtr = dynamic_pointer_cast<ComputationNode<ElemType>>(m_net->GetNodeFromName(name));
else
nodePtr = builder.CreateSparseInputNode(name, rows, cols);
}
}
else if (cnNodeType == L"ImageInput")
{
if (parameter.size() < 3 || parameter.size() > 4)
RuntimeError("%ls should have 3 or 4 parameters[imageWidth, imageHeight, imageChannels, [numImages=1]].", cnNodeType.c_str());
bool isSparse = (cnNodeType == L"SparseImageInput");
if (parameter.size() < 3 || parameter.size() > 4) // we allow 4 for legacy (numImages, was ignored)
RuntimeError("%ls should have 3 parameters[imageWidth, imageHeight, imageChannels].", cnNodeType.c_str());

if (pass == ndlPassInitial)
{
@@ -108,44 +97,39 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t imageWidth = ((NDLNode<ElemType>*)params[0])->GetScalar();
size_t imageHeight = ((NDLNode<ElemType>*)params[1])->GetScalar();
size_t imageChannels = ((NDLNode<ElemType>*)params[2])->GetScalar();
size_t numImages = parameter.size() > 3 ? ((NDLNode<ElemType>*)params[3])->GetScalar() : 1;
ImageLayoutKind imageLayoutKind = ImageLayoutKindFrom(node->GetOptionalParameter("imageLayout", "HWC"));

nodePtr = builder.CreateInputNode(name, ImageLayoutWHC(imageWidth, imageHeight, imageChannels), numImages);
if (isSparse)
nodePtr = builder.CreateSparseInputNode(name, ImageDimensions::AsTensorShape(imageWidth, imageHeight, imageChannels, imageLayoutKind));
else
nodePtr = builder.CreateInputNode (name, ImageDimensions::AsTensorShape(imageWidth, imageHeight, imageChannels, imageLayoutKind));
}
}
else if (cnNodeType == L"SparseImageInput")
else if (OperationNameOf(LearnableParameter) == cnNodeType || cnNodeType == L"ImageParameter")
{
if (parameter.size() < 3 || parameter.size() > 4)
RuntimeError("%ls should have 3 or 4 parameters[imageWidth, imageHeight, imageChannels, [numImages=1]].", cnNodeType.c_str());
bool isImage = (cnNodeType == L"ImageParameter");
if (!isImage)
{
if (parameter.size() < 1)
RuntimeError("%ls should have 1 or more parameters (tensor dimensions, e.g. [vecdim] or [rows, cols]) plus other optional parameters (needGradient=[true|false], init=[uniform|gaussian|fixedvalue], initValueScale=[1|float], value=[0|float]).", cnNodeType.c_str());
}
else
{
if (parameter.size() < 3)
RuntimeError("%ls should have 3 parameters [imageWidth, imageHeight, imageChannels] plus other optional parameters (needGradient=[true|false], init=[uniform|gaussian|fixedvalue], initValueScale=[1|float], value=[0|float]).", cnNodeType.c_str());
}

if (pass == ndlPassInitial)
{
// evaluate only scalar parameters
vector<void*> params = EvaluateParameters(node, baseName, 0, parameter.size(), pass);
size_t imageWidth = ((NDLNode<ElemType>*)params[0])->GetScalar();
size_t imageHeight = ((NDLNode<ElemType>*)params[1])->GetScalar();
size_t imageChannels = ((NDLNode<ElemType>*)params[2])->GetScalar();
size_t numImages = parameter.size() > 3 ? ((NDLNode<ElemType>*)params[3])->GetScalar() : 1;

nodePtr = builder.CreateSparseInputNode(name, ImageLayoutWHC(imageWidth, imageHeight, imageChannels), numImages);
}
}
else if (OperationNameOf(LearnableParameter) == cnNodeType)
{
if (parameter.size() < 1 || parameter.size() > 2)
RuntimeError("%ls should have 1 or 2 parameters[rows, [cols=1]] plus other optional parameters (needGradient=[true|false], init=[uniform|gaussian|fixedvalue], initValueScale=[1|float], value=[0|float]).", cnNodeType.c_str());

if (pass == ndlPassInitial)
{
// evaluate only scalar parameters
vector<void*> params = EvaluateParameters(node, baseName, 0, parameter.size(), pass);
size_t rows = ((NDLNode<ElemType>*)params[0])->GetScalar();
size_t cols = params.size() > 1 ? ((NDLNode<ElemType>*)params[1])->GetScalar() : 1;

size_t i = 0;
auto tensorShape = ProcessTensorShapeParameters(node, params, i, isImage, cnNodeType);
if (isImage)
tensorShape.AppendInPlace(3, 1); // this goes into the column dimension
bool needGradient = node->GetOptionalParameter("needGradient", "true");

nodePtr = builder.CreateLearnableParameter(name, rows, cols);

nodePtr = builder.CreateLearnableParameter(name, tensorShape);
nodePtr->SetParameterUpdateRequired(needGradient);
}
else if (pass == ndlPassFinal)
@@ -305,7 +289,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
nodePtr->SetParameterUpdateRequired(needGradient);
}
}
else if (cnNodeType == OperationNameOf(ReshapeNode))
else if (cnNodeType == L"Reshape"/*OperationNameOf(ReshapeNode)*/)
{
if (parameter.size() < 2 || parameter.size() > 5)
RuntimeError("Reshape should have two to five parameters. Usage: Reshape(origNodeName, numRows, [imageWidth=], [imageHeight=], [imageChannels=]).");
@@ -323,18 +307,19 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t img_channels = node->GetOptionalParameter("imageChannels", "0");

bool needGradient = node->GetOptionalParameter("needGradient", "false");
nodePtr = builder.Reshape(NULL, num_rows, ImageLayoutWHC(img_width, img_height, img_channels), name);
nodePtr = builder.DeprecatedReshape(NULL, num_rows, ImageDimensions::AsTensorShape(img_width, img_height, img_channels, ImageLayoutKind::HWC/*legacy*/), name); // BUGBUG: use a tensor descriptor instead
nodePtr->SetParameterUpdateRequired(needGradient);
}
}
else if (cnNodeType == OperationNameOf(PastValueNode) ||
cnNodeType == OperationNameOf(FutureValueNode))
{
if (parameter.size() <2 || parameter.size() >3)
RuntimeError("PastValue or FutureValue should have two to three fixed parameters. Usage: PastValue(rows, [cols], m, [timeStep=1, defaultPastValue=0.1]).");
if (parameter.size() < 2 || parameter.size() > 3) // we allow 3 for legacy (cols parameter which is now unused)
RuntimeError("PastValue or FutureValue should have two to three fixed parameters. Usage: PastValue(rows, input, [timeStep=1, defaultPastValue=0.1]).");
// TODO: allow a tensor descriptor. Or allow 0 (inference). Maybe already supported--check this.

nodeParamCount = 1;
nodeParamStart = parameter.size() > 2?2:1;
nodeParamCount = 1; // number of inputs
nodeParamStart = parameter.size() > 2?2:1; // index of input

if (pass == ndlPassInitial)
{
@@ -342,24 +327,22 @@ namespace Microsoft { namespace MSR { namespace CNTK {
vector<void*> params = EvaluateParameters(node, baseName, 0, parameter.size(), pass);
size_t rows = ((NDLNode<ElemType>*)params[0])->GetScalar();
// if we have three parameters the second is columns
size_t cols = parameter.size() > 2 ? ((NDLNode<ElemType>*)params[1])->GetScalar() : 1;
// ignore legacy size_t cols = parameter.size() > 2 ? ((NDLNode<ElemType>*)params[1])->GetScalar() : 1;

bool needGradient = node->GetOptionalParameter("needGradient", "false");
//bool needGradient = node->GetOptionalParameter("needGradient", "false"); // TODO: what's this for?
float defaultHiddenActivity = node->GetOptionalParameter("defaultHiddenActivity", "0.1"); // TODO: parameter should be called 'defaultHiddenActivation'

//for backward compatibility we check timeStep first
// for backward compatibility we check 'timeStep' first
size_t timeStep = node->GetOptionalParameter("timeStep", "1");
if (timeStep == 1)
{
timeStep = node->GetOptionalParameter("delayTime", "1");
}

if (cnNodeType == OperationNameOf(PastValueNode))
nodePtr = builder.PastValue(NULL, defaultHiddenActivity, rows, cols, timeStep, name);
nodePtr = builder.PastValue(NULL, defaultHiddenActivity, rows, timeStep, name);
else
nodePtr = builder.FutureValue(NULL, defaultHiddenActivity, rows, cols, timeStep, name);
nodePtr = builder.FutureValue(NULL, defaultHiddenActivity, rows, timeStep, name);

nodePtr->SetParameterUpdateRequired(needGradient); // TODO: what's this for?
//nodePtr->SetParameterUpdateRequired(needGradient); // TODO: what's this for?
}
}
else if (cnNodeType == OperationNameOf(ConvolutionNode))
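
[Editor's note: the hunk above keeps honoring the legacy 'delayTime' parameter when 'timeStep' is left at its default. A minimal standalone sketch of that fallback follows; the map-based lookup is illustrative only, as the real code uses NDLNode::GetOptionalParameter.]

#include <cstddef>
#include <map>
#include <string>

// Hypothetical stand-in for NDLNode::GetOptionalParameter: returns the default
// when the option is absent.
static size_t GetOpt(const std::map<std::string, size_t>& opts, const char* name, size_t dflt)
{
    auto it = opts.find(name);
    return it == opts.end() ? dflt : it->second;
}

static size_t ResolveTimeStep(const std::map<std::string, size_t>& opts)
{
    size_t timeStep = GetOpt(opts, "timeStep", 1); // preferred spelling
    if (timeStep == 1)                             // 1 is also the default, so fall back to the legacy name
        timeStep = GetOpt(opts, "delayTime", 1);
    return timeStep;
}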
@@ -383,16 +366,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t outputChannels = ((NDLNode<ElemType>*)params[id++])->GetScalar();
size_t horizontalSubsample = ((NDLNode<ElemType>*)params[id++])->GetScalar();
size_t verticalSubsample = ((NDLNode<ElemType>*)params[id++])->GetScalar();

assert (id == 5);

//optional
// optional
ImageLayoutKind imageLayoutKind = ImageLayoutKindFrom(node->GetOptionalParameter("imageLayout", "HWC"));
bool zeroPadding = node->GetOptionalParameter("zeroPadding", "false");
size_t maxTempMemSizeInSamples = node->GetOptionalParameter("maxTempMemSizeInSamples", "0");

nodePtr = builder.Convolution(NULL, NULL, kernelWidth, kernelHeight, outputChannels,
horizontalSubsample, verticalSubsample, zeroPadding, name, maxTempMemSizeInSamples);
horizontalSubsample, verticalSubsample, imageLayoutKind, zeroPadding, maxTempMemSizeInSamples, name);
}
}
else if (cnNodeType == OperationNameOf(MaxPoolingNode))
@@ -415,11 +397,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t windowHeight = ((NDLNode<ElemType>*)params[id++])->GetScalar();
size_t horizontalSubsample = ((NDLNode<ElemType>*)params[id++])->GetScalar();
size_t verticalSubsample = ((NDLNode<ElemType>*)params[id++])->GetScalar();

assert (id == 4);

ImageLayoutKind imageLayoutKind = ImageLayoutKindFrom(node->GetOptionalParameter("imageLayout", "HWC"));

nodePtr = builder.MaxPooling(NULL, /*inputWidth,inputHeight, channels,*/windowWidth, windowHeight,
horizontalSubsample, verticalSubsample, name);
horizontalSubsample, verticalSubsample, imageLayoutKind, name);
}
}
else if (cnNodeType == OperationNameOf(AveragePoolingNode))
@@ -442,11 +425,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t windowHeight = ((NDLNode<ElemType>*)params[id++])->GetScalar();
size_t horizontalSubsample = ((NDLNode<ElemType>*)params[id++])->GetScalar();
size_t verticalSubsample = ((NDLNode<ElemType>*)params[id++])->GetScalar();
assert(id == 4);

assert (id == 4);
ImageLayoutKind imageLayoutKind = ImageLayoutKindFrom(node->GetOptionalParameter("imageLayout", "HWC"));

nodePtr = builder.AveragePooling(NULL, /*inputWidth,inputHeight, channels,*/windowWidth, windowHeight,
horizontalSubsample, verticalSubsample, name);
horizontalSubsample, verticalSubsample, imageLayoutKind, name);
}
}
else if (cnNodeType == OperationNameOf(BatchNormalizationNode))
@@ -543,6 +527,32 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
}

// ProcessTensorShapeParameters - assume positional parameters starting from position i are tensor dimensions--parse those.
// If isImage, then this must be a 3D tensor, which is interpreted as (W,H,C); the optional parameter 'imageLayout' says how.
template<class ElemType>
TensorShape SynchronousNodeEvaluator<ElemType>::ProcessTensorShapeParameters(const NDLNode<ElemType>* node, const vector<void*> & params, size_t & i, bool isImage, const wstring & cnNodeType/*for error messages only*/)
{
// gather dims
vector<size_t> dims;
dims.push_back(((NDLNode<ElemType>*)params[i])->GetScalar()); // first is mandatory
for (i++; i < params.size(); i++)
dims.push_back(((NDLNode<ElemType>*)params[i])->GetScalar());

// turn into tensor
TensorShape tensorShape(dims);

// if image then interpret as W, H, C with layout according to optional imageLayout parameter
if (isImage)
{
if (dims.size() != 3)
RuntimeError("%ls should have 3 parameters [width, height, numChannels].", cnNodeType.c_str());
ImageLayoutKind imageLayoutKind = ImageLayoutKindFrom(node->GetOptionalParameter("imageLayout", "HWC"));
tensorShape = ImageDimensions::AsTensorShape(tensorShape[0], tensorShape[1], tensorShape[2], imageLayoutKind);
}

return tensorShape;
}

template class SynchronousExecutionEngine<float>;
template class SynchronousExecutionEngine<double>;
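
[Editor's note: a small sketch (not part of the commit) of the dimension gathering that ProcessTensorShapeParameters performs, with plain size_t values standing in for the evaluated NDL scalar parameters.]

#include <cstddef>
#include <vector>

// Collects all positional parameters from index i onward as tensor dimensions,
// mirroring the loop above; e.g. yields {13, 42} for ParameterTensor(13, 42).
static std::vector<size_t> GatherTensorDims(const std::vector<size_t>& positionals, size_t& i)
{
    std::vector<size_t> dims;
    dims.push_back(positionals[i]);          // the first dimension is mandatory
    for (i++; i < positionals.size(); i++)   // every further positional adds one more axis
        dims.push_back(positionals[i]);
    return dims;
}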
@@ -290,7 +290,7 @@ public:
{
fprintf(stderr, "'multiSeq' tag is defunct.\n");
}
else if (!_strnicmp(value.c_str(), "eval", 4)) // only compare the first 4 characters
else if (!_strnicmp(value.c_str(), "eval", 4)) // only compare the first 4 characters. Yikes!!
{
SetOutputNode(m_net->EvaluationNodes(), compNode);
}
@@ -326,9 +326,10 @@ public:
return nullptr;
}

virtual ~SynchronousNodeEvaluator()
{
}
virtual ~SynchronousNodeEvaluator() { }

protected:
TensorShape ProcessTensorShapeParameters(const NDLNode<ElemType>* node, const vector<void*> & params, size_t & i, bool isImage, const wstring & cnNodeType/*for error messages only*/);

private:
ComputationNetworkPtr m_net;
@@ -489,7 +489,7 @@ namespace Microsoft { namespace MSR { namespace ScriptableObjects {
std::vector<C> res;
res.reserve(GetSize(Fail));
for (const auto & val : values)
res.push_back(val);
res.push_back(val.ResolveValue()); // resolve upon access
return res;
}
};
@@ -196,7 +196,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
LogicError("AddSequence: Sequence added to an MBLayout must overlap with minibatch.");

// remember it
#ifdef _DEBUG
#if 0//def _DEBUG
auto cap = m_sequences.capacity(); // Some sanity check for debugging a speed regression. This should only show up during the first minibatches, and growing only.
m_sequences.push_back(seqDesc);
if (cap != m_sequences.capacity())
@@ -1,6 +1,6 @@
// DataTensor.h -- tensor descriptor that describes the inner structure of data vectors
// TensorShape.h -- tensor descriptor that describes the inner structure of data vectors
//
// <copyright file="Sequences.h" company="Microsoft">
// <copyright file="TensorShape.h" company="Microsoft">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//
@@ -90,6 +90,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
T m_data[12];
size_t m_size;
#ifdef _DEBUG
void DebugWipe() { memset(m_data, 0, sizeof(m_data)); } // initialize to 0 to make it look prettier in the debugger
#else
void DebugWipe() { }
#endif
public:
size_t capacity() const { return _countof(m_data); }
size_t size() const { return m_size; }

@@ -103,12 +108,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template<class ITER>
void assign(ITER beg, const ITER & end) { clear(); append(beg,end); }
void operator=(const SmallVector & other) { m_size = other.m_size; memcpy(m_data, other.m_data, other.m_size * sizeof(T)); }
SmallVector(const SmallVector & other) { *this = other; }
SmallVector(size_t sz, const T & val) { assign(sz, val); }
SmallVector(const SmallVector & other) { DebugWipe(); *this = other; }
SmallVector(size_t sz, const T & val) { DebugWipe(); assign(sz, val); }
SmallVector(size_t sz) : SmallVector(sz, 0) { }
SmallVector() : SmallVector(0) { }
SmallVector(const std::vector<T> & v) { assign(v.begin(), v.end()); }
SmallVector(const std::initializer_list<T> & l) { assign(l.begin(), l.end()); }
SmallVector(const std::vector<T> & v) { DebugWipe(); assign(v.begin(), v.end()); }
SmallVector(const std::initializer_list<T> & l) { DebugWipe(); assign(l.begin(), l.end()); }
bool operator==(const SmallVector & other) const { return size() == other.size() && !memcmp(data(), other.data(), other.m_size * sizeof(T)); }
bool operator!=(const SmallVector & other) const { return !operator==(other); } // duh
T operator[](size_t i) const { if (i >= size()) LogicError("SmallVector: index overflow"); return m_data[i]; }
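
[Editor's note: an illustrative snippet, assuming the SmallVector class shown above, of its fixed-capacity, memcpy-copied design.]

#include <cassert>
#include <vector>

void SmallVectorExample()
{
    SmallVector<size_t> v(std::vector<size_t>{ 28, 28, 3 });
    assert(v.size() == 3 && v.capacity() == 12); // storage is an in-place 12-element array
    SmallVector<size_t> w(v);                    // copy runs through memcpy of m_data; DebugWipe() keeps the debugger view clean
    assert(w == v);
}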
@@ -203,28 +208,28 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
}

void Load(File& fstream)
void Load(File& fstream, bool acceptLegacyFormat = false)
{
// format: uint32_t n, dim[0], dim[1], ..., dim[n-1]
// We are also able to read (but not write) an older format, which stores 3-dimensional tensors as size_t W, H, C
uint32_t n, dim;
fstream >> n >> dim;
if (dim) // heuristic to detect the old format. Old format stores a size_t, i.e. the second uint32_t is 0 (no dimensions are > 4G)
uint32_t rank, dim0;
fstream >> rank >> dim0;
if (!acceptLegacyFormat || dim0 != 0) // heuristic to detect the old format. Old format stores a size_t, i.e. the second uint32_t is 0 (no dimensions are > 4G)
{
m_dims.resize(n);
m_dims[0] = dim;
for (size_t i = 1; i < n; i++)
m_dims.resize(rank);
m_dims[0] = dim0;
for (size_t i = 1; i < rank; i++)
{
fstream >> dim;
m_dims[i] = dim;
fstream >> dim0;
m_dims[i] = dim0;
}
assert(n == m_dims.size());
assert(rank == m_dims.size());
}
else // detected the old size_t W, H, C format
{
m_dims.resize(3); // current format is hard-coded for 3, for back compat
m_dims[1] = n;
fstream >> m_dims[2] >> m_dims[0]; // currently stored in order W, H, C. TODO: general tensor format will be different
m_dims.resize(3);
m_dims[1] = rank;
fstream >> m_dims[2] >> m_dims[0]; // stored in order C, W, H
}
InitAsNoSlice();
}
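
[Editor's note: a sketch of why the 'dim0 != 0' test above distinguishes the two on-disk layouts. It assumes a little-endian build and that File streams raw binary values; the helper functions are hypothetical.]

#include <cstdint>
#include <cstdio>

// New format: uint32_t rank, then rank uint32_t dims. dims[0] is never 0,
// so the second uint32_t read back is nonzero.
static void WriteNewFormat(FILE* f, const uint32_t dims[], uint32_t rank)
{
    fwrite(&rank, sizeof(rank), 1, f);
    fwrite(dims, sizeof(uint32_t), rank, f);
}

// Legacy format: three size_t values. For any dimension below 4G the high
// 32 bits are 0, so the second uint32_t read back is 0 -- the heuristic's cue.
static void WriteLegacyFormat(FILE* f, size_t v1, size_t v2, size_t v3)
{
    fwrite(&v1, sizeof(v1), 1, f);
    fwrite(&v2, sizeof(v2), 1, f);
    fwrite(&v3, sizeof(v3), 1, f);
}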
@@ -243,13 +248,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
const SmallVector<size_t> & GetDims() const { return m_dims; } // get all, e.g. for logging or for constructing derived tensors with edited dimensions
const SmallVector<ptrdiff_t> & GetStrides() const { return m_strides; }

// interpretation as an image tensor
size_t GetNumChannels() const { if (m_dims.empty()) return 0; else return m_dims.size() > 0 ? m_dims[0] : 1; }
size_t GetWidth() const { if (m_dims.empty()) return 0; else return m_dims.size() > 1 ? m_dims[1] : 1; }
size_t GetHeight() const { if (m_dims.empty()) return 0; else return m_dims.size() > 2 ? m_dims[2] : 1; }
// heuristics used for pretty-printing
// TODO: This will go away.
bool IsInputAnImage() const { return GetRank() == 3 && (GetWidth() != 1 || GetNumChannels() != 1); }
// legacy helper function for RowSliceNode. Will go away.
bool IsVectorStoredAsImage() const { return GetRank() == 3 && m_dims[0] == 1 && m_dims[1] == 1; }

// indexing
@@ -316,8 +315,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// m_dims = I 1 J K
// m_strides = 1 I I I*J
// dropping the second dimension
// m_dims = I % J K
// m_strides = 1 % I I*J
// m_dims = I J K
// m_strides = 1 I I*J
m_dims[j] = m_dims[k];
m_strides[j] = m_strides[k];
j++;
@@ -442,15 +441,61 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// TODO: Does the same trick work for 2D images?
};

// When constructing an image tensor with the usual W, H, C format, use the following function instead.
// This will sort the three parameters into the correct order.
// BUGBUG: at several places, a comment says "after multiplication the structure is lost" and the vector dimension
// is set as the image height. However, the image height is actually the wrong dimension since images are assumed transposed.
// This will get fixed once we get more complete arbitrary tensor support throughout, including better-defined inference rules.
static inline TensorShape ImageLayoutWHC(size_t width, size_t height, size_t channels)
// image layouts used in CNTK
// Nodes that do semantic interpretation of width, height, channel information must know which index they are in.
// Eventually this can go away once we switch completely to cudnn layout.
// The cudnn layout is actually our layout in order W,H,C.
enum ImageLayoutKind
{
return TensorShape(channels, width, height);
HWC, // legacy; default for NDL
CHW // cudnn; default for BrainScript
};
static inline std::string ToString(ImageLayoutKind imageLayoutKind)
{
if (imageLayoutKind == ImageLayoutKind::CHW) return "CHW";
else if (imageLayoutKind == ImageLayoutKind::HWC) return "HWC";
else LogicError("ImageLayout: Invalid ImageLayoutKind");
}
// TODO: we need a constructor from config; that will allow us to generalize
static inline ImageLayoutKind ImageLayoutKindFrom(const wstring & s)
{
if (s == L"CHW" || s == L"cudnn") return ImageLayoutKind::CHW;
else if (s == L"HWC" || s == L"legacy") return ImageLayoutKind::HWC;
else InvalidArgument("ImageLayoutKindFrom: Unknown ImageLayoutKind '%ls', must be 'CHW' (cudnn) or 'HWC' (CNTK legacy)", s.c_str());
}
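
[Editor's note: illustrative calls showing the spellings the parser above accepts.]

ImageLayoutKind a = ImageLayoutKindFrom(L"CHW");    // also accepts L"cudnn"
ImageLayoutKind b = ImageLayoutKindFrom(L"legacy"); // alias for L"HWC"
// any other string raises InvalidArgument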
// interpret TensorShape as an image descriptor
// considering that we support two ways of storing images
struct ImageDimensions
{
size_t m_width, m_height, m_numChannels;
// interpret TensorShape as image
ImageDimensions(const TensorShape & shape, ImageLayoutKind imageLayoutKind)
{
if (shape.GetRank() != 3)
InvalidArgument("Convolution operation currently only supports 1D or 2D convolution on 3D tensors.");
if (imageLayoutKind == ImageLayoutKind::CHW)
{
m_width = shape[0];
m_height = shape[1];
m_numChannels = shape[2];
}
else if (imageLayoutKind == ImageLayoutKind::HWC)
{
m_width = shape[1];
m_height = shape[2];
m_numChannels = shape[0];
}
else LogicError("WHC: Invalid ImageLayoutKind");
}
ImageDimensions(size_t width, size_t height, size_t numChannels) : m_width(width), m_height(height), m_numChannels(numChannels) {}
// interpret image as TensorShape
static TensorShape AsTensorShape(size_t width, size_t height, size_t numChannels, ImageLayoutKind imageLayoutKind/* = ImageLayoutKind::HWC*/)
{
if (imageLayoutKind == ImageLayoutKind::CHW) return TensorShape(width, height, numChannels);
else if (imageLayoutKind == ImageLayoutKind::HWC) return TensorShape(numChannels, width, height);
else LogicError("ImageLayout: Invalid ImageLayoutKind");
}
TensorShape AsTensorShape(ImageLayoutKind imageLayoutKind) { return AsTensorShape(m_width, m_height, m_numChannels, imageLayoutKind); }
};

}}}
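
[Editor's note: a round-trip example of the struct above under the legacy HWC layout; a (W,H,C) = (32,32,3) image maps to tensor dims (3,32,32) and back.]

TensorShape t = ImageDimensions::AsTensorShape(32, 32, 3, ImageLayoutKind::HWC); // dims (3, 32, 32): channels first
ImageDimensions d(t, ImageLayoutKind::HWC);
// d.m_width == 32, d.m_height == 32, d.m_numChannels == 3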
@@ -251,7 +251,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Base::Load(fstream, modelVersion);
fstream >> m_hasComputed;
LoadValue(fstream);
}
// Note: This loses the sample layout, but that is recovered by Validate().
}

virtual void DumpNodeInfo(const bool printValues, File& fstream) const override
{
@@ -654,8 +654,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
for (auto nodeIter = convolutionNodes.begin(); nodeIter != convolutionNodes.end(); nodeIter++)
{
auto node = dynamic_pointer_cast<ConvolutionNode<float>>(*nodeIter);
node->SetmMaxTempMemSizeInSamples(maxTempMemSizeInSamples);
auto nodef = dynamic_pointer_cast<ConvolutionNode<float>>(*nodeIter);
if (nodef)
nodef->SetmMaxTempMemSizeInSamples(maxTempMemSizeInSamples);
auto noded = dynamic_pointer_cast<ConvolutionNode<double>>(*nodeIter);
if (noded)
noded->SetmMaxTempMemSizeInSamples(maxTempMemSizeInSamples);
}
}
}
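
[Editor's note: the hunk above fixes an unconditional dereference of the float-typed cast, which would fail on a double-precision network. A self-contained sketch of the corrected pattern, with made-up node types.]

#include <memory>

struct Base { virtual ~Base() = default; };
struct FloatNode  : Base { void Configure() { } };
struct DoubleNode : Base { void Configure() { } };

// dynamic_pointer_cast yields an empty shared_ptr on a type mismatch, so each
// precision is probed and only the matching node is configured.
static void ConfigureNode(const std::shared_ptr<Base>& node)
{
    if (auto f = std::dynamic_pointer_cast<FloatNode>(node))  f->Configure();
    if (auto d = std::dynamic_pointer_cast<DoubleNode>(node)) d->Configure();
}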
@@ -35,7 +35,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// please keep this table sorted
if (nodeType == OperationNameOf(CRFNode)) return New<CRFNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(ClassBasedCrossEntropyWithSoftmaxNode))return New<ClassBasedCrossEntropyWithSoftmaxNode<ElemType>>(forward<_Types>(_Args)...);
#if 0// change once we no longer see a perf hit to #ifdef ENABLE_TENSORVIEW
#ifdef ENABLE_BROADCASTING_ELEMENTTIMES
else if (nodeType == L"ColumnElementTimes") return New<ElementTimesNode<ElemType>>(forward<_Types>(_Args)...);
#else
else if (nodeType == OperationNameOf(ColumnElementTimesNode)) return New<ColumnElementTimesNode<ElemType>>(forward<_Types>(_Args)...);

@@ -76,7 +76,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
else if (nodeType == OperationNameOf(ReconcileMBLayoutNode)) return New<ReconcileMBLayoutNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(RectifiedLinearNode)) return New<RectifiedLinearNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(ReshapeNode)) return New<ReshapeNode<ElemType>>(forward<_Types>(_Args)...);
#if 0// change once we no longer see a perf hit to #ifdef ENABLE_TENSORVIEW
#ifdef ENABLE_BROADCASTING_ELEMENTTIMES
else if (nodeType == L"RowElementTimes") return New<ElementTimesNode<ElemType>>(forward<_Types>(_Args)...);
#else
else if (nodeType == OperationNameOf(RowElementTimesNode)) return New<RowElementTimesNode<ElemType>>(forward<_Types>(_Args)...);
@@ -85,7 +85,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
else if (nodeType == OperationNameOf(DiagonalNode)) return New<DiagonalNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(RowSliceNode)) return New<RowSliceNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(RowStackNode)) return New<RowStackNode<ElemType>>(forward<_Types>(_Args)...);
#if 0// change once we no longer see a perf hit to #ifdef ENABLE_TENSORVIEW
#ifdef ENABLE_BROADCASTING_ELEMENTTIMES
else if (nodeType == L"Scale") return New<ElementTimesNode<ElemType>>(forward<_Types>(_Args)...);
#else
else if (nodeType == OperationNameOf(ScaleNode)) return New<ScaleNode<ElemType>>(forward<_Types>(_Args)...);

@@ -107,6 +107,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
else if (nodeType == L"Delay") return New<PastValueNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == L"PerDimMeanVarNormalizationNode") return New<PerDimMeanVarNormalizationNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == L"PerDimMeanVarNormalizationNode") return New<PerDimMeanVarNormalizationNode<ElemType>>(forward<_Types>(_Args)...);
#if 1
else if (nodeType == OperationNameOf(DeprecatedReshapeNode)) return New<DeprecatedReshapeNode<ElemType>>(forward<_Types>(_Args)...);
#endif
else InvalidArgument("Attempted to instantiate undefined operation %ls.", nodeType.c_str());
}
@@ -116,14 +119,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
static shared_ptr<ComputationNode<ElemType>> CreateNode(const std::wstring & nodeType, _Types&&... _Args)
{
// check more types
if (nodeType == OperationNameOf(AveragePoolingNode)) return New<AveragePoolingNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(ConvolutionNode)) return New<ConvolutionNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(SparseInputValue)) return New<SparseInputValue<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(InputValue)) return New<InputValue<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(LearnableParameter)) return New<LearnableParameter<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(MaxPoolingNode)) return New<MaxPoolingNode<ElemType>>(forward<_Types>(_Args)...);
if (nodeType == OperationNameOf(AveragePoolingNode)) return New<AveragePoolingNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(BatchNormalizationNode)) return New<BatchNormalizationNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(ConvolutionNode)) return New<ConvolutionNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(SparseInputValue)) return New<SparseInputValue<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(InputValue)) return New<InputValue<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(LearnableParameter)) return New<LearnableParameter<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(MaxPoolingNode)) return New<MaxPoolingNode<ElemType>>(forward<_Types>(_Args)...);
//else if (nodeType == OperationNameOf(SparseLearnableParameter)) return New<SparseLearnableParameter<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(BatchNormalizationNode)) return New<BatchNormalizationNode<ElemType>>(forward<_Types>(_Args)...);
else return CreateStandardNode<ElemType>(nodeType, forward<_Types>(_Args)...);
}
@@ -175,6 +178,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return net.AddNodeToNetWithElemType(New<LearnableParameter<ElemType>>(net.GetDeviceId(), paramName, rows, cols));
}

template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateLearnableParameter(const std::wstring & paramName, const TensorShape & tensorShape)
{
return net.AddNodeToNetWithElemType(New<LearnableParameter<ElemType>>(net.GetDeviceId(), paramName, tensorShape));
}

#if 0 // not functional at present
//sparse matrix size is optionally specified
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateSparseLearnableParameter(const std::wstring & paramName, const size_t rows, const size_t cols, const size_t size)
@@ -183,28 +191,24 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
#endif

template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateInputNode(const std::wstring & inputName, const size_t rows, const size_t cols)
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateInputNode(const std::wstring & inputName, const size_t rows)
{
return net.AddNodeToNetWithElemType(New<InputValue<ElemType>>(net.GetDeviceId(), inputName, rows, cols));
return net.AddNodeToNetWithElemType(New<InputValue<ElemType>>(net.GetDeviceId(), inputName, rows));
}

template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateSparseInputNode(const std::wstring & inputName, const size_t rows, const size_t cols)
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateSparseInputNode(const std::wstring & inputName, const size_t rows)
{
return net.AddNodeToNetWithElemType(New<SparseInputValue<ElemType>>(net.GetDeviceId(), inputName, rows, cols));
return net.AddNodeToNetWithElemType(New<SparseInputValue<ElemType>>(net.GetDeviceId(), inputName, rows));
}

template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateInputNode(const std::wstring & inputName,
const TensorShape & imageLayout,
const size_t numImages)
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateInputNode(const std::wstring & inputName, const TensorShape & sampleLayout)
{
return net.AddNodeToNetWithElemType(New<InputValue<ElemType>>(net.GetDeviceId(), inputName, imageLayout, numImages));
return net.AddNodeToNetWithElemType(New<InputValue<ElemType>>(net.GetDeviceId(), inputName, sampleLayout));
}

template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateSparseInputNode(const std::wstring & inputName,
const TensorShape & imageLayout,
const size_t numImages)
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateSparseInputNode(const std::wstring & inputName, const TensorShape & imageLayout)
{
return net.AddNodeToNetWithElemType(New<SparseInputValue<ElemType>>(net.GetDeviceId(), inputName, imageLayout, numImages));
return net.AddNodeToNetWithElemType(New<SparseInputValue<ElemType>>(net.GetDeviceId(), inputName, imageLayout));
}

template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreatePairNetworkNode(const std::wstring & inputName, const size_t rows, const size_t cols)
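
[Editor's note: illustrative use of the reworked factory methods above; this assumes TensorShape can be built from a dimension vector, as ProcessTensorShapeParameters does, and 'net' is an existing network.]

ComputationNetworkBuilder<float> builder(*net);
auto features = builder.CreateInputNode(L"features", TensorShape(std::vector<size_t>{ 28, 28, 3 })); // full per-sample layout
auto labels   = builder.CreateInputNode(L"labels", 10);                                              // rank-1 convenience overload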
@@ -215,37 +219,26 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateConvolutionNode(const std::wstring & nodeName,
const size_t kernelWidth, const size_t kernelHeight, const size_t outputChannels,
const size_t horizontalSubsample, const size_t verticalSubsample,
const bool zeroPadding,
ImageLayoutKind imageLayoutKind, const bool zeroPadding,
const size_t maxTempMemSizeInSamples)
{
return net.AddNodeToNetWithElemType(New<ConvolutionNode<ElemType>>(net.GetDeviceId(), nodeName,
kernelWidth, kernelHeight,
outputChannels,
horizontalSubsample,
verticalSubsample, zeroPadding,
maxTempMemSizeInSamples));
kernelWidth, kernelHeight, outputChannels,
horizontalSubsample, verticalSubsample, imageLayoutKind,
zeroPadding,
maxTempMemSizeInSamples));
}

template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateMaxPoolingNode(const std::wstring & nodeName,
const size_t windowWidth,
const size_t windowHeight,
const size_t horizontalSubsample,
const size_t verticalSubsample)
const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind)
{
return net.AddNodeToNetWithElemType(New<MaxPoolingNode<ElemType>>(net.GetDeviceId(), nodeName,
windowWidth, windowHeight,
horizontalSubsample,
verticalSubsample));
return net.AddNodeToNetWithElemType(New<MaxPoolingNode<ElemType>>(net.GetDeviceId(), nodeName, windowWidth, windowHeight, horizontalSubsample, verticalSubsample, imageLayoutKind));
}

template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateAveragePoolingNode(const std::wstring & nodeName, const size_t windowWidth,
const size_t windowHeight, const size_t horizontalSubsample,
const size_t verticalSubsample)
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateAveragePoolingNode(const std::wstring & nodeName,
const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind)
{
return net.AddNodeToNetWithElemType(New<AveragePoolingNode<ElemType>>(net.GetDeviceId(), nodeName,
windowWidth, windowHeight,
horizontalSubsample,
verticalSubsample));
return net.AddNodeToNetWithElemType(New<AveragePoolingNode<ElemType>>(net.GetDeviceId(), nodeName, windowWidth, windowHeight, horizontalSubsample, verticalSubsample, imageLayoutKind));
}

// this is the catch-all for all cases not covered as special cases above
@@ -274,49 +267,30 @@ namespace Microsoft { namespace MSR { namespace CNTK {

template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Convolution(const ComputationNodePtr weight,
const ComputationNodePtr inputValues,
const size_t kernelWidth,
const size_t kernelHeight,
const size_t outputChannels,
const size_t horizontalSubsample,
const size_t verticalSubsample,
const bool zeroPadding,
const std::wstring nodeName,
const size_t maxTempMemSizeInSamples)
const size_t kernelWidth, const size_t kernelHeight, const size_t outputChannels, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind, const bool zeroPadding, const size_t maxTempMemSizeInSamples,
const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<ConvolutionNode<ElemType>>(net.GetDeviceId(), nodeName,
kernelWidth, kernelHeight,
outputChannels,
horizontalSubsample,
verticalSubsample, zeroPadding,
kernelWidth, kernelHeight, outputChannels, horizontalSubsample, verticalSubsample, imageLayoutKind, zeroPadding,
maxTempMemSizeInSamples),
weight, inputValues);
}

template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::MaxPooling(const ComputationNodePtr inputValues,
const size_t windowWidth,
const size_t windowHeight,
const size_t horizontalSubsample,
const size_t verticalSubsample,
const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind,
const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<MaxPoolingNode<ElemType>>(net.GetDeviceId(), nodeName,
windowWidth, windowHeight,
horizontalSubsample,
verticalSubsample),
windowWidth, windowHeight, horizontalSubsample, verticalSubsample, imageLayoutKind),
inputValues);
}

template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::AveragePooling(const ComputationNodePtr inputValues,
const size_t windowWidth,
const size_t windowHeight,
const size_t horizontalSubsample,
const size_t verticalSubsample,
const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind,
const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<AveragePoolingNode<ElemType>>(net.GetDeviceId(), nodeName,
windowWidth, windowHeight,
horizontalSubsample,
verticalSubsample),
windowWidth, windowHeight, horizontalSubsample, verticalSubsample, imageLayoutKind),
inputValues);
}
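
An illustrative call of the updated builder API (not code from this commit; 'b', 'W', and 'x' are hypothetical builder, weight-node, and input-node variables, and ImageLayoutKind::CHW is assumed to mirror the 'CHW' BrainScript value): a 5x5 convolution with 32 output channels and stride 1, using zero-padding, would now be written as

auto c = b.Convolution(W, x, /*kernelWidth=*/5, /*kernelHeight=*/5, /*outputChannels=*/32,
                       /*horizontalSubsample=*/1, /*verticalSubsample=*/1, ImageLayoutKind::CHW,
                       /*zeroPadding=*/true);

Note that imageLayoutKind is threaded through every convolution and pooling entry point above, since the engines must know how to interpret the sample dimensions.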

@@ -486,7 +460,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return net.AddNodeToNetAndAttachInputs(New<SumElementsNode<ElemType>>(net.GetDeviceId(), nodeName), a);
}

#if 1// change once we no longer see a perf hit to #ifndef ENABLE_TENSORVIEW
#ifndef ENABLE_BROADCASTING_ELEMENTTIMES
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Scale(const ComputationNodePtr scalar, const ComputationNodePtr matrix, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<ScaleNode<ElemType>>(net.GetDeviceId(), nodeName), scalar, matrix);

@@ -513,7 +487,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return net.AddNodeToNetAndAttachInputs(New<ElementTimesNode<ElemType>>(net.GetDeviceId(), nodeName), a, b);
}

#if 1// change once we no longer see a perf hit to #ifndef ENABLE_TENSORVIEW
#ifndef ENABLE_BROADCASTING_ELEMENTTIMES
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::RowElementTimes(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<RowElementTimesNode<ElemType>>(net.GetDeviceId(), nodeName), a, b);

@@ -561,12 +535,20 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}

template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Reshape(const ComputationNodePtr a,
const size_t numRows,
const TensorShape & imageLayout,
const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<ReshapeNode<ElemType>>(net.GetDeviceId(), nodeName, numRows, imageLayout), a);
return net.AddNodeToNetAndAttachInputs(New<ReshapeNode<ElemType>>(net.GetDeviceId(), nodeName, imageLayout), a);
}
#if 1
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::DeprecatedReshape(const ComputationNodePtr a,
const size_t numRows,
const TensorShape & imageLayout,
const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<DeprecatedReshapeNode<ElemType>>(net.GetDeviceId(), nodeName, numRows, imageLayout), a);
}
#endif

template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::RowRepeat(const ComputationNodePtr a, const size_t num_repeat, const std::wstring nodeName)
{

@@ -578,14 +560,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return net.AddNodeToNetAndAttachInputs(New<DiagonalNode<ElemType>>(net.GetDeviceId(), nodeName), a);
}

template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::PastValue(const ComputationNodePtr a, const float initHiddenActivity, const size_t row_size, const size_t col_size, size_t timeStep, const std::wstring nodeName)
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::PastValue(const ComputationNodePtr a, const float initHiddenActivity, const size_t row_size, size_t timeStep, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<PastValueNode<ElemType>>(net.GetDeviceId(), nodeName, initHiddenActivity, row_size, col_size, timeStep), a);
return net.AddNodeToNetAndAttachInputs(New<PastValueNode<ElemType>>(net.GetDeviceId(), nodeName, initHiddenActivity, row_size, timeStep), a);
}

template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::FutureValue(const ComputationNodePtr a, const float initHiddenActivity, const size_t row_size, const size_t col_size, size_t timeStep, const std::wstring nodeName)
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::FutureValue(const ComputationNodePtr a, const float initHiddenActivity, const size_t row_size, size_t timeStep, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<FutureValueNode<ElemType>>(net.GetDeviceId(), nodeName, initHiddenActivity, row_size, col_size, timeStep), a);
return net.AddNodeToNetAndAttachInputs(New<FutureValueNode<ElemType>>(net.GetDeviceId(), nodeName, initHiddenActivity, row_size, timeStep), a);
}
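
For reference, the simplified delay-node signatures can be exercised like this (an illustrative call, not code from the commit; 'b' is a builder and 'h' a hypothetical hidden-state node of dimension hiddenDim):

auto hPast = b.PastValue(h, (float)DEFAULT_HIDDEN_ACTIVATION, hiddenDim, 1 /*timeStep*/);

The column dimension that col_size used to pin down is gone from both PastValue and FutureValue, presumably because the number of columns now comes from the minibatch layout at run time.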

template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Parallel(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName)
@@ -9,6 +9,7 @@
#include "ComputationNetwork.h"
#include "TrainingCriterionNodes.h" // for NCEEvalMode
#include "ScriptableObjects.h"
#include "TensorShape.h"
#include <string>

namespace Microsoft { namespace MSR { namespace CNTK {

@@ -39,47 +40,34 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// TODO: separate into nodes that have inputs and those that duplicate functions with input adding except just not adding inputs. Clear?

ComputationNodePtr CreateLearnableParameter(const std::wstring & paramName, const size_t rows, const size_t cols);
ComputationNodePtr CreateLearnableParameter(const std::wstring & paramName, const TensorShape & tensorShape);
//sparse matrix size is optionally specified
//ComputationNodePtr CreateSparseLearnableParameter(const std::wstring & paramName, const size_t rows, const size_t cols, const size_t size = 0);
ComputationNodePtr CreateInputNode(const std::wstring & inputName, const size_t rows, const size_t cols);
ComputationNodePtr CreateSparseInputNode(const std::wstring & inputName, const size_t rows, const size_t cols);
ComputationNodePtr CreateInputNode(const std::wstring & inputName, const TensorShape & imageLayout, const size_t numImages);
ComputationNodePtr CreateSparseInputNode(const std::wstring & inputName, const TensorShape & imageLayout, const size_t numImages);
ComputationNodePtr CreateInputNode(const std::wstring & inputName, const size_t rows);
ComputationNodePtr CreateSparseInputNode(const std::wstring & inputName, const size_t rows);
ComputationNodePtr CreateInputNode(const std::wstring & inputName, const TensorShape & sampleLayout);
ComputationNodePtr CreateSparseInputNode(const std::wstring & inputName, const TensorShape & sampleLayout);
ComputationNodePtr CreatePairNetworkNode(const std::wstring & inputName, const size_t rows, const size_t cols);
ComputationNodePtr CreateConvolutionNode(const std::wstring & nodeName, const size_t kernelWidth, const size_t kernelHeight, const size_t outputChannels, const size_t horizontalSubsample, const size_t verticalSubsample, const bool zeroPadding = false, const size_t maxTempMemSizeInSamples = 0);
ComputationNodePtr CreateMaxPoolingNode(const std::wstring & nodeName, const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample);
ComputationNodePtr CreateAveragePoolingNode(const std::wstring & nodeName, const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample);
ComputationNodePtr CreateConvolutionNode(const std::wstring & nodeName, const size_t kernelWidth, const size_t kernelHeight, const size_t outputChannels, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind, const bool zeroPadding = false, const size_t maxTempMemSizeInSamples = 0);
ComputationNodePtr CreateMaxPoolingNode(const std::wstring & nodeName, const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind);
ComputationNodePtr CreateAveragePoolingNode(const std::wstring & nodeName, const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind);
// this is the catch-all for all cases not covered as special cases above
// Unlike the specialized ones above, this one creates nodes by type given as a string.
ComputationNodePtr CreateComputationNode(const std::wstring & nodeType, const std::wstring & nodeName);
// TODO: These next three functions are wrappers around CreateXXXNode(). Remove these.
ComputationNodePtr Parameter(const size_t rows, size_t cols, const std::wstring nodeName = L"") { return CreateLearnableParameter(nodeName, rows, cols); } // TODO: remove
ComputationNodePtr Input(const size_t rows, const size_t cols, const std::wstring nodeName = L"") { return CreateInputNode(nodeName, rows, cols); } // TODO: remove
ComputationNodePtr Input(const TensorShape & imageLayout, const size_t numImages, const std::wstring nodeName = L"") { return CreateInputNode(nodeName, imageLayout, numImages); } // TODO: remove
// The following functions create nodes and link them to the network and their inputs.
// TODO: Do we need both this set and the one above that does not add inputs? Can they share more code?
ComputationNodePtr PairNetwork(const ComputationNodePtr & a, const std::wstring nodeName = L"");
ComputationNodePtr Convolution(const ComputationNodePtr weight,
const ComputationNodePtr inputValues,
const size_t kernelWidth,
const size_t kernelHeight,
const size_t outputChannels,
const size_t horizontalSubsample,
const size_t verticalSubsample,
const bool zeroPadding = false,
const std::wstring nodeName = L"",
const size_t maxTempMemSizeInSamples = 0);
const size_t kernelWidth, const size_t kernelHeight, const size_t outputChannels,
const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind,
const bool zeroPadding = false, const size_t maxTempMemSizeInSamples = 0,
const std::wstring nodeName = L"");
ComputationNodePtr MaxPooling(const ComputationNodePtr inputValues,
const size_t windowWidth,
const size_t windowHeight,
const size_t horizontalSubsample,
const size_t verticalSubsample,
const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind,
const std::wstring nodeName = L"");
ComputationNodePtr AveragePooling(const ComputationNodePtr inputValues,
const size_t windowWidth,
const size_t windowHeight,
const size_t horizontalSubsample,
const size_t verticalSubsample,
const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind,
const std::wstring nodeName = L"");
ComputationNodePtr ErrorPrediction(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");
ComputationNodePtr PerDimMeanVarNormalization(const ComputationNodePtr feature, const ComputationNodePtr mean, const ComputationNodePtr InvStdDev, const std::wstring nodeName = L"");

@@ -111,14 +99,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
ComputationNodePtr Hardmax(const ComputationNodePtr a, const std::wstring nodeName = L"");
ComputationNodePtr LogSoftmax(const ComputationNodePtr a, const std::wstring nodeName = L"");
ComputationNodePtr Sum(const ComputationNodePtr a, const std::wstring nodeName = L"");
#if 1// change once we no longer see a perf hit to #ifndef ENABLE_TENSORVIEW
#ifndef ENABLE_BROADCASTING_ELEMENTTIMES
ComputationNodePtr Scale(const ComputationNodePtr scalar, const ComputationNodePtr matrix, const std::wstring nodeName = L"");
#endif
ComputationNodePtr Transpose(const ComputationNodePtr matrix, const std::wstring nodeName = L"");
ComputationNodePtr Times(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");
ComputationNodePtr TransposeTimes(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");
ComputationNodePtr ElementTimes(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");
#if 1// change once we no longer see a perf hit to #ifndef ENABLE_TENSORVIEW
#ifndef ENABLE_BROADCASTING_ELEMENTTIMES
ComputationNodePtr RowElementTimes(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");
ComputationNodePtr ColumnElementTimes(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");
#endif

@@ -129,11 +117,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
ComputationNodePtr Plus(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");
ComputationNodePtr Minus(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");
ComputationNodePtr Dropout(const ComputationNodePtr a, const std::wstring nodeName = L"");
ComputationNodePtr Reshape(const ComputationNodePtr a, const size_t num_rows, const TensorShape & imageLayout, const std::wstring nodeName = L"");
ComputationNodePtr Reshape(const ComputationNodePtr a, const TensorShape & imageLayout, const std::wstring nodeName = L"");
#if 1 // legacy
ComputationNodePtr DeprecatedReshape(const ComputationNodePtr a, const size_t num_rows, const TensorShape & imageLayout, const std::wstring nodeName = L"");
#endif
ComputationNodePtr RowRepeat(const ComputationNodePtr a, const size_t num_repeat, const std::wstring nodeName = L"");
ComputationNodePtr Diagonal(const ComputationNodePtr a, const std::wstring nodeName = L"");
ComputationNodePtr PastValue(const ComputationNodePtr a, const float initHiddenActivity, const size_t row_size, const size_t col_size, size_t timeStep, const std::wstring nodeName = L"");
ComputationNodePtr FutureValue(const ComputationNodePtr a, const float initHiddenActivity, const size_t row_size, const size_t col_size, size_t timeStep, const std::wstring nodeName = L"");
ComputationNodePtr PastValue(const ComputationNodePtr a, const float initHiddenActivity, const size_t row_size, size_t timeStep, const std::wstring nodeName = L"");
ComputationNodePtr FutureValue(const ComputationNodePtr a, const float initHiddenActivity, const size_t row_size, size_t timeStep, const std::wstring nodeName = L"");
ComputationNodePtr Parallel(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");
ComputationNodePtr RowSlice(const ComputationNodePtr a, const size_t start_index, const size_t num_rows, const std::wstring nodeName = L"");
ComputationNodePtr RowStack(const std::vector<ComputationNodePtr> pinputs, const std::wstring nodeName = L"");
@@ -649,7 +649,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// We do call validate(final) as many times as needed, since stuff may have changed underneath.
node->PrintSelfBeforeValidation();
node->Validate(isFinalValidationPass/*final*/); // all nodes have been visited: do verification instead of just inference
fprintf(stderr, " -> [%lu, %s%lu]", node->GetNumRows(), node->HasMBLayout() ? "MBSize " : "", node->GetNumCols());
fprintf(stderr, " -> [%lu [%s], %s%lu]", node->GetNumRows(), string(node->GetSampleLayout()).c_str(), node->HasMBLayout() ? "MBSize " : "", node->GetNumCols());
node->m_visited = true;
// also take the opportunity to propagate m_needsGradient
auto needsGradient = node->m_needsGradient;
@@ -155,7 +155,7 @@
<ClInclude Include="..\Common\Include\Basics.h" />
<ClInclude Include="..\Common\Include\BestGpu.h" />
<ClInclude Include="..\Common\Include\Config.h" />
<ClInclude Include="..\Common\Include\DataTensor.h" />
<ClInclude Include="..\Common\Include\TensorShape.h" />
<ClInclude Include="..\Common\Include\File.h" />
<ClInclude Include="..\Common\Include\fileutil.h" />
<ClInclude Include="..\Common\Include\Platform.h" />

@@ -117,7 +117,7 @@
<ClInclude Include="EsotericNodes.h">
<Filter>Nodes</Filter>
</ClInclude>
<ClInclude Include="..\Common\Include\DataTensor.h">
<ClInclude Include="..\Common\Include\TensorShape.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="..\Common\Include\Config.h">
@@ -9,7 +9,7 @@
#include "ComputationNode.h"
#include "InputAndParamNodes.h"
#include "ComputationNetworkBuilder.h" // TODO: We should only pull in NewComputationNodeFromConfig(). Nodes should not know about network at large.
#include "DataTensor.h"
#include "TensorShape.h"

#ifndef let
#define let const auto

@@ -72,6 +72,7 @@ namespace Microsoft {
size_t rows0 = Input(0)->GetNumRows(), cols0 = Input(0)->GetNumCols();
size_t rows1 = Input(1)->GetNumRows(), cols1 = Input(1)->GetNumCols();

#if 1//ndef ENABLE_TENSORVIEW
// TODO: This test will go away once we switch to full tensor lib.
if (isFinalValidationPass && !(
(rows0 == rows1 && (Input(0)->GetMBLayout() == Input(1)->GetMBLayout() || cols0 == cols1)) || // matching size (obvious case)

@@ -81,6 +82,9 @@ namespace Microsoft {
{
LogicError("The Matrix dimensions in the %ls %ls operation do not match.", NodeName().c_str(), OperationName().c_str());
}
#else
rows0; rows1;
#endif

// result has tensor shape with dimensions being the max over both
let shape0 = GetInputSampleLayout(0);

@@ -98,7 +102,7 @@ namespace Microsoft {
dims[k] = dim1; // then use dimension we broadcast to
else if (dim1 == 1) // if [1] is broadcasting
; // dims is already correct
else if (dim1 != dims[k]) // no broadcasting: they must match
else if (isFinalValidationPass && dim1 != dims[k]) // no broadcasting: they must match
InvalidArgument("%ls %ls operation: Input dimensions [%s] and [%s] are not compatible.",
NodeName().c_str(), OperationName().c_str(), string(shape0).c_str(), string(shape1).c_str());
}
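
The hunk above implements elementwise broadcasting over tensor dimensions. A standalone sketch of the rule (illustrative only, not CNTK code; plain dimension vectors stand in for TensorShape): each result dimension is the larger of the two input dimensions, a dimension of 1 broadcasts, and anything else must match exactly.

#include <algorithm>
#include <stdexcept>
#include <vector>

std::vector<size_t> BroadcastDims(std::vector<size_t> dims0, const std::vector<size_t> & dims1)
{
    dims0.resize(std::max(dims0.size(), dims1.size()), 1); // missing trailing dims act as broadcasting 1s
    for (size_t k = 0; k < dims1.size(); k++)
    {
        if (dims0[k] == 1)             dims0[k] = dims1[k];  // [1] broadcasts to the other input
        else if (dims1[k] == 1)        ;                     // dims0[k] is already correct
        else if (dims1[k] != dims0[k]) throw std::invalid_argument("input dimensions are not compatible");
    }
    return dims0; // e.g. {13, 1} and {1, 42} -> {13, 42}
}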

@@ -181,9 +185,6 @@ namespace Microsoft {
if (m_sampleLayout.GetDim(k) == 0 || m_sampleLayout.GetDim(k) == SIZE_MAX)
layoutPlausible = false;
}
// some code initializes it to (1,1,rowDim)
if (m_sampleLayout.GetRank() == 3 && m_sampleLayout.GetDim(0) == 1 && m_sampleLayout.GetDim(1) == 1)
layoutPlausible = false;
// check dimension
if (GetNumRows() != m_sampleLayout.GetNumElements())
layoutPlausible = false;

@@ -204,6 +205,8 @@ namespace Microsoft {
for (size_t i = 0; i < GetNumInputs(); i++)
{
size_t rank = Input(i)->GetAndValidateSampleLayout().GetRank();
if (!HasMBLayout()) // no MBLayout: last dim is column dimension
rank++;
if (maxRank < rank)
maxRank = rank;
}

@@ -215,8 +218,9 @@ namespace Microsoft {
TensorShape ComputationNodeBase::GetTensorShape(size_t rank, const FrameRange & fr) const
{
//GetAndValidateSampleLayout(); // no need to validate because rank comes from DetermineElementwiseTensorRank() which validates all
if (!HasMBLayout()) // no MBLayout: just return sample layout (if other participants have layout, tensor lib will broadcast)
return GetSampleLayout(); // .Pad(rank); // no need for padding
if (!HasMBLayout())
return GetSampleLayout().Append(GetSampleLayout().GetRank(), GetNumCols()); // last dim is column dimension
// TODO: This is not nice! Instead, if there is no MBLayout, the sample layout should explain the whole matrix.
else if (fr.IsAllFrames())
{
// we have an MBLayout, and fr refers to the entire MB

@@ -301,6 +305,7 @@ namespace Microsoft { namespace MSR { namespace ScriptableObjects {
static TensorShape TensorShapeFromConfig(const IConfigRecord & config)
{
const auto & valp = config[L"dims"];
// TODO: Add code that if input is already a tensor shape it is also OK.
if (valp.Is<ConfigArray>())
return TensorShape(valp.AsRef<ConfigArray>().AsVector<size_t>([&](const wstring & msg){ valp.Fail(msg); }));
else
@@ -10,7 +10,7 @@
#include "TensorView.h"
#include "ScriptableObjects.h"
#include "Sequences.h"
#include "DataTensor.h"
#include "TensorShape.h"
#include "MatrixPool.h"

#include <unordered_set>

@@ -26,7 +26,9 @@
#include <sstream>
#include <iostream>

// #define ENABLE_TENSORVIEW // flip this switch once the tensor lib is confirmed to be working
// remove the following two #defines once the tensor lib works
#define ENABLE_TENSORVIEW // if set then tensor lib is used instead of old Matrix implementations, wherever such an implementation exists
#define ENABLE_BROADCASTING_ELEMENTTIMES // if set then ScaleNode and Row/ColumnElementTimes are redirected to ElementTimes

#define DEFAULT_HIDDEN_ACTIVATION 0.1
@@ -307,6 +309,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// - PairNetworkNode
// - LSTMNode
// set our dimensions (rows, cols, sample layout)
// TODO: Separate SetDims() into versions with and without MBLayout.
void SetDims(const TensorShape & sampleLayout, size_t cols)
{
m_sampleLayout = sampleLayout;

@@ -501,9 +504,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}

const char * mbSizeMark = child->m_pMBLayout ? "MBSize " : "";
if (child->m_sampleLayout.GetRank() == 3 && (child->m_sampleLayout.GetWidth() != 1 || child->m_sampleLayout.GetNumChannels() != 1)) // looks like an image: use WHC notation
fprintf(stderr, "%ls[%lu {W=%lu, H=%lu, C=%lu}, %s%lu]", child->NodeName().c_str(), child->GetNumRows(),
child->m_sampleLayout.GetWidth(), child->m_sampleLayout.GetHeight(), child->m_sampleLayout.GetNumChannels(), mbSizeMark, child->GetNumCols());
if (child->m_sampleLayout.GetRank() == 3 && (child->m_sampleLayout[1] != 1 || child->m_sampleLayout[0] != 1)) // looks like an image: use WHC notation
fprintf(stderr, "%ls[%lu [%s] {W=%lu, H=%lu, C=%lu}, %s%lu]", child->NodeName().c_str(), child->GetNumRows(), string(child->m_sampleLayout).c_str(),
child->m_sampleLayout[1], child->m_sampleLayout[2], child->m_sampleLayout[0], mbSizeMark, child->GetNumCols());
//BUGBUG: This ^^ will print based on the old legacy layout, and we have no way of knowing here whether that is correct.
else if (child->m_sampleLayout.GetRank() > 1) // tensor: output the tensor dimensions --TODO: there will be no numRows in the future, only the tensor
fprintf(stderr, "%ls[%lu [%s], %s%lu]", child->NodeName().c_str(), child->GetNumRows(), string(child->m_sampleLayout).c_str(), mbSizeMark, child->GetNumCols());
else

@@ -538,14 +542,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return !g_shareNodeValueMatrices || m_outputNeededDuringBackprop;
}

// TODO: Remove this.
// used from:
// - Plus/Minus/ElementTimesNode --> replace by max dim over inputs. Make this standard behavior for all binary element-wise ops.
bool IsInputAnImage(const size_t index) const
{
return m_inputs[index]->m_sampleLayout.IsInputAnImage();
}

const size_t GetNumInputs() const { return m_inputs.size(); }

virtual void SetInput(const size_t childIndex, const ComputationNodeBasePtr& node) = 0;

@@ -825,7 +821,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
fstream >> Value();
// above reads dimensions, so we must update our own m_numRows/m_numCols
SetDims(TensorShape(Value().GetNumRows()), Value().GetNumCols());
// BUGBUG: This loses the sample layout (tensor shape). It should be serialized as well.
// BUGBUG: This loses the sample layout (tensor shape). The caller must know this and fix it up if needed (currently needed for LearnableParameterNode).
}

// reader updated m_functionValue--update our internal state, i.e. m_numCols

@@ -1403,7 +1399,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template<class C, class... _Types> inline shared_ptr<C> New(_Types&&... _Args)
{
return make_shared<C>(forward<_Types>(_Args)...);
//return ComputationNode<typename C::OurElemType>::template New<C>(forward<_Types>(_Args)...);
}

// =======================================================================

@@ -1526,7 +1521,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
#define UsingComputationNodeMembers /*without OperationName; needed to support inconsistent pattern of InputValue--TODO: This comment is out of date. */ \
protected: \
typedef shared_ptr<ComputationNode<ElemType>> ComputationNodePtr; \
using Base::m_deviceId; using Base::SetDims; using Base::SetDims1; using Base::SetNumCols; using Base::GetNumRows; using Base::GetNumCols; using Base::UpdateFunctionValuesSize; using Base::LoadValue; \
using Base::m_deviceId; using Base::GetDeviceId; using Base::SetDims; using Base::SetDims1; using Base::SetNumCols; using Base::GetNumRows; using Base::GetNumCols; using Base::UpdateFunctionValuesSize; using Base::LoadValue; \
using Base::m_pMBLayout; using Base::GetNumTimeSteps; using Base::GetNumParallelSequences; \
using Base::MaskMissingColumnsToZero; using Base::MaskMissingValueColumnsToZero; using Base::MaskMissingGradientColumnsToZero; using Base::InvalidateMissingValueColumns; using Base::InvalidateMissingGradientColumns; \
using Base::DataFor; using Base::ValueFor; using Base::Gradient; using Base::GradientFor; \

@@ -1540,12 +1535,12 @@ protected: \
using Base::GetNumInputs; using Base::ZeroGradientsOfInputs; using Base::VerifyDims; \
using Base::ConstOnes; \
using Base::DetermineElementwiseTensorRank; \
using Base::GetInputSampleLayout; using Base::InferMBLayoutFromInputsForStandardCase; \
using Base::GetSampleLayout; using Base::GetInputSampleLayout; using Base::InferMBLayoutFromInputsForStandardCase; \
using Base::CopyTo; using Base::CreateUniqNodeName; using Base::DetachInputs; using Base::GetInputsFromConfig; \
using Base::DumpNodeInfo; using Base::EnumerateNodes; \
using Base::HasMBLayout; using Base::GetMBLayout; using Base::LinkToMBLayout; \
using Base::Input; using Base::SetInput; \
using Base::IsInputAnImage; using Base::IsEqualTo; using Base::IsOutputOlderThanInputs; using Base::IsLeaf; using Base::SetParameterUpdateRequired; \
using Base::IsEqualTo; using Base::IsOutputOlderThanInputs; using Base::IsLeaf; using Base::SetParameterUpdateRequired; \
using Base::Load; \
using Base::PrintNodeValuesToFile; using Base::PrintSelfBeforeValidation; \
using Base::Save; using Base::UpdateFunctionMBSize; \

@@ -1570,6 +1565,31 @@ protected: /* some boilerplate goes here */ \
// a few standard base classes for N-nary operations
// =======================================================================

// -----------------------------------------------------------------------
// UnaryElementWiseNode (operand)
//
// unary elementwise operations that are implemented with the tensor lib
//
// Derived classes only need to override ForwardProp() and BackpropTo().
// -----------------------------------------------------------------------

template<class ElemType>
class UnaryElementWiseNode : public ComputationNode<ElemType>, public NumInputs<1>
{
typedef ComputationNode<ElemType> Base; UsingComputationNodeMembers;
public:
UnaryElementWiseNode(DEVICEID_TYPE deviceId, const wstring & name) :
Base(deviceId, name)
{ }

virtual void /*ComputationNodeBase::*/Validate(bool isFinalValidationPass) override
{
ValidateUnaryMap(isFinalValidationPass);
}
};
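
To make the intended use concrete, here is a hypothetical derived node (a sketch only; the member boilerplate and the exact ForwardProp()/BackpropTo() signatures are abbreviated and not taken from this commit):

template<class ElemType>
class AbsNode : public UnaryElementWiseNode<ElemType>
{
    typedef UnaryElementWiseNode<ElemType> Base; UsingUnaryElementwiseNodeBaseMembers;
    static const std::wstring TypeName() { return L"Abs"; }
public:
    AbsNode(DEVICEID_TYPE deviceId, const wstring & name) : Base(deviceId, name) { }
    virtual void ForwardProp(const FrameRange & fr) override
    { /* compute |x| elementwise via the tensor lib */ }
    virtual void BackpropTo(const size_t inputIndex, const FrameRange & fr) override
    { /* propagate gradient * sign(x) to the single input */ }
};

Validation is inherited: ValidateUnaryMap() gives the node its input's dimensions and MBLayout, so the derived class carries no shape logic of its own.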

#define UsingUnaryElementwiseNodeBaseMembers UsingComputationNodeMembersBoilerplate;

// -----------------------------------------------------------------------
// BinaryElementWiseNode (operand1, operand2)
//

@@ -1598,13 +1618,9 @@ protected: /* some boilerplate goes here */ \
#endif
}

virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override
{
// By default, the BinaryElementWiseNode does not require any of its inputs' values for computing
// the gradients of its input nodes
UNREFERENCED_PARAMETER(childIndex);
return false;
}
// By default, the BinaryElementWiseNode does not require any of its inputs' values for computing
// the gradients of its input nodes
virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; }

virtual void /*IComputationNode::*/BeginForwardProp() override // called before first iteration step of ForwardProp()
{
@@ -30,9 +30,32 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// ConvolutionNode (convolutionWeights, inputFeature)
// -----------------------------------------------------------------------

// convolutional network
// This follows "high performance convolutional neural networks for document processing" by Kumar Chellapilla, Sidd Puri, and Patrice Simard.
// Each sample is stored as a column-major matrix (height, width) of float[numChannels] (r00, g00, b00, r10, g10, b10, r01, g01, b01, r11, g11, b11).
// Convolutions (incl. pooling) support two different storage formats:
// BUGBUG: These are currently hard-selected depending on circumstances, without being reflected in TensorShape.
//
// * legacy mode (CPU and GPU without cudnn): Channels are tuples of scalars
//
// This follows "high performance convolutional neural networks for document processing" by Kumar Chellapilla, Sidd Puri, and Patrice Simard.
// Each sample is stored as a column-major matrix (height, width) of float[numChannels] (r00, g00, b00, r10, g10, b10, r01, g01, b01, r11, g11, b11).
//
// - input : [C x W x H x T] or ARRAY[1..T] OF ARRAY[1..H] OF ARRAY[1..W] OF ARRAY[1..C]
// - output : [C' x W' x H' x T] or ARRAY[1..T] OF ARRAY[1..H'] OF ARRAY[1..W'] OF ARRAY[1..C']
// - filter : [C' x W" x H" x C ] or ARRAY[1..C] OF ARRAY[1..H"] OF ARRAY[1..W"] OF ARRAY[1..C']
//
// * GPU with cudnn: Channels are planes
//
// - input : [W x H x C x T] or ARRAY[1..T] OF ARRAY[1..C] OF ARRAY[1..H] OF ARRAY[1..W]
// - output : [W' x H' x C' x T] or ARRAY[1..T] OF ARRAY[1..C'] OF ARRAY[1..H'] OF ARRAY[1..W']
// - filter : [W" x H" x C x C' ] or ARRAY[1..C'] OF ARRAY[1..C] OF ARRAY[1..H] OF ARRAY[1..W]
//
// where:
// - using ' for output and " for filter
// - T = samples (NVidia calls this N)
// - W, H = width, height (W', H' for output, W", H" for kernel)
// - C = input channels
// - 3 for color images, 1 for B&W images
// - for hidden layer: dimension of activation vector for each pixel
// - C' = output channels = dimension of activation vector for each pixel (also called N by NVidia, inconsistently)
template<class ElemType>
class ConvolutionNode : public ComputationNode<ElemType>, public NumInputs<2>
{
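
As an aside, the two layouts documented above differ only in which dimension varies fastest. A self-contained sketch (illustrative, not code from this commit) of how (w, h, c) maps to a flat index within one sample under each format:

// legacy HWC: channels are tuples, so c varies fastest, then w, then h
size_t IndexHWC(size_t w, size_t h, size_t c, size_t W, size_t H, size_t C)
{
    return c + C * (w + W * h); // matches [C x W x H]; h strides by C * W
}
// cudnn CHW: channels are planes, so w varies fastest, then h, then c
size_t IndexCHW(size_t w, size_t h, size_t c, size_t W, size_t H, size_t C)
{
    return w + W * (h + H * c); // matches [W x H x C]; c strides by W * H
}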

@@ -44,22 +67,26 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_kernelWidth(SIZE_MAX), m_kernelHeight(SIZE_MAX),
// initialize to dummy values so we catch missing initialization
m_horizontalSubsample(SIZE_MAX), m_verticalSubsample(SIZE_MAX),
m_zeroPadding(false), m_maxTempMemSizeInSamples(SIZE_MAX)
m_zeroPadding(false), m_maxTempMemSizeInSamples(SIZE_MAX),
m_imageLayoutKind(ImageLayoutKind::HWC)
{
SetDims(ImageLayoutWHC(1, 1, 0), 0); // TODO: what is this magic #channels == 0? Can this even be initialized at this time, or only inferred?
SetDims(ImageDimensions::AsTensorShape(1, 1, 0, m_imageLayoutKind), 0);
}
ConvolutionNode(DEVICEID_TYPE deviceId, const wstring & name, const size_t kernelWidth, const size_t kernelHeight, const size_t outputChannels, const size_t horizontalSubsample, const size_t verticalSubsample, const bool zeroPadding = false, const size_t maxTempMemSizeInSamples = 0) :
ConvolutionNode(DEVICEID_TYPE deviceId, const wstring & name, const size_t kernelWidth, const size_t kernelHeight, const size_t outputChannels, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind,
const bool zeroPadding = false, const size_t maxTempMemSizeInSamples = 0) :
Base(deviceId, name),
m_outputChannels(outputChannels),
m_kernelWidth(kernelWidth), m_kernelHeight(kernelHeight),
m_horizontalSubsample(horizontalSubsample), m_verticalSubsample(verticalSubsample),
m_zeroPadding(zeroPadding), m_maxTempMemSizeInSamples(maxTempMemSizeInSamples)
m_zeroPadding(zeroPadding), m_maxTempMemSizeInSamples(maxTempMemSizeInSamples),
m_imageLayoutKind(imageLayoutKind)
{
SetDims(ImageLayoutWHC(1, 1, outputChannels), 0);
m_factory = ConvolutionEngineFactory<ElemType>::Create(deviceId);
SetDims(ImageDimensions::AsTensorShape(1, 1, m_outputChannels, m_imageLayoutKind), 0); // TODO: necessary?
m_factory = ConvolutionEngineFactory<ElemType>::Create(GetDeviceId(), ConvolutionEngineFactory<ElemType>::EngineType::Auto, m_imageLayoutKind);
}
ConvolutionNode(const ScriptableObjects::IConfigRecordPtr configp) :
ConvolutionNode(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"kernelWidth"), configp->Get(L"kernelHeight"), configp->Get(L"outputChannels"),
configp->Get(L"horizontalSubsample"), configp->Get(L"verticalSubsample"),
configp->Get(L"horizontalSubsample"), configp->Get(L"verticalSubsample"), ImageLayoutKindFrom(configp->Get(L"imageLayout")),
configp->Get(L"zeroPadding"), configp->Get(L"maxTempMemSizeInSamples"))
{
// weightNodeName, inputValueNodeName, kernelWidth, kernelHeight, outputChannels, horizontalSubsample, verticalSubsample, zeroPadding = false, maxTempMemSizeInSamples = 0

@@ -70,18 +97,23 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
Base::Save(fstream);
fstream << m_kernelWidth << m_kernelHeight << m_horizontalSubsample << m_verticalSubsample;
fstream << m_sampleLayout.GetNumChannels();
uint32_t imageLayoutKind = (uint32_t)m_imageLayoutKind;
uint32_t outputChannels = (uint32_t)m_outputChannels;
fstream << outputChannels << imageLayoutKind;
fstream << m_zeroPadding << m_maxTempMemSizeInSamples;
}

void Load(File& fstream, size_t modelVersion) override
{
Base::Load(fstream, modelVersion);
fstream >> m_kernelWidth >> m_kernelHeight >> m_horizontalSubsample >> m_verticalSubsample;
size_t outputChannels;
fstream >> outputChannels;
SetDims(ImageLayoutWHC(1, 1, outputChannels), 0);
fstream >> m_kernelWidth >> m_kernelHeight >> m_horizontalSubsample >> m_verticalSubsample;
uint32_t imageLayoutKind, outputChannels;
fstream >> outputChannels >> imageLayoutKind;
m_imageLayoutKind = (ImageLayoutKind) imageLayoutKind;
m_outputChannels = outputChannels;
SetDims(ImageDimensions::AsTensorShape(1, 1, m_outputChannels, m_imageLayoutKind), 0); // TODO: needed?
fstream >> m_zeroPadding >> m_maxTempMemSizeInSamples;
m_factory = ConvolutionEngineFactory<ElemType>::Create(GetDeviceId(), ConvolutionEngineFactory<ElemType>::EngineType::Auto, m_imageLayoutKind);
}
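
A brief note on the serialization change above: outputChannels and imageLayoutKind are deliberately narrowed to explicit uint32_t values before being streamed, which pins the on-disk width of these fields regardless of the platform's size_t; PoolingNodeBase below applies the same pattern to windowWidth and imageLayoutKind.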

void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override

@@ -100,6 +132,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {

node->m_maxTempMemSizeInSamples = m_maxTempMemSizeInSamples;

node->m_imageLayoutKind = m_imageLayoutKind;

*node->m_tempMatrix = *m_tempMatrix;
}
}

@@ -139,7 +173,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Matrix<ElemType> sliceInput1Value = Input(1)->ValueFor(fr);
Matrix<ElemType> sliceOutputValue = ValueFor(fr);

// REVIEW alexeyk: setting batch size, can it be done elsewhere in a single place? TODO: Yes, in BeginForwardProp().
// update the tensor dimension w.r.t. number of samples
size_t batchSize = sliceInput1Value.GetNumCols();
m_inT->setN(batchSize);
m_outT->setN(batchSize);

@@ -154,7 +188,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
#endif
}

// BUGBUG: Should not be here. Use PlusNode and m_sampleLayout.
// BUGBUG: Should not be here. Use PlusNode and m_sampleLayout. TODO: Bad naming: 'output' is actually an 'input'
void AddBias(const Matrix<ElemType>& output, const Matrix<ElemType>& bias, Matrix<ElemType>& dst)
{
assert(m_convEng != nullptr);

@@ -173,83 +207,80 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Base::Validate(isFinalValidationPass);
InferMBLayoutFromInputsForStandardCase();

// get input tensor shape
auto inputSampleLayout = GetInputSampleLayout(1);
// get input and output tensor shape and interpret as image dimensions
auto inDims = ImageDimensions(GetInputSampleLayout(1), m_imageLayoutKind);

if (inputSampleLayout.GetWidth() < m_kernelWidth || inputSampleLayout.GetHeight() < m_kernelHeight)
InvalidArgument("inputWidth must >= kernelWidth and inputHeight must >= kernelHeight.");
if (isFinalValidationPass && (inDims.m_width < m_kernelWidth || inDims.m_height < m_kernelHeight))
InvalidArgument("%ls %ls operation requires that input width be >= kernelWidth and input height >= kernelHeight.", NodeName().c_str(), OperationName().c_str());

// determine output tensor shape
// WATCH OUT: Number of channels is tucked away in m_sampleLayout and must be propagated.
TensorShape outputSampleLayout;
if (m_zeroPadding)
{
const int kernelWidthCenter = m_kernelWidth % 2;
const int kernelHeightCenter = m_kernelHeight % 2;
outputSampleLayout = ImageLayoutWHC(
(inputSampleLayout.GetWidth() - kernelWidthCenter) / m_horizontalSubsample + 1,
(inputSampleLayout.GetHeight() - kernelHeightCenter) / m_verticalSubsample + 1,
m_sampleLayout.GetNumChannels());
}
else
{
outputSampleLayout = ImageLayoutWHC(
(inputSampleLayout.GetWidth() - m_kernelWidth) / m_horizontalSubsample + 1,
(inputSampleLayout.GetHeight() - m_kernelHeight) / m_verticalSubsample + 1,
m_sampleLayout.GetNumChannels());
}
const int kernelWidthCenter = m_zeroPadding ? m_kernelWidth % 2 : m_kernelWidth;
const int kernelHeightCenter = m_zeroPadding ? m_kernelHeight % 2 : m_kernelHeight;
auto outDims = ImageDimensions(
(inDims.m_width - kernelWidthCenter) / m_horizontalSubsample + 1,
(inDims.m_height - kernelHeightCenter) / m_verticalSubsample + 1,
m_outputChannels);

size_t weightCols = m_kernelWidth * m_kernelHeight * inputSampleLayout.GetNumChannels();
size_t weightCols = m_kernelWidth * m_kernelHeight * inDims.m_numChannels;

// check/infer input [0] (weights)
if (Input(0)->Value().HasNoElements())
ValidateInferInputDims(0, outputSampleLayout.GetNumChannels(), weightCols);
ValidateInferInputDims(0, m_outputChannels, weightCols);

if (isFinalValidationPass && (Input(0)->GetNumCols() != weightCols || Input(0)->GetNumRows() != outputSampleLayout.GetNumChannels()))
LogicError("convolutionWeight matrix %ls should have dimension [%d, %d] which is [outputChannels, kernelWidth * kernelHeight * inputChannels]", Input(0)->NodeName().c_str(), (int)outputSampleLayout.GetNumChannels(), (int)weightCols);
if (isFinalValidationPass && (Input(0)->GetNumCols() != weightCols || Input(0)->GetNumRows() != m_outputChannels))
LogicError("convolutionWeight matrix %ls should have dimension [%d, %d] which is [outputChannels, kernelWidth * kernelHeight * inputChannels]", Input(0)->NodeName().c_str(), (int)m_outputChannels, (int)weightCols);

size_t inputDim = inputSampleLayout.GetWidth() * inputSampleLayout.GetHeight() * inputSampleLayout.GetNumChannels();
// check/infer input [1] (data)
size_t inputDim = inDims.m_width * inDims.m_height * inDims.m_numChannels;
if (Input(1)->GetNumRows() == 0)
ValidateInferInputDims(1, inputDim, Input(1)->GetNumCols());

if (isFinalValidationPass && Input(1)->GetNumRows() != inputDim)
LogicError("Each column of input to the convolution node %ls is a sample and should have dimension %d, which is inputWidth * inputHeight * inputChannels.", NodeName().c_str(), (int)inputDim);
LogicError("Each column of input to the convolution node %ls is a sample and should have dimension %d, which is inputWidth * inputHeight * inputChannels.", NodeName().c_str(), (int)inputDim);

// that's our dimension
SetDims(outputSampleLayout, Input(1)->GetNumCols());
SetDims(outDims.AsTensorShape(m_imageLayoutKind), Input(1)->GetNumCols());

// set up the various engines and descriptor objects
// REVIEW alexeyk: is there a better place to create engines?
if (m_factory == nullptr)
m_factory = ConvolutionEngineFactory<ElemType>::Create(m_deviceId);
if (m_convEng == nullptr)
m_convEng = m_factory->CreateConvEngine(m_deviceId, m_maxTempMemSizeInSamples);
if (m_inT == nullptr)
m_inT = m_factory->CreateTensor(inputSampleLayout.GetWidth(), inputSampleLayout.GetHeight(), inputSampleLayout.GetNumChannels(), 1);
if (m_filterT == nullptr)
m_filterT = m_factory->CreateFilter(m_kernelWidth, m_kernelHeight, inputSampleLayout.GetNumChannels(), m_sampleLayout.GetNumChannels());
if (m_outT == nullptr)
m_outT = m_factory->CreateTensor(m_sampleLayout.GetWidth(), m_sampleLayout.GetHeight(), m_sampleLayout.GetNumChannels(), 1);
if (m_convDesc == nullptr)
m_convDesc = m_factory->CreateConvDescriptor(*m_inT, *m_filterT, m_horizontalSubsample, m_verticalSubsample, m_zeroPadding);
// REVIEW alexeyk: create per-channel (shared) bias. Consider adding other types of biases.
if (m_biasT == nullptr)
m_biasT = m_factory->CreateTensor(1, 1, m_sampleLayout.GetNumChannels(), 1);
if (isFinalValidationPass)
{
// set up the various engines and descriptor objects
// REVIEW alexeyk: is there a better place to create engines?
assert(m_factory);
//if (m_factory == nullptr)
//    m_factory = ConvolutionEngineFactory<ElemType>::Create(m_deviceId, ConvolutionEngineFactory<ElemType>::EngineType::Auto, m_imageLayoutKind);
// TODO: This seems to expose too much internal knowledge of the engine to the ConvolutionNode().
// Why not just pass everything to the engine creator, and get one object that holds everything.
if (m_convEng == nullptr)
m_convEng = m_factory->CreateConvEngine(m_deviceId, m_maxTempMemSizeInSamples);
if (m_inT == nullptr)
m_inT = m_factory->CreateTensor(inDims.m_width, inDims.m_height, inDims.m_numChannels, 1);
if (m_filterT == nullptr)
m_filterT = m_factory->CreateFilter(m_kernelWidth, m_kernelHeight, inDims.m_numChannels, m_outputChannels);
if (m_outT == nullptr)
m_outT = m_factory->CreateTensor(outDims.m_width, outDims.m_height, outDims.m_numChannels, 1);
if (m_convDesc == nullptr)
m_convDesc = m_factory->CreateConvDescriptor(*m_inT, *m_filterT, m_horizontalSubsample, m_verticalSubsample, m_zeroPadding);
// REVIEW alexeyk: create per-channel bias (shared across all pixels). Consider adding other types of biases.
if (m_biasT == nullptr)
m_biasT = m_factory->CreateTensor(1, 1, outDims.m_numChannels, 1);
}
}
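
A quick worked check of the output-size arithmetic above: for a 28 x 28 input with a 5 x 5 kernel and subsampling (stride) of 2, kernelWidthCenter is 5 without zero-padding, so the output width is (28 - 5) / 2 + 1 = 12; with zero-padding it becomes 5 % 2 = 1, giving (28 - 1) / 2 + 1 = 14.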

void DumpNodeInfo(const bool printValues, File& fstream) const override
{
Base::DumpNodeInfo(printValues, fstream);

auto inputSampleLayout = GetInputSampleLayout(1);
auto inDims = ImageDimensions(GetInputSampleLayout(1), m_imageLayoutKind);
auto outDims = ImageDimensions(m_sampleLayout, m_imageLayoutKind);

char str[4096];
sprintf(str, "Input[Width:%lu, Height:%lu, Channels:%lu] \n", inputSampleLayout.GetWidth(), inputSampleLayout.GetHeight(), inputSampleLayout.GetNumChannels());
sprintf(str, "Input[Width:%lu, Height:%lu, Channels:%lu] \n", inDims.m_width, inDims.m_height, inDims.m_numChannels);
fstream << string(str);
sprintf(str, "Kernel[Width:%lu, Height:%lu] SubSample[Horizontal:%lu, Vertical:%lu]\n", m_kernelWidth, m_kernelHeight, m_horizontalSubsample, m_verticalSubsample);
fstream << string(str);
sprintf(str, "Output[Width:%lu, Height:%lu, Channels:%lu] \n", m_sampleLayout.GetWidth(), m_sampleLayout.GetHeight(), m_sampleLayout.GetNumChannels());
sprintf(str, "Output[Width:%lu, Height:%lu, Channels:%lu] \n", outDims.m_width, outDims.m_height, outDims.m_numChannels);
fstream << string(str);
sprintf(str, "ZeroPadding=%ls maxTempMemSizeInSamples=%lu\n", m_zeroPadding? L"true" : L"false", m_maxTempMemSizeInSamples);
sprintf(str, "zeroPadding=%ls maxTempMemSizeInSamples=%lu\n", m_zeroPadding? L"true" : L"false", m_maxTempMemSizeInSamples);
fstream << string(str);
}

@@ -273,6 +304,17 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}

private:
size_t m_outputChannels;
size_t m_kernelWidth, m_kernelHeight;
size_t m_horizontalSubsample, m_verticalSubsample;
bool m_zeroPadding;
bool m_1DConvolutionOnGPUSparse;

shared_ptr<Matrix<ElemType>> m_tempMatrix;
size_t m_maxTempMemSizeInSamples; // can change during runtime

ImageLayoutKind m_imageLayoutKind; // how to interpret the tensor (which dimensions are X/Y and C)

std::unique_ptr<ConvolutionEngineFactory<ElemType>> m_factory;
std::unique_ptr<ConvolutionEngine<ElemType>> m_convEng;

@@ -281,14 +323,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
std::unique_ptr<ConvolutionTensor4D> m_outT;
std::unique_ptr<ConvolutionDescriptor> m_convDesc;
std::unique_ptr<ConvolutionTensor4D> m_biasT;

size_t m_kernelWidth, m_kernelHeight;
size_t m_horizontalSubsample, m_verticalSubsample;
bool m_zeroPadding;
bool m_1DConvolutionOnGPUSparse;

shared_ptr<Matrix<ElemType>> m_tempMatrix;
size_t m_maxTempMemSizeInSamples; // can change during runtime
};

template class ConvolutionNode<float>;

@@ -298,8 +332,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// PoolingNodeBase (input)
// -----------------------------------------------------------------------

// Max/Average Pooling: support multi channel
// Each sample is stored as a column-major matrix (height, width) of float[numChannels] (r00, g00, b00, r10, g10, b10, r01, g01, b01, r11, g11, b11).
template<class ElemType>
class PoolingNodeBase : public ComputationNode<ElemType>, public NumInputs<1>
{
@@ -308,17 +340,19 @@ namespace Microsoft { namespace MSR { namespace CNTK {
PoolingNodeBase(DEVICEID_TYPE deviceId, const wstring & name) :
Base(deviceId, name),
m_windowWidth(SIZE_MAX), m_windowHeight(SIZE_MAX),
m_horizontalSubsample(SIZE_MAX), m_verticalSubsample(SIZE_MAX)
m_horizontalSubsample(SIZE_MAX), m_verticalSubsample(SIZE_MAX),
m_imageLayoutKind(ImageLayoutKind::HWC)
{ }
PoolingNodeBase(DEVICEID_TYPE deviceId, const wstring & name, const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample) :
PoolingNodeBase(DEVICEID_TYPE deviceId, const wstring & name, const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind) :
Base(deviceId, name),
m_windowWidth(windowWidth), m_windowHeight(windowHeight),
m_horizontalSubsample(horizontalSubsample), m_verticalSubsample(verticalSubsample)
m_horizontalSubsample(horizontalSubsample), m_verticalSubsample(verticalSubsample),
m_imageLayoutKind(imageLayoutKind)
{
m_factory = ConvolutionEngineFactory<ElemType>::Create(deviceId);
m_factory = ConvolutionEngineFactory<ElemType>::Create(GetDeviceId(), ConvolutionEngineFactory<ElemType>::EngineType::Auto, m_imageLayoutKind);
}
PoolingNodeBase(const ScriptableObjects::IConfigRecordPtr configp) :
PoolingNodeBase(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"windowWidth"), configp->Get(L"windowHeight"), configp->Get(L"horizontalSubsample"), configp->Get(L"verticalSubsample"))
PoolingNodeBase(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"windowWidth"), configp->Get(L"windowHeight"), configp->Get(L"horizontalSubsample"), configp->Get(L"verticalSubsample"), ImageLayoutKindFrom(configp->Get(L"imageLayout")))
{
// input, windowWidth, windowHeight, horizontalSubsample, verticalSubsample
AttachInputs(configp, this->GetExpectedNumInputs());

@@ -327,13 +361,19 @@ namespace Microsoft { namespace MSR { namespace CNTK {
void Save(File& fstream) const override
{
Base::Save(fstream);
fstream << m_windowWidth << m_windowHeight << m_horizontalSubsample << m_verticalSubsample;
uint32_t imageLayoutKind = (uint32_t)m_imageLayoutKind;
uint32_t windowWidth = (uint32_t)m_windowWidth;
fstream << windowWidth << imageLayoutKind << m_windowHeight << m_horizontalSubsample << m_verticalSubsample;
}

void Load(File& fstream, size_t modelVersion) override
{
Base::Load(fstream, modelVersion);
fstream >> m_windowWidth >> m_windowHeight >> m_horizontalSubsample >> m_verticalSubsample;
uint32_t imageLayoutKind, windowWidth;
fstream >> windowWidth >> imageLayoutKind >> m_windowHeight >> m_horizontalSubsample >> m_verticalSubsample;
m_windowWidth = windowWidth;
m_imageLayoutKind = (ImageLayoutKind)imageLayoutKind;
m_factory = ConvolutionEngineFactory<ElemType>::Create(GetDeviceId(), ConvolutionEngineFactory<ElemType>::EngineType::Auto, m_imageLayoutKind);
}
|
||||
|
||||
void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override
|
||||
|
@ -351,6 +391,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
|
||||
node->m_inputSizePerSample = m_inputSizePerSample;
|
||||
node->m_outputSizePerSample = m_outputSizePerSample;
|
||||
|
||||
node->m_imageLayoutKind = m_imageLayoutKind;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -388,20 +430,19 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
Base::Validate(isFinalValidationPass);
|
||||
InferMBLayoutFromInputsForStandardCase();
|
||||
|
||||
// get input tensor shape
|
||||
auto inputSampleLayout = GetInputSampleLayout(0);
|
||||
// get input tensor shape and interpret as image dimensions
|
||||
auto inDims = ImageDimensions(GetInputSampleLayout(0), m_imageLayoutKind);
|
||||
|
||||
if (inputSampleLayout.GetWidth() < m_windowWidth || inputSampleLayout.GetHeight() < m_windowHeight)
|
||||
if (isFinalValidationPass && (inDims.m_width < m_windowWidth || inDims.m_height < m_windowHeight))
|
||||
InvalidArgument("PoolingNodeBase: inputWidth must >= windowWidth and inputHeight must >= windowHeight.");
|
||||
|
||||
// determine output tensor shape
|
||||
auto outputSampleLayout = ImageLayoutWHC(
|
||||
(inputSampleLayout.GetWidth() - m_windowWidth) / m_horizontalSubsample + 1,
|
||||
(inputSampleLayout.GetHeight() - m_windowHeight) / m_verticalSubsample + 1,
|
||||
inputSampleLayout.GetNumChannels());
|
||||
auto outDims = ImageDimensions(
|
||||
(inDims.m_width - m_windowWidth) / m_horizontalSubsample + 1,
|
||||
(inDims.m_height - m_windowHeight) / m_verticalSubsample + 1,
|
||||
inDims.m_numChannels);
|
||||
|
||||
m_inputSizePerSample = inputSampleLayout.GetWidth() * inputSampleLayout.GetHeight() * inputSampleLayout.GetNumChannels();
|
||||
//m_outputSizePerSample = outputSampleLayout.GetWidth() * outputSampleLayout.GetHeight() * outputSampleLayout.GetNumChannels();
|
||||
m_inputSizePerSample = inDims.m_width * inDims.m_height * inDims.m_numChannels;
|
||||
|
||||
if (Input(0)->GetNumRows() == 0)
|
||||
ValidateInferInputDims(0, m_inputSizePerSample, Input(0)->GetNumCols()); // TODO: We should infer a tensor dimension for the input instead.
|
||||
|
@ -409,18 +450,22 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
if (isFinalValidationPass && Input(0)->GetNumRows() != m_inputSizePerSample) // TODO: Can be removed once tensor shape and numRows are perfectly in sync.
|
||||
LogicError("each column of input to the MaxPooling node %ls is a sample and should have dimension %d, which is inputWidth * inputHeight * inputChannels", NodeName().c_str(), (int)m_inputSizePerSample);
|
||||
|
||||
SetDims(outputSampleLayout, Input(0)->GetNumCols());
|
||||
SetDims(outDims.AsTensorShape(m_imageLayoutKind), Input(0)->GetNumCols());
|
||||
|
||||
// set up various engines and descriptor objects
|
||||
// REVIEW alexeyk: is there a better place to create engines?
|
||||
if (m_factory == nullptr)
|
||||
m_factory = ConvolutionEngineFactory<ElemType>::Create(m_deviceId);
|
||||
if (m_poolEng == nullptr)
|
||||
m_poolEng = m_factory->CreatePoolEngine(m_deviceId);
|
||||
if (m_inT == nullptr)
|
||||
m_inT = m_factory->CreateTensor(inputSampleLayout.GetWidth(), inputSampleLayout.GetHeight(), inputSampleLayout.GetNumChannels(), 1);
|
||||
if (m_outT == nullptr)
|
||||
m_outT = m_factory->CreateTensor(m_sampleLayout.GetWidth(), m_sampleLayout.GetHeight(), m_sampleLayout.GetNumChannels(), 1);
|
||||
if (isFinalValidationPass)
|
||||
{
|
||||
// set up various engines and descriptor objects
|
||||
// REVIEW alexeyk: is there a better place to create engines?
|
||||
assert(m_factory);
|
||||
//if (m_factory == nullptr)
|
||||
// m_factory = ConvolutionEngineFactory<ElemType>::Create(m_deviceId, ConvolutionEngineFactory<ElemType>::EngineType::Auto, m_imageLayoutKind);
|
||||
if (m_poolEng == nullptr)
|
||||
m_poolEng = m_factory->CreatePoolEngine(m_deviceId);
|
||||
if (m_inT == nullptr)
|
||||
m_inT = m_factory->CreateTensor(inDims.m_width, inDims.m_height, inDims.m_numChannels, 1);
|
||||
if (m_outT == nullptr)
|
||||
m_outT = m_factory->CreateTensor(outDims.m_width, outDims.m_height, outDims.m_numChannels, 1);
|
||||
}
|
||||
}
|
||||
|
||||
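For reference, the output-shape arithmetic above is plain valid (unpadded) pooling: each spatial output extent is (input - window) / stride + 1, and channels pass through untouched. A minimal hedged sketch of just that computation, using hypothetical names rather than the ImageDimensions type in this change:

#include <cstddef>

struct Dims { std::size_t width, height, channels; };

// Valid (unpadded) pooling: the window must fit entirely inside the input,
// so each spatial extent shrinks by (window - 1) before striding.
Dims PoolOutputDims(Dims in, std::size_t winW, std::size_t winH, std::size_t strideW, std::size_t strideH)
{
    return Dims{ (in.width - winW) / strideW + 1,
                 (in.height - winH) / strideH + 1,
                 in.channels }; // pooling never mixes channels
}
// e.g. a 28x28x1 input with a 2x2 window and stride 2 yields 14x14x1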
void DumpNodeInfo(const bool printValues, File& fstream) const override

@@ -430,27 +475,29 @@ namespace Microsoft { namespace MSR { namespace CNTK {
auto inputSampleLayout = GetInputSampleLayout(0);

char str[4096];
sprintf(str, "Input[Width:%lu, Height:%lu, Channels:%lu] \n", inputSampleLayout.GetWidth(), inputSampleLayout.GetHeight(), inputSampleLayout.GetNumChannels());
sprintf(str, "Input[Width:%lu, Height:%lu, Channels:%lu] \n", inputSampleLayout[1], inputSampleLayout[2], inputSampleLayout[0]);
fstream << string(str);
sprintf(str, "PoolingWindow[Width:%lu, Height:%lu] SubSampling[Horizontal:%lu, Vertical:%lu]\n", m_windowWidth, m_windowHeight, m_horizontalSubsample, m_verticalSubsample);
fstream << string(str);
sprintf(str, "Output[Width:%lu, Height:%lu, Channels:%lu] \n", m_sampleLayout.GetWidth(), m_sampleLayout.GetHeight(), m_sampleLayout.GetNumChannels());
sprintf(str, "Output[Width:%lu, Height:%lu, Channels:%lu] \n", m_sampleLayout[1], m_sampleLayout[2], m_sampleLayout[0]);
fstream << string(str);
sprintf(str, "TotalSizePerSample[Input:%lu, Output:%lu] \n", m_inputSizePerSample, m_outputSizePerSample);
fstream << string(str);
}

protected:
size_t m_windowWidth, m_windowHeight;
size_t m_horizontalSubsample, m_verticalSubsample;
size_t m_inputSizePerSample, m_outputSizePerSample;

ImageLayoutKind m_imageLayoutKind; // how to interpret the tensor (which dimensions are X/Y and C)

std::unique_ptr<ConvolutionEngineFactory<ElemType>> m_factory;
std::unique_ptr<PoolingEngine<ElemType>> m_poolEng;

std::unique_ptr<ConvolutionTensor4D> m_inT;
std::unique_ptr<ConvolutionTensor4D> m_outT;
std::unique_ptr<PoolingDescriptor> m_poolDesc;

size_t m_windowWidth, m_windowHeight;
size_t m_horizontalSubsample, m_verticalSubsample;
size_t m_inputSizePerSample, m_outputSizePerSample;
};

// add this at the start of each derived class, to get access to the members of ComputationNode

@@ -471,8 +518,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
static const std::wstring TypeName() { return L"MaxPooling"; }
public:
MaxPoolingNode(DEVICEID_TYPE deviceId, const wstring & name) : Base(deviceId, name) { }
MaxPoolingNode(DEVICEID_TYPE deviceId, const wstring & name, const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample) :
    Base(deviceId, name, windowWidth, windowHeight, horizontalSubsample, verticalSubsample)
MaxPoolingNode(DEVICEID_TYPE deviceId, const wstring & name, const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind) :
    Base(deviceId, name, windowWidth, windowHeight, horizontalSubsample, verticalSubsample, imageLayoutKind)
{ }
MaxPoolingNode(const ScriptableObjects::IConfigRecordPtr configp) :
    Base(configp)

@@ -481,7 +528,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
void Validate(bool isFinalValidationPass) override
{
    Base::Validate(isFinalValidationPass);
    if (m_poolDesc == nullptr)
    if (isFinalValidationPass && m_poolDesc == nullptr)
        m_poolDesc = m_factory->CreatePoolDescriptor(PoolingDescriptor::PoolKind::Max, m_windowWidth, m_windowHeight, m_horizontalSubsample, m_verticalSubsample, 0, 0);
}
};

@@ -500,8 +547,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
static const std::wstring TypeName() { return L"AveragePooling"; }
public:
AveragePoolingNode(DEVICEID_TYPE deviceId, const wstring & name) : Base(deviceId, name) { }
AveragePoolingNode(DEVICEID_TYPE deviceId, const wstring & name, const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample) :
    Base(deviceId, name, windowWidth, windowHeight, horizontalSubsample, verticalSubsample)
AveragePoolingNode(DEVICEID_TYPE deviceId, const wstring & name, const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind) :
    Base(deviceId, name, windowWidth, windowHeight, horizontalSubsample, verticalSubsample, imageLayoutKind)
{ }
AveragePoolingNode(const ScriptableObjects::IConfigRecordPtr configp) :
    Base(configp)

@@ -525,7 +572,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
void Validate(bool isFinalValidationPass) override
{
    Base::Validate(isFinalValidationPass);
    if (m_poolDesc == nullptr)
    if (isFinalValidationPass && m_poolDesc == nullptr)
        m_poolDesc = m_factory->CreatePoolDescriptor(PoolingDescriptor::PoolKind::Average, m_windowWidth, m_windowHeight, m_horizontalSubsample, m_verticalSubsample, 0, 0);
}
};

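Both Validate overrides follow the same pattern: validation may run several passes while shapes are still being inferred, and only the final pass is guaranteed to see settled dimensions, so costly objects are created exactly once, and only then. A hedged generic sketch of that gating (hypothetical helper, not part of this change):

#include <memory>

// Create obj at most once, and only when dimensions are final.
// make must return a std::unique_ptr<T>.
template <typename T, typename Factory>
void EnsureCreated(std::unique_ptr<T>& obj, bool isFinalValidationPass, Factory make)
{
    if (isFinalValidationPass && obj == nullptr)
        obj = make();
}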
@@ -573,6 +620,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {

// Read and check version.
// REVIEW alexeyk: extract version checking so it can be re-used in other places.
// BUGBUG: We must serialize m_inputLayout.
int32_t verWritten;
int32_t verReadable;
fstream >> verWritten >> verReadable;

@@ -683,18 +731,24 @@ namespace Microsoft { namespace MSR { namespace CNTK {

SetDims(Input(0));

if (m_factory == nullptr)
    m_factory = ConvolutionEngineFactory<ElemType>::Create(m_deviceId);
if (m_convEng == nullptr)
    m_convEng = m_factory->CreateConvEngine(m_deviceId, 0);
if (m_inT == nullptr)
    m_inT = m_factory->CreateTensor(m_sampleLayout.GetWidth(), m_sampleLayout.GetHeight(), m_sampleLayout.GetNumChannels(), 1);
if (m_scaleBiasT == nullptr)
if (isFinalValidationPass)
{
    if (m_spatial)
        m_scaleBiasT = m_factory->CreateTensor(1, 1, m_sampleLayout.GetNumChannels(), 1);
    else
        m_scaleBiasT = m_factory->CreateTensor(m_sampleLayout.GetWidth(), m_sampleLayout.GetHeight(), m_sampleLayout.GetNumChannels(), 1);
    const auto m_imageLayoutKind = ImageLayoutKind::CHW; // BUGBUG: Finish this. Must be serialized.
    auto dims = ImageDimensions(GetSampleLayout(), m_imageLayoutKind);

    if (m_factory == nullptr)
        m_factory = ConvolutionEngineFactory<ElemType>::Create(m_deviceId, ConvolutionEngineFactory<ElemType>::EngineType::Auto, m_imageLayoutKind);
    if (m_convEng == nullptr)
        m_convEng = m_factory->CreateConvEngine(m_deviceId, 0);
    if (m_inT == nullptr)
        m_inT = m_factory->CreateTensor(dims.m_width, dims.m_height, dims.m_numChannels, 1);
    if (m_scaleBiasT == nullptr)
    {
        if (m_spatial)
            m_scaleBiasT = m_factory->CreateTensor(1, 1, dims.m_numChannels, 1);
        else
            m_scaleBiasT = m_factory->CreateTensor(dims.m_width, dims.m_height, dims.m_numChannels, 1);
    }
}
}

@@ -740,11 +794,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
VersionInfo m_version;

private:
std::unique_ptr<ConvolutionEngineFactory<ElemType>> m_factory;
std::unique_ptr<ConvolutionEngine<ElemType>> m_convEng;
std::unique_ptr<ConvolutionTensor4D> m_inT;
std::unique_ptr<ConvolutionTensor4D> m_scaleBiasT;

// Determines whether to use training or inference(evaluation) mode.
bool m_eval;
// Determines whether to use per-activation (used after non-convolutional layers like fully connected)

@@ -760,6 +809,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
shared_ptr<Matrix<ElemType>> m_dScale;
// Stores bias derivatives.
shared_ptr<Matrix<ElemType>> m_dBias;

std::unique_ptr<ConvolutionEngineFactory<ElemType>> m_factory;
std::unique_ptr<ConvolutionEngine<ElemType>> m_convEng;
std::unique_ptr<ConvolutionTensor4D> m_inT;
std::unique_ptr<ConvolutionTensor4D> m_scaleBiasT;
};

template class BatchNormalizationNode<float>;

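The m_spatial branch above is the crux of batch normalization's two modes: spatial BN (after convolutions) learns one scale/bias pair per channel, so its tensor is 1 x 1 x C, while per-activation BN (after fully-connected layers) learns one pair per element, W x H x C. A hedged sketch of just the parameter-count arithmetic (hypothetical struct, not the CNTK types):

#include <cstddef>

struct BnDims { std::size_t w, h, c; };

// Spatial BN shares statistics across all spatial positions of a channel;
// per-activation BN treats every (x, y, channel) element independently.
std::size_t ScaleBiasParamCount(const BnDims& d, bool spatial)
{
    return spatial ? d.c             // one (scale, bias) per channel: 1 x 1 x C
                   : d.w * d.h * d.c; // one per activation: W x H x C
}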
@@ -18,6 +18,635 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// This header collects special-purpose nodes.
// It is likely that these are no longer functional.

#ifndef ENABLE_BROADCASTING_ELEMENTTIMES
// -----------------------------------------------------------------------
// ScaleNode (scalar scaling factor, matrix)
//
// Identical to ElementTimesNode with tensor lib (broadcasting). Can be removed.
// -----------------------------------------------------------------------

template<class ElemType>
class ScaleNode : public ComputationNode<ElemType>, public NumInputs<2>
{
    typedef ComputationNode<ElemType> Base; UsingComputationNodeMembersBoilerplate;
    static const std::wstring TypeName() { return L"Scale"; }
public:
    DeclareConstructorFromConfigWithNumInputs(ScaleNode);
    ScaleNode(DEVICEID_TYPE deviceId, const wstring & name) :
        Base(deviceId, name)
    { }

    virtual void /*ComputationNode::*/BackpropTo(const size_t inputIndex, const FrameRange & fr) override
    {
#ifdef ENABLE_TENSORVIEW // This takes a big perf hit since our reduction uses only a single thread in this case. Needs to be fixed.
        size_t rank = DetermineElementwiseTensorRank();
        auto gradient = GradientTensorFor(rank, fr);
        auto inputGradient = Input(inputIndex)->GradientTensorFor(rank, fr.AllowBroadcast());
        auto otherInputValue = Input(1 - inputIndex)->ValueTensorFor(rank, fr.AllowBroadcast());

        // if reduction then mask the respective input(s) (zero out the gaps)
        if (Input(inputIndex)->GetNumCols() < GetNumCols())
            MaskMissingGradientColumnsToZero(fr);
        if (Input(inputIndex)->GetNumCols() < Input(1 - inputIndex)->GetNumCols())
            Input(1 - inputIndex)->MaskMissingValueColumnsToZero(fr);

        inputGradient.AddElementwiseProductOf(gradient, otherInputValue);
#else
        if (inputIndex == 0) // left derivative
        {
            // this is a reduction over frames, so we must mask gaps to zero
            Input(0)->Gradient() += Matrix<ElemType>::InnerProductOfMatrices(MaskedGradientFor(fr), Input(1)->MaskedValueFor(fr)); // element-wise product summed up over all
        }
        else if (inputIndex == 1) // right derivative
        {
            Matrix<ElemType> sliceInput1Grad = Input(1)->GradientFor(fr);
            Matrix<ElemType>::Multiply1x1AndWeightedAdd(+1.0f, Input(0)->Value()/*1x1*/, GradientFor(fr), 1.0f, sliceInput1Grad);
        }
#endif
    }

    virtual bool OutputUsedInComputingInputNodesGradients() const override
    {
        // The ScaleNode does not require its output value for computing
        // the gradients of its input nodes
        return false;
    }

    virtual void /*ComputationNode::*/ForwardProp(const FrameRange & fr) override
    {
#ifdef ENABLE_TENSORVIEW
        static int c = 0; if (c++ == 0) { fprintf(stderr, "#SCALE#\n"); }
        size_t rank = DetermineElementwiseTensorRank();
        auto result = ValueTensorFor(rank, fr);
        auto input0 = Input(0)->ValueTensorFor(rank, fr.AllowBroadcast());
        auto input1 = Input(1)->ValueTensorFor(rank, fr.AllowBroadcast());
        result.AssignElementwiseProductOf(input0, input1);
#else
        ValueFor(fr).Assign1x1ProductOf(Input(0)->Value()/*1x1*/, Input(1)->ValueFor(fr));
#endif
    }

    virtual void /*ComputationNodeBase::*/Validate(bool isFinalValidationPass) override
    {
        Base::Validate(isFinalValidationPass);
        InferMBLayoutFromInputsForStandardCase();

        // left node must be a scalar
        if (isFinalValidationPass && (Input(0)->GetNumRows() != 1 || Input(0)->GetNumCols() != 1))
            RuntimeError("The left value of ScaleNode must be a scalar value.");

        SetDims(Input(1));
    }
};

template class ScaleNode<float>;
template class ScaleNode<double>;

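The two branches of BackpropTo above encode the two partials of y = s * X: the matrix side just receives s times the incoming gradient, while the scalar side reduces over all elements. A hedged standalone sketch of that reduction, with plain loops instead of the matrix library (hypothetical helper):

#include <cstddef>
#include <vector>

// For y[i] = s * x[i]:  dL/dx[i] = s * g[i]  and  dL/ds = sum_i g[i] * x[i],
// where g = dL/dy. The scalar's partial is a full reduction -- the
// InnerProductOfMatrices call in the node above.
double ScalarGradient(const std::vector<double>& g, const std::vector<double>& x)
{
    double ds = 0.0;
    for (std::size_t i = 0; i < g.size(); i++)
        ds += g[i] * x[i];
    return ds;
}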
// -----------------------------------------------------------------------
// RowElementTimesNode (left, right) --TODO: what are left and right?
//
// TODO: This is subsumed by ElementTimes with tensor lib.
// -----------------------------------------------------------------------

template<class ElemType>
class RowElementTimesNode : public ComputationNode<ElemType>, public NumInputs<2>
{
    typedef ComputationNode<ElemType> Base; UsingComputationNodeMembersBoilerplate;
    static const std::wstring TypeName() { return L"RowElementTimes"; }
public:
    DeclareConstructorFromConfigWithNumInputs(RowElementTimesNode);
    RowElementTimesNode(DEVICEID_TYPE deviceId, const wstring & name) :
        Base(deviceId, name)
    { }

    void BackpropToMap(const size_t inputIndex)
    {
        if (inputIndex > 1)
            InvalidArgument("RowElementTimes operation only takes two inputs.");

        if (inputIndex == 0)
        {
            BackpropToLeftS(Input(1)->Value(), Input(0)->Gradient(), Gradient(), *m_tempMatrix);
        }
        else
        {
            BackpropToRightS(Input(0)->Value(), Input(1)->Gradient(), Gradient(), *m_tempMatrix);
        }
    }

    virtual void /*ComputationNode::*/BackpropTo(const size_t inputIndex, const FrameRange & fr) override
    {
        if (fr.IsAllFrames()) { BackpropToMap(inputIndex); return; } // TODO: remove these one by one
        Matrix<ElemType> sliceInput0Grad = Input(inputIndex)->GradientFor(fr);
        Matrix<ElemType> sliceOutputGrad = GradientFor(fr);

        Matrix<ElemType> sliceInput1Value = Input(1 - inputIndex)->ValueFor(fr);

        if (inputIndex == 0)
        {
            BackpropToLeftS(sliceInput1Value, sliceInput0Grad, sliceOutputGrad, *m_tempMatrix);
        }
        else
        {
            BackpropToRightS(sliceInput1Value, sliceInput0Grad, sliceOutputGrad, *m_tempMatrix);
        }
    }

    virtual bool OutputUsedInComputingInputNodesGradients() const override
    {
        // The RowElementTimesNode does not require its output value for computing
        // the gradients of its input nodes
        return false;
    }

    //left (input 0) is a matrix
    /*TODO: merge with call site*/void BackpropToLeftS(Matrix<ElemType>& input1FunctionValues,
        Matrix<ElemType>& input0GradientValues,
        const Matrix<ElemType>& gradientValues,
        Matrix<ElemType>& tempMatrix)
    {
        tempMatrix.SetValue(gradientValues);
        tempMatrix.RowElementMultiplyWith(input1FunctionValues);
        input0GradientValues += tempMatrix;

#if NANCHECK
        input0GradientValues.HasNan("RowElementTimes");
#endif
    }

    //right (input 1) is a row vector
    /*TODO: merge with call site*/void BackpropToRightS(Matrix<ElemType>& input0FunctionValues,
        Matrix<ElemType>& input1GradientValues,
        const Matrix<ElemType>& gradientValues,
        Matrix<ElemType>& tempMatrix)
    {
        tempMatrix.AssignInnerProductOf(gradientValues, input0FunctionValues, true);
        input1GradientValues += tempMatrix;

#if NANCHECK
        input1GradientValues.HasNan("RowElementTimes");
#endif
    }
    void ForwardPropMap() // TODO: This is a stop-gap; in most cases, we should just be able to delete this (but need to review one by one)
    {
        ForwardPropS(Value(), Input(0)->Value(), Input(1)->Value());
    }

    virtual void /*ComputationNode::*/ForwardProp(const FrameRange & fr) override
    {
        //if (fr.IsAllFrames()) { ForwardPropMap(); return; }
        Matrix<ElemType> sliceInput0Value = Input(0)->ValueFor(fr);
        Matrix<ElemType> sliceInput1Value = Input(1)->ValueFor(fr);
        Matrix<ElemType> sliceOutputValue = ValueFor(fr);

        ForwardPropS(sliceOutputValue, sliceInput0Value, sliceInput1Value);
    }

    /*TODO: merge with call site*/void ForwardPropS(Matrix<ElemType>& functionValues, const Matrix<ElemType>& input0, const Matrix<ElemType>& input1)
    {
        functionValues.SetValue(input0);
        functionValues.RowElementMultiplyWith(input1);

#if NANCHECK
        functionValues.HasNan("RowElementTimes");
#endif
    }

    virtual void /*ComputationNodeBase::*/Validate(bool isFinalValidationPass) override
    {
        Base::Validate(isFinalValidationPass);
        InferMBLayoutFromInputsForStandardCase();

        size_t rows0 = Input(0)->GetNumRows(), cols0 = Input(0)->GetNumCols();
        size_t rows1 = Input(1)->GetNumRows(), cols1 = Input(1)->GetNumCols(); rows0;
        if (isFinalValidationPass && (cols0 != cols1 || rows1 != 1))
            LogicError("RowElementTimes: Either the second operand is not a row vector or the number of columns of operands does not match.");

        SetDims(Input(0));
    }

    //request matrices that are needed for gradient computation
    virtual void RequestMatricesBeforeBackprop(MatrixPool& matrixPool)
    {
        Base::RequestMatricesBeforeBackprop(matrixPool);
        RequestMatrixFromPool(m_tempMatrix, matrixPool);
    }

    //release gradient and temp matrices that are no longer needed after all the children's gradients are computed.
    virtual void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool)
    {
        Base::ReleaseMatricesAfterBackprop(matrixPool);
        ReleaseMatrixToPool(m_tempMatrix, matrixPool);
    }

private:
    shared_ptr<Matrix<ElemType>> m_tempMatrix;
};

template class RowElementTimesNode<float>;
template class RowElementTimesNode<double>;

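To pin down the semantics the Validate check enforces: input 0 is an R x C matrix and input 1 a 1 x C row vector, and every row of the matrix is multiplied elementwise by that same vector. A minimal hedged sketch with plain arrays (hypothetical row-major layout, not the Matrix class):

#include <cstddef>
#include <vector>

// out(i,j) = in0(i,j) * in1(j): each column j is scaled by the vector's
// j-th entry, identically for every row i.
void RowElementTimes(const std::vector<double>& in0, const std::vector<double>& in1,
                     std::vector<double>& out, std::size_t R, std::size_t C)
{
    out.resize(R * C);
    for (std::size_t i = 0; i < R; i++)
        for (std::size_t j = 0; j < C; j++)
            out[i * C + j] = in0[i * C + j] * in1[j];
}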
// -----------------------------------------------------------------------
// ColumnElementTimesNode (left, right) --TODO: what are left and right?
//
// TODO: This is subsumed by ElementTimes with tensor lib.
// -----------------------------------------------------------------------

template<class ElemType>
class ColumnElementTimesNode : public ComputationNode<ElemType>, public NumInputs<2>
{
    typedef ComputationNode<ElemType> Base; UsingComputationNodeMembersBoilerplate;
    static const std::wstring TypeName() { return L"ColumnElementTimes"; }
public:
    DeclareConstructorFromConfigWithNumInputs(ColumnElementTimesNode);
    ColumnElementTimesNode(DEVICEID_TYPE deviceId, const wstring & name) :
        Base(deviceId, name)
    { }

    void BackpropToMap(const size_t inputIndex)
    {
        if (inputIndex > 1)
            InvalidArgument("ColumnElementTimes operation only takes two inputs.");

        if (inputIndex == 0)
        {
            BackpropToLeftS(Input(1)->Value(), Input(0)->Gradient(), Gradient(), *m_tempMatrix);
        }
        else
        {
            BackpropToRightS(Input(0)->Value(), Input(1)->Gradient(), Gradient(), *m_tempMatrix);
        }
    }

    virtual void /*ComputationNode::*/BackpropTo(const size_t inputIndex, const FrameRange & fr) override
    {
        if (fr.IsAllFrames()) { BackpropToMap(inputIndex); return; } // TODO: remove these one by one
        Matrix<ElemType> sliceOutputGrad = GradientFor(fr);

        if (inputIndex == 0)
        {
            Matrix<ElemType> sliceInput0Grad = Input(0)->GradientFor(fr);

            BackpropToLeftS(Input(1)->Value(), sliceInput0Grad, sliceOutputGrad, *m_tempMatrix);
        }
        else
        {
            Matrix<ElemType> sliceInput0Value = Input(0)->ValueFor(fr);
            BackpropToRightS(sliceInput0Value, Input(1)->Gradient(), sliceOutputGrad, *m_tempMatrix);
        }
    }

    virtual bool OutputUsedInComputingInputNodesGradients() const override
    {
        // The ColumnElementTimesNode does not require its output value for computing
        // the gradients of its input nodes
        return false;
    }

    //left (input 0) is a matrix
    /*TODO: merge with call site*/void BackpropToLeftS(Matrix<ElemType>& input1FunctionValues,
        Matrix<ElemType>& input0GradientValues,
        const Matrix<ElemType>& gradientValues,
        Matrix<ElemType>& tempMatrix)
    {
        tempMatrix.SetValue(gradientValues);
        tempMatrix.ColumnElementMultiplyWith(input1FunctionValues);
        input0GradientValues += tempMatrix;

#if NANCHECK
        input0GradientValues.HasNan("ColumnElementTimes");
#endif
    }

    //right (input 1) is a col vector
    /*TODO: merge with call site*/void BackpropToRightS(Matrix<ElemType>& input0FunctionValues,
        Matrix<ElemType>& input1GradientValues,
        const Matrix<ElemType>& gradientValues,
        Matrix<ElemType>& tempMatrix)
    {
        tempMatrix.AssignInnerProductOf(gradientValues, input0FunctionValues, false);
        input1GradientValues += tempMatrix;

#if NANCHECK
        input1GradientValues.HasNan("ColumnElementTimes");
#endif
    }
    void ForwardPropMap() // TODO: This is a stop-gap; in most cases, we should just be able to delete this (but need to review one by one)
    {
        ForwardPropS(Value(), Input(0)->Value(), Input(1)->Value());
    }

    virtual void /*ComputationNode::*/ForwardProp(const FrameRange & fr) override
    {
        //if (fr.IsAllFrames()) { ForwardPropMap(); return; }
        Matrix<ElemType> sliceInput0Value = Input(0)->ValueFor(fr);
        Matrix<ElemType> sliceOutputValue = ValueFor(fr);

        ForwardPropS(sliceOutputValue, sliceInput0Value, Input(1)->Value());
    }

    /*TODO: merge with call site*/void ForwardPropS(Matrix<ElemType>& functionValues, const Matrix<ElemType>& input0, const Matrix<ElemType>& input1)
    {
        functionValues.SetValue(input0);
        functionValues.ColumnElementMultiplyWith(input1);

#if NANCHECK
        functionValues.HasNan("ColumnElementTimes");
#endif
    }

    virtual void /*ComputationNodeBase::*/Validate(bool isFinalValidationPass) override
    {
        Base::Validate(isFinalValidationPass);
        InferMBLayoutFromInputsForStandardCase();

        //derive number of rows if possible
        for (size_t index = 0; index < 2; index++)
        {
            size_t rows = Input(index)->GetNumRows() == 0 ? Input(1 - index)->GetNumRows() : Input(index)->GetNumRows();
            size_t cols = Input(index)->GetNumCols() == 0 ? Input(1 - index)->GetNumCols() : Input(index)->GetNumCols();
            ValidateInferInputDims(index, rows, cols);
        }

        size_t rows0 = Input(0)->GetNumRows(), cols0 = Input(0)->GetNumCols();
        size_t rows1 = Input(1)->GetNumRows(), cols1 = Input(1)->GetNumCols(); cols0;
        if (isFinalValidationPass && (rows0 != rows1 || cols1 != 1))
            LogicError("ColumnElementTimes: Either the second operand is not a column vector or the number of rows of operands does not match.");

        SetDims(Input(0));
    }

    //request matrices that are needed for gradient computation
    virtual void RequestMatricesBeforeBackprop(MatrixPool& matrixPool)
    {
        Base::RequestMatricesBeforeBackprop(matrixPool);
        RequestMatrixFromPool(m_tempMatrix, matrixPool);
    }

    //release gradient and temp matrices that are no longer needed after all the children's gradients are computed.
    virtual void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool)
    {
        Base::ReleaseMatricesAfterBackprop(matrixPool);
        ReleaseMatrixToPool(m_tempMatrix, matrixPool);
    }

private:
    shared_ptr<Matrix<ElemType>> m_tempMatrix;
};

template class ColumnElementTimesNode<float>;
template class ColumnElementTimesNode<double>;

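Unlike its row counterpart, Validate here first tries to fill in unknown dimensions from the sibling input before checking the column-vector constraint. A hedged one-liner capturing that inference rule (hypothetical helper):

#include <cstddef>

// A dimension of 0 means "not yet known": borrow it from the sibling input,
// and only run the shape check once both sides are settled.
std::size_t InferDim(std::size_t mine, std::size_t sibling)
{
    return mine == 0 ? sibling : mine;
}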
// -----------------------------------------------------------------------
// RectifiedLinearNode (input) -- ReLU non-linearity
// -----------------------------------------------------------------------

template<class ElemType>
class RectifiedLinearNode : public SoftmaxNodeBase<ElemType>
{
    typedef SoftmaxNodeBase<ElemType> Base; UsingSoftmaxNodeBaseMembers;
    static const std::wstring TypeName() { return L"RectifiedLinear"; }
public:
    DeclareConstructorFromConfigWithNumInputs(RectifiedLinearNode);
    RectifiedLinearNode(DEVICEID_TYPE deviceId, const wstring & name) :
        Base(deviceId, name)
    { }

    void BackpropToV(Matrix<ElemType>& gradient, const Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues, const Matrix<ElemType>& functionValues) override
    {
        gradient.AssignLinearRectifierDerivativeOf(inputFunctionValues);
#if DUMPOUTPUT
        inputGradientValues.Print("RectifiedLinearNode-Partial-in");
#endif
        inputGradientValues.AddElementProductOf(gradientValues, gradient);
#if DUMPOUTPUT
        inputGradientValues.Print("RectifiedLinearNode-Partial-out");
#endif
    }

    virtual bool OutputUsedInComputingInputNodesGradients() const override
    {
        // The ReLU node does not require its output value for computing
        // the gradients of its input nodes
        return false;
    }

    void ForwardPropV(Matrix<ElemType>& functionValues, const Matrix<ElemType>& inputFunctionValues) override
    {
        functionValues.AssignTruncateBottomOf(inputFunctionValues, 0);
#if DUMPOUTPUT
        functionValues.Print("RectifiedLinearNode");
#endif
    }
};

template class RectifiedLinearNode<float>;
template class RectifiedLinearNode<double>;

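ReLU is the one non-linearity here whose derivative is computed from the input rather than the output: forward is max(0, x), and the backward mask passes the gradient through only where x was positive. A hedged scalar sketch:

#include <algorithm>

// Forward: y = max(0, x). Backward: dL/dx = dL/dy where x > 0, else 0 --
// the "LinearRectifierDerivative" mask used by the node above.
double ReluForward(double x) { return std::max(0.0, x); }
double ReluBackward(double x, double g) { return x > 0.0 ? g : 0.0; }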
// -----------------------------------------------------------------------
// SigmoidNode (input) -- sigmoid non-linearity
// -----------------------------------------------------------------------

template<class ElemType>
class SigmoidNode : public SoftmaxNodeBase<ElemType>
{
    typedef SoftmaxNodeBase<ElemType> Base; UsingSoftmaxNodeBaseMembers;
    static const std::wstring TypeName() { return L"Sigmoid"; }
public:
    DeclareConstructorFromConfigWithNumInputs(SigmoidNode);
    SigmoidNode(DEVICEID_TYPE deviceId, const wstring & name) :
        Base(deviceId, name)
    { }

    virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override
    {
        // The SigmoidNode does not require any of its inputs' values for computing
        // the gradients of its input nodes
        UNREFERENCED_PARAMETER(childIndex);
        return false;
    }

    /*virtual*/ void BackpropToV(Matrix<ElemType>& gradient, const Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues, const Matrix<ElemType>& functionValues)
    {
        gradient.AssignSigmoidDerivativeOf(functionValues);
        inputGradientValues.AddElementProductOf(gradientValues, gradient);
    }

    /*virtual*/ void ForwardPropV(Matrix<ElemType>& functionValues, const Matrix<ElemType>& inputFunctionValues) override
    {
        functionValues.AssignSigmoidOf(inputFunctionValues);
    }
};

template class SigmoidNode<float>;
template class SigmoidNode<double>;

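Sigmoid, and tanh just below, can express their derivatives purely in terms of the already-computed output, which is why these nodes declare that they do not need their inputs' values for backprop. A hedged sketch of both identities:

// sigma'(x) = y * (1 - y) and tanh'(x) = 1 - y^2, each written purely in
// terms of the forward output y; the input x is not needed once y is known.
double SigmoidBackward(double y, double g) { return g * y * (1.0 - y); }
double TanhBackward(double y, double g) { return g * (1.0 - y * y); }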
// -----------------------------------------------------------------------
// TanhNode (input) -- tanh non-linearity
// -----------------------------------------------------------------------

template<class ElemType>
class TanhNode : public SoftmaxNodeBase<ElemType>
{
    typedef SoftmaxNodeBase<ElemType> Base; UsingSoftmaxNodeBaseMembers;
    static const std::wstring TypeName() { return L"Tanh"; }
public:
    DeclareConstructorFromConfigWithNumInputs(TanhNode);
    TanhNode(DEVICEID_TYPE deviceId, const wstring & name) :
        Base(deviceId, name)
    { }

    virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override
    {
        // The TanhNode does not require any of its inputs' values for computing
        // the gradients of its input nodes
        UNREFERENCED_PARAMETER(childIndex);
        return false;
    }

    /*virtual*/ void BackpropToV(Matrix<ElemType>& gradient, const Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues, const Matrix<ElemType>& functionValues)
    {
        gradient.AssignElementProductOf(functionValues, functionValues); // v .* v
        gradient.AssignDifferenceOf(1, gradient); // 1-v^2

        inputGradientValues.AddElementProductOf(gradientValues, gradient); // += d .* (1 - v.^2)
    }

    /*virtual*/ void ForwardPropV(Matrix<ElemType>& functionValues, const Matrix<ElemType>& inputFunctionValues) override
    {
        functionValues.AssignTanhOf(inputFunctionValues);
    }
};

template class TanhNode<float>;
template class TanhNode<double>;

// -----------------------------------------------------------------------
// LogNode (input) -- component-wise log() of input
// -----------------------------------------------------------------------

template<class ElemType>
class LogNode : public SoftmaxNodeBase<ElemType>
{
    typedef SoftmaxNodeBase<ElemType> Base; UsingSoftmaxNodeBaseMembers;
    static const std::wstring TypeName() { return L"Log"; }
public:
    DeclareConstructorFromConfigWithNumInputs(LogNode);
    LogNode(DEVICEID_TYPE deviceId, const wstring & name) :
        Base(deviceId, name)
    { }

    virtual bool OutputUsedInComputingInputNodesGradients() const override
    {
        // The LogNode does not require its output value for computing
        // the gradients of its input nodes
        return false;
    }

    /*virtual*/ void BackpropToV(Matrix<ElemType>& gradient, const Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues, const Matrix<ElemType>& functionValues)
    {
        gradient.AssignElementInverseOf(inputFunctionValues); // 1/x (x is input to log(x))
        inputGradientValues.AddElementProductOf(gradientValues, gradient);
        // TODO: with tensor lib:
        //inputGradientValues.AddElementDivisionOf(gradientValues, inputFunctionValues); // 1/x (x is input to log(x))
    }

    /*virtual*/ void ForwardPropV(Matrix<ElemType>& functionValues, const Matrix<ElemType>& inputFunctionValues) override
    {
        functionValues.AssignLogOf(inputFunctionValues);
    }
};

template class LogNode<float>;
template class LogNode<double>;

// -----------------------------------------------------------------------
// ExpNode (input) -- component-wise exp() of input
// -----------------------------------------------------------------------

template<class ElemType>
class ExpNode : public SoftmaxNodeBase<ElemType>
{
    typedef SoftmaxNodeBase<ElemType> Base; UsingSoftmaxNodeBaseMembers;
    static const std::wstring TypeName() { return L"Exp"; }
public:
    DeclareConstructorFromConfigWithNumInputs(ExpNode);
    ExpNode(DEVICEID_TYPE deviceId, const wstring & name) :
        Base(deviceId, name)
    { }

    virtual void /*ComputationNode::*/BackpropTo(const size_t inputIndex, const FrameRange & fr) override
    {
        assert(inputIndex == 0); inputIndex;

        Matrix<ElemType> sliceInputGrad = Input(0)->GradientFor(fr);
        Matrix<ElemType> sliceOutputGrad = GradientFor(fr);
        Matrix<ElemType> sliceInputValue = Input(0)->ValueFor(fr);

        m_gradientTemp->AssignExpOf(sliceInputValue); // Exp(x) is its own partial
        sliceInputGrad.AddElementProductOf(sliceOutputGrad, *m_gradientTemp);
        // TODO: with tensor lib:
        // sliceInputGrad.AddElementProductOf(sliceOutputGrad, functionValues);
        // and set OutputUsed
    }

    virtual bool OutputUsedInComputingInputNodesGradients() const override
    {
        // The ExpNode does not require its output value for computing
        // the gradients of its input nodes
        return false;
    }

    virtual void BackpropToV(Matrix<ElemType>& gradient, const Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues, const Matrix<ElemType>& functionValues) override { NOT_IMPLEMENTED; } // not needed

    void ForwardPropV(Matrix<ElemType>& functionValues, const Matrix<ElemType>& inputFunctionValues) override
    {
        functionValues.AssignExpOf(inputFunctionValues);
    }
};

template class ExpNode<float>;
template class ExpNode<double>;

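The TODO in ExpNode's BackpropTo is worth spelling out: since exp is its own derivative, the backward pass currently recomputes exp(x) into a temporary, but it could instead reuse the forward output and declare that output as used. A hedged scalar sketch of both variants:

#include <cmath>

// Variant used above: recompute the partial exp(x) from the saved input.
double ExpBackwardFromInput(double x, double g) { return g * std::exp(x); }
// Variant in the TODO: reuse y = exp(x) from the forward pass, at the cost
// of declaring the output as used by backprop.
double ExpBackwardFromOutput(double y, double g) { return g * y; }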
// -----------------------------------------------------------------------
// CosineNode (input) -- component-wise cos() of input
// -----------------------------------------------------------------------

template<class ElemType>
class CosineNode : public SoftmaxNodeBase<ElemType>
{
    typedef SoftmaxNodeBase<ElemType> Base; UsingSoftmaxNodeBaseMembers;
    static const std::wstring TypeName() { return L"Cosine"; }
public:
    DeclareConstructorFromConfigWithNumInputs(CosineNode);
    CosineNode(DEVICEID_TYPE deviceId, const wstring & name) :
        Base(deviceId, name)
    { }

    virtual bool OutputUsedInComputingInputNodesGradients() const override
    {
        // The CosineNode does not require its output value for computing
        // the gradients of its input nodes
        return false;
    }

    /*virtual*/ void BackpropToV(Matrix<ElemType>& gradient, const Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues, const Matrix<ElemType>& functionValues)
    {
        gradient.AssignNegativeSineOf(inputFunctionValues); // -sin(x) (x is input to Cosine(x))
        inputGradientValues.AddElementProductOf(gradientValues, gradient);
        // TODO: tensor lib: make a joint kernel, since neg sin is never used for anything else
    }

    /*virtual*/ void ForwardPropV(Matrix<ElemType>& functionValues, const Matrix<ElemType>& inputFunctionValues) override
    {
        functionValues.AssignCosineOf(inputFunctionValues);
    }
};

template class CosineNode<float>;
template class CosineNode<double>;
#endif

// -----------------------------------------------------------------------
/// DummyCriterionNode (objectives, derivatives, prediction)
// -----------------------------------------------------------------------

@@ -28,6 +28,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// -----------------------------------------------------------------------
// LearnableParameter (/*no input*/)
// represents weight matrices and biases
// TODO: add -Node to the class name
// -----------------------------------------------------------------------

template<class ElemType>

@@ -42,18 +43,31 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_parameterUpdateRequired = true;
SetDims(TensorShape(), 0);
}
LearnableParameter(DEVICEID_TYPE deviceId, const wstring & name, size_t rows, size_t cols) :
LearnableParameter(DEVICEID_TYPE deviceId, const wstring & name, const TensorShape & shape) :
    Base(deviceId, name)
{
    m_parameterUpdateRequired = true;
    CreateMatrixIfNull(m_value);
    SetDims(TensorShape(rows), cols);
    // for now we split off the trailing dimension into the matrix column dimension
    // TODO: This is for compat, but it is inconsistent. Decide what a sample layout means for a node without MBLayout w.r.t. non-tensor ops.
    auto dims = shape.GetDims();
    size_t cols = 1;
    if (dims.size() > 1)
    {
        cols = dims.back();
        dims.resize(dims.size() - 1);
    }
    SetDims(TensorShape(dims), cols);
    UpdateFunctionValuesSize(); // this allocates the matrix
    Value().SetValue(0);
}
LearnableParameter(DEVICEID_TYPE deviceId, const wstring & name, size_t rows, size_t cols) :
    LearnableParameter(deviceId, name, TensorShape(rows, cols))
{ }

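The compatibility trick in the TensorShape constructor above, peeling off the trailing dimension as the matrix column count, is easy to get wrong, so here is a hedged sketch of just that transformation (plain vectors instead of TensorShape, hypothetical helper name):

#include <cstddef>
#include <utility>
#include <vector>

// [784, 10]    -> sample layout [784], 10 columns
// [28, 28, 10] -> sample layout [28, 28], 10 columns
// [784]        -> sample layout [784], 1 column (nothing to split off)
std::pair<std::vector<std::size_t>, std::size_t> SplitTrailingDim(std::vector<std::size_t> dims)
{
    std::size_t cols = 1;
    if (dims.size() > 1)
    {
        cols = dims.back();
        dims.pop_back();
    }
    return { std::move(dims), cols };
}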
LearnableParameter(const ScriptableObjects::IConfigRecordPtr configp) :
    LearnableParameter(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"rows"), configp->Get(L"cols"))
    LearnableParameter(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"shape"))
{
    // TODO: Change dimensions to take a generic tensor instead. That will be a (minor) breaking change that will require fix-ups when converting from NDL to BrainScript.
    AttachInputs(configp, this->GetExpectedNumInputs());
    // parameters[rows, [cols=1]] plus other optional parameters (needGradient=[true|false], init=[uniform|gaussian|fixedvalue], initValueScale=[1|float], value=[0|float])
    // TODO: "needGradient" should be renamed to better match m_parameterUpdateRequired

@@ -83,7 +97,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
    Base::Save(fstream);
    fstream << m_parameterUpdateRequired;
    fstream << GetNumRows() << GetNumCols();
    fstream << (size_t)0/*#rows in a legacy file format*/ << GetNumCols();
    m_sampleLayout.Save(fstream);
    fstream << Value();
}

@@ -95,8 +110,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    fstream >> m_parameterUpdateRequired;
    fstream >> rows >> cols;

    SetDims(TensorShape(rows), cols);
    TensorShape sampleLayout;
    if (rows != 0) // legacy file format
        sampleLayout = TensorShape(rows);
    else
        sampleLayout.Load(fstream, /*acceptLegacyFormat=*/true);
    LoadValue(fstream);
    SetDims(sampleLayout, cols); // note: call this after LoadValue() since LoadValue() overwrites m_sampleLayout
}

// initialize with random numbers

@@ -106,13 +126,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    bool initOnCPUOnly) // if true then always init on CPU, making initialization consistent across both (for testing)
{
    size_t inputSize = GetNumCols();
    //fprintf(stderr, "%d x %d: %d %ls\n", (int)GetNumRows(), (int)GetNumCols(), (int)randomSeed, NodeName().c_str());

    // the random seed offset is set via the "randomSeedOffset" parameter in config
    if (initOnCPUOnly)
        m_value->TransferToDeviceIfNotThereAndNotAutoPlace(CPUDEVICE, true);
    if (uniformInit)
    {
        ElemType randRange = 0.05f * initValueScale; //initValueScale/sqrt(inputSize);
        // TODO: move these crazy extra factors out from here and into NDL, and make them visible in BS
        ElemType randRange = 0.05f * initValueScale;
        Value().SetUniformRandomValue(-randRange, randRange, randomSeed);
    }
    else

@@ -221,6 +243,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// InputValueBase (/*no input*/)
// Base class for InputValue and SparseInputValue (typically fed by a DataReader)
// this covers four types: (regular vs. image) x (non-sparse vs. sparse)
// TODO: add -Node to the class names
// -----------------------------------------------------------------------

template<class ElemType>

@@ -228,59 +251,47 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
typedef ComputationNode<ElemType> Base; UsingComputationNodeMembers;

void Init(const TensorShape & sampleLayout, size_t cols, bool isSparse)
void Init(const TensorShape & sampleLayout, bool isSparse)
{
    m_isSparse = isSparse;
    CreateMatrixIfNull(m_value);
    if (isSparse)
        ConvertToSparseMatrix();

    SetDims(sampleLayout, cols);
    SetDims(sampleLayout, 0);
    UpdateFunctionValuesSize(); // we must allocate the matrix so that the readers get objects with valid row dimensions (some readers expect that)
    m_parameterUpdateRequired = false;
}
protected:
InputValueBase(DEVICEID_TYPE deviceId, const wstring & name, const TensorShape & sampleLayout, bool isSparse) :
    Base(deviceId, name)
{
    Init(sampleLayout, isSparse);
}
InputValueBase(DEVICEID_TYPE deviceId, const wstring & name, size_t rows, bool isSparse) :
    InputValueBase(deviceId, name, TensorShape(rows), isSparse)
{ }
InputValueBase(DEVICEID_TYPE deviceId, const wstring & name, bool isSparse) :
    Base(deviceId, name)
{
    Init(TensorShape(), 0, isSparse);
}
InputValueBase(DEVICEID_TYPE deviceId, const wstring & name, size_t rows, size_t cols, bool isSparse) :
    Base(deviceId, name)
{
    Init(TensorShape(rows), cols, isSparse);
}
InputValueBase(DEVICEID_TYPE deviceId, const wstring & name, const TensorShape & imageLayout, size_t numImages, bool isSparse) :
    Base(deviceId, name)
{
    size_t cols = numImages;
    Init(imageLayout, cols, isSparse);
}
    InputValueBase(deviceId, name, TensorShape(), isSparse)
{ }
InputValueBase(const ScriptableObjects::IConfigRecordPtr configp, bool isSparse) :
    Base(configp->Get(L"deviceId"), L"<placeholder>")
{
    AttachInputs(configp, this->GetExpectedNumInputs());
    bool isImage = configp->Get(L"isImage");
    if (!isImage)
    {
        size_t rows = configp->Get(L"rows");
        size_t cols = configp->Get(L"cols");
        Init(TensorShape(rows), cols, isSparse); // no tensor, just a vector
    }
        Init(configp->Get(L"shape"), isSparse);
    else
    {
        size_t cols = configp->Get(L"numImages"); // This is actually the MB size. --TODO: No need to specify it?
        Init(ImageLayoutWHC(configp->Get(L"imageWidth"), configp->Get(L"imageHeight"), configp->Get(L"imageChannels")), cols, isSparse);
    }
        Init(ImageDimensions::AsTensorShape(configp->Get(L"imageWidth"), configp->Get(L"imageHeight"), configp->Get(L"imageChannels"), ImageLayoutKindFrom(configp->Get(L"imageLayout"))), isSparse);
}
public:

virtual void Save(File& fstream) const override
{
    Base::Save(fstream);
    size_t rows = GetNumRows(); // using explicitly typed variables to be 100% symmetrical to Load()
    size_t cols = m_pMBLayout ? 0 : GetNumCols(); // if this Input depends on MB size, we write it as having 0 dimensions
    fstream << rows << cols;
    size_t rows = GetNumRows(); // using explicitly typed variables to be 100% symmetrical to Load()
    size_t colsDummy = 0; // This should not be saved. InputValues are always minibatches.
    fstream << rows << colsDummy;
    m_sampleLayout.Save(fstream);
}

@@ -288,13 +299,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
    Base::Load(fstream, modelVersion);

    size_t rows, cols;
    fstream >> rows >> cols;
    // some older files retained the #columns when saving, which is meaningless
    if (m_pMBLayout)
        cols = 0;
    size_t rows, colsDummy;
    fstream >> rows >> colsDummy;
    TensorShape sampleLayout;
    sampleLayout.Load(fstream);
    sampleLayout.Load(fstream, /*acceptLegacyFormat=*/true);
    // some older files may have inconsistent tensor information
    if (rows != sampleLayout.GetNumElements())
    {

@@ -302,7 +310,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        NodeName().c_str(), string(sampleLayout).c_str(), (int)rows);
        sampleLayout = TensorShape(rows);
    }
    Init(sampleLayout, cols, m_isSparse);
    Init(sampleLayout, m_isSparse);
}

// InputValue must not resize its inputs because that might destroy it. It should already have the correct size.

@@ -347,11 +355,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
InputValue(DEVICEID_TYPE deviceId, const wstring & name) :
    Base(deviceId, name, false)
{ }
InputValue(DEVICEID_TYPE deviceId, const wstring & name, size_t rows, size_t cols) :
    Base(deviceId, name, rows, cols, false)
InputValue(DEVICEID_TYPE deviceId, const wstring & name, size_t rows) :
    Base(deviceId, name, rows, false)
{ }
InputValue(DEVICEID_TYPE deviceId, const wstring & name, const TensorShape & imageLayout, size_t numImages) :
    Base(deviceId, name, imageLayout, numImages, false)
InputValue(DEVICEID_TYPE deviceId, const wstring & name, const TensorShape & sampleLayout) :
    Base(deviceId, name, sampleLayout, false)
{ }
InputValue(const ScriptableObjects::IConfigRecordPtr configp) :
    Base(configp, false)

@@ -376,11 +384,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
SparseInputValue(DEVICEID_TYPE deviceId, const wstring & name) :
    Base(deviceId, name, true)
{ }
SparseInputValue(DEVICEID_TYPE deviceId, const wstring & name, size_t rows, size_t cols) :
    Base(deviceId, name, rows, cols, true)
SparseInputValue(DEVICEID_TYPE deviceId, const wstring & name, size_t rows) :
    Base(deviceId, name, rows, true)
{ }
SparseInputValue(DEVICEID_TYPE deviceId, const wstring & name, const TensorShape & imageLayout, size_t numImages) :
    Base(deviceId, name, imageLayout, numImages, true)
SparseInputValue(DEVICEID_TYPE deviceId, const wstring & name, const TensorShape & imageLayout) :
    Base(deviceId, name, imageLayout, true)
{ }
SparseInputValue(const ScriptableObjects::IConfigRecordPtr configp) :
    Base(configp, true)

@@ -6,10 +6,10 @@
#pragma once

#include "Basics.h"
#include "Matrix.h"
#include "TensorView.h"
#include "ComputationNode.h"
#include "ConvolutionalNodes.h"
#include "Matrix.h"
#include "TensorView.h"

#include <unordered_set>
#include <map>

@@ -44,7 +44,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
virtual void /*ComputationNode::*/BackpropTo(const size_t inputIndex, const FrameRange & fr) override
{
#ifdef ENABLE_TENSORVIEW
    // BUGBUG: This gives us a huge perf hit for Image/QuickE2E.
    static int c = 0; if (c++ == 0) { fprintf(stderr, "#PLUSBP#\n"); }
    size_t rank = DetermineElementwiseTensorRank();
    auto gradient = GradientTensorFor(rank, fr);
    auto inputGradient = Input(inputIndex)->GradientTensorFor(rank, fr.AllowBroadcast());

@@ -53,7 +53,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    if (Input(inputIndex)->GetNumCols() < GetNumCols())
        MaskMissingGradientColumnsToZero(fr);

    inputGradient.DoSumOf(0.0f, inputGradient, gradient, 1.0f);
    inputGradient.AddCopyOf(gradient);
#else
    Matrix<ElemType> gradientValues = GradientFor(fr);
    Matrix<ElemType> functionValues = ValueFor(fr);

@@ -124,11 +124,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
virtual void /*ComputationNode::*/ForwardProp(const FrameRange & fr) override
{
#ifdef ENABLE_TENSORVIEW
    static int c = 0; if (c++ == 0) { fprintf(stderr, "#PLUS#\n"); }
    size_t rank = DetermineElementwiseTensorRank();
    auto result = ValueTensorFor(rank, fr);
    auto input0 = Input(0)->ValueTensorFor(rank, fr.AllowBroadcast());
    auto input1 = Input(1)->ValueTensorFor(rank, fr.AllowBroadcast());
    result.DoSumOf(0.0f, input0, input1, 1.0f);
    result.AssignSumOf(input0, input1);
#else
    Matrix<ElemType> functionValues = ValueFor(fr);
    Matrix<ElemType> inputFunctionValues0 = Input(0)->ValueFor(fr.AllowBroadcast());

@@ -223,10 +224,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    if (Input(inputIndex)->GetNumCols() < GetNumCols())
        MaskMissingGradientColumnsToZero(fr);

    if (sign > 0)
        inputGradient.DoSumOf(0.0f, inputGradient, gradient, 1.0f);
    else
        inputGradient.DoDifferenceOf(0.0f, inputGradient, gradient, 1.0f);
    inputGradient.AddCopyOf(gradient, sign);
#else
    Matrix<ElemType> gradientValues = GradientFor(fr);

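The AddCopyOf(gradient, sign) rewrite collapses the former sum/difference branch into one accumulate with a signed scale: for y = a + b or y = a - b, each input gradient is just the output gradient times +1 or -1. A hedged scalar sketch of that unification (hypothetical helper mirroring, not reproducing, the tensor-lib call):

// grad_in += sign * g, with sign = +1 for the added operand and -1 for
// the subtracted one; one code path serves both PlusNode and MinusNode.
void AccumulateScaled(double& gradIn, double g, double sign) { gradIn += sign * g; }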
@@ -269,12 +267,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
virtual void /*ComputationNode::*/ForwardProp(const FrameRange & fr) override
{
#ifdef ENABLE_TENSORVIEW
    static int c = 0; if (c++ == 0) { fprintf(stderr,"#MINUS#"); }
    static int c = 0; if (c++ == 0) { fprintf(stderr,"#MINUS#\n"); }
    size_t rank = DetermineElementwiseTensorRank();
    auto result = ValueTensorFor(rank, fr);
    auto input0 = Input(0)->ValueTensorFor(rank, fr.AllowBroadcast());
    auto input1 = Input(1)->ValueTensorFor(rank, fr.AllowBroadcast());
    result.DoDifferenceOf(0.0f, input0, input1, 1.0f);
    result.AssignDifferenceOf(input0, input1);
#else
    Matrix<ElemType> functionValues = ValueFor(fr);
    Matrix<ElemType> inputFunctionValues0 = Input(0)->ValueFor(fr.AllowBroadcast());

@@ -307,91 +305,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template class MinusNode<float>;
template class MinusNode<double>;

#if 1// change once we no longer see a perf hit to #ifndef ENABLE_TENSORVIEW
// -----------------------------------------------------------------------
// ScaleNode (scalar scaling factor, matrix)
//
// Identical to ElementTimesNode with tensor lib (broadcasting). Can be removed.
// -----------------------------------------------------------------------

template<class ElemType>
class ScaleNode : public ComputationNode<ElemType>, public NumInputs<2>
{
    typedef ComputationNode<ElemType> Base; UsingComputationNodeMembersBoilerplate;
    static const std::wstring TypeName() { return L"Scale"; }
public:
    DeclareConstructorFromConfigWithNumInputs(ScaleNode);
    ScaleNode(DEVICEID_TYPE deviceId, const wstring & name) :
        Base(deviceId, name)
    { }

    virtual void /*ComputationNode::*/BackpropTo(const size_t inputIndex, const FrameRange & fr) override
    {
#if 0//def ENABLE_TENSORVIEW // This takes a big perf hit since our reduction uses only a single thread in this case. Needs to be fixed.
        size_t rank = DetermineElementwiseTensorRank();
        auto gradient = GradientTensorFor(rank, fr);
        auto inputGradient = Input(inputIndex)->GradientTensorFor(rank, fr.AllowBroadcast());
        auto otherInputValue = Input(1 - inputIndex)->ValueTensorFor(rank, fr.AllowBroadcast());

        // if reduction then mask the respective input(s) (zero out the gaps)
        if (Input(inputIndex)->GetNumCols() < GetNumCols())
            MaskMissingGradientColumnsToZero(fr);
        if (Input(inputIndex)->GetNumCols() < Input(1 - inputIndex)->GetNumCols())
            Input(1 - inputIndex)->MaskMissingValueColumnsToZero(fr);

        inputGradient.DoElementwiseProductOf(1.0f/*add to*/, gradient, otherInputValue, 1.0f);
#else
        if (inputIndex == 0) // left derivative
        {
            // this is a reduction over frames, so we must mask gaps to zero
            Input(0)->Gradient() += Matrix<ElemType>::InnerProductOfMatrices(MaskedGradientFor(fr), Input(1)->MaskedValueFor(fr)); // element-wise product summed up over all
        }
        else if (inputIndex == 1) // right derivative
        {
            Matrix<ElemType> sliceInput1Grad = Input(1)->GradientFor(fr);
            Matrix<ElemType>::Multiply1x1AndWeightedAdd(+1.0f, Input(0)->Value()/*1x1*/, GradientFor(fr), 1.0f, sliceInput1Grad);
        }
#endif
    }

    virtual bool OutputUsedInComputingInputNodesGradients() const override
    {
        // The ScaleNode does not require its output value for computing
        // the gradients of its input nodes
        return false;
    }

    virtual void /*ComputationNode::*/ForwardProp(const FrameRange & fr) override
    {
#ifdef ENABLE_TENSORVIEW
        static int c = 0; if (c++ == 0) { fprintf(stderr, "#SCALE#"); }
        size_t rank = DetermineElementwiseTensorRank();
        auto result = ValueTensorFor(rank, fr);
        auto input0 = Input(0)->ValueTensorFor(rank, fr.AllowBroadcast());
        auto input1 = Input(1)->ValueTensorFor(rank, fr.AllowBroadcast());
        result.DoElementwiseProductOf(0.0f, input0, input1, 1.0f);
#else
        ValueFor(fr).Assign1x1ProductOf(Input(0)->Value()/*1x1*/, Input(1)->ValueFor(fr));
#endif
    }

    virtual void /*ComputationNodeBase::*/Validate(bool isFinalValidationPass) override
    {
        Base::Validate(isFinalValidationPass);
        InferMBLayoutFromInputsForStandardCase();

        // left node must be a scalar
        if (isFinalValidationPass && (Input(0)->GetNumRows() != 1 || Input(0)->GetNumCols() != 1))
            RuntimeError("The left value of ScaleNode must be a scalar value.");

        SetDims(Input(1));
    }
};

template class ScaleNode<float>;
template class ScaleNode<double>;
#endif

// -----------------------------------------------------------------------
// NegateNode (input)
// computes the negative of its input

@ -707,7 +620,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
if (Input(inputIndex)->GetNumCols() < Input(1 - inputIndex)->GetNumCols())
|
||||
Input(1 - inputIndex)->MaskMissingValueColumnsToZero(fr);
|
||||
|
||||
inputGradient.DoElementwiseProductOf(1.0f/*add to*/, gradient, otherInputValue, 1.0f);
|
||||
inputGradient.AddElementwiseProductOf(gradient, otherInputValue);
|
||||
#else
|
||||
Matrix<ElemType> sliceInput0Grad = Input(inputIndex)->GradientFor(fr);
|
||||
Matrix<ElemType> sliceOutputGrad = GradientFor(fr);
|
||||
|
@ -725,12 +638,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
virtual void /*ComputationNode::*/ForwardProp(const FrameRange & fr) override
|
||||
{
|
||||
#ifdef ENABLE_TENSORVIEW
|
||||
static int c = 0; if (c++ == 0) { fprintf(stderr,"#ETIMES#"); }
|
||||
static int c = 0; if (c++ == 0) { fprintf(stderr,"#ETIMES#\n"); }
|
||||
size_t rank = DetermineElementwiseTensorRank();
|
||||
auto result = ValueTensorFor(rank, fr);
|
||||
auto input0 = Input(0)->ValueTensorFor(rank, fr.AllowBroadcast());
|
||||
auto input1 = Input(1)->ValueTensorFor(rank, fr.AllowBroadcast());
|
||||
result.DoElementwiseProductOf(0.0f, input0, input1, 1.0f);
|
||||
result.AssignElementwiseProductOf(input0, input1);
|
||||
#else
|
||||
Matrix<ElemType> sliceInput0Value = Input(0)->ValueFor(fr);
|
||||
Matrix<ElemType> sliceInput1Value = Input(1)->ValueFor(fr);
|
||||
|
@ -745,303 +658,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
template class ElementTimesNode<float>;
|
||||
template class ElementTimesNode<double>;

#if 1 // change once we no longer see a perf hit to #ifndef ENABLE_TENSORVIEW
// -----------------------------------------------------------------------
// RowElementTimesNode (left, right) --TODO: what are left and right?
//
// TODO: This is subsumed by ElementTimes with tensor lib.
// -----------------------------------------------------------------------

template<class ElemType>
class RowElementTimesNode : public ComputationNode<ElemType>, public NumInputs<2>
{
typedef ComputationNode<ElemType> Base; UsingComputationNodeMembersBoilerplate;
static const std::wstring TypeName() { return L"RowElementTimes"; }
public:
DeclareConstructorFromConfigWithNumInputs(RowElementTimesNode);
RowElementTimesNode(DEVICEID_TYPE deviceId, const wstring & name) :
Base(deviceId, name)
{ }

void BackpropToMap(const size_t inputIndex)
{
if (inputIndex > 1)
InvalidArgument("RowElementTimes operation only takes two inputs.");

if (inputIndex == 0)
{
BackpropToLeftS(Input(1)->Value(), Input(0)->Gradient(), Gradient(), *m_tempMatrix);
}
else
{
BackpropToRightS(Input(0)->Value(), Input(1)->Gradient(), Gradient(), *m_tempMatrix);
}
}

virtual void /*ComputationNode::*/BackpropTo(const size_t inputIndex, const FrameRange & fr) override
{
if (fr.IsAllFrames()) { BackpropToMap(inputIndex); return; } // TODO: remove these one by one
Matrix<ElemType> sliceInput0Grad = Input(inputIndex)->GradientFor(fr);
Matrix<ElemType> sliceOutputGrad = GradientFor(fr);

Matrix<ElemType> sliceInput1Value = Input(1 - inputIndex)->ValueFor(fr);

if (inputIndex == 0)
{
BackpropToLeftS(sliceInput1Value, sliceInput0Grad, sliceOutputGrad, *m_tempMatrix);
}
else
{
BackpropToRightS(sliceInput1Value, sliceInput0Grad, sliceOutputGrad, *m_tempMatrix);
}
}

virtual bool OutputUsedInComputingInputNodesGradients() const override
{
// The RowElementTimesNode does not require its output value for computing
// the gradients of its input nodes
return false;
}

//left (input 0) is a matrix
/*TODO: merge with call site*/void BackpropToLeftS(Matrix<ElemType>& input1FunctionValues,
Matrix<ElemType>& input0GradientValues,
const Matrix<ElemType>& gradientValues,
Matrix<ElemType>& tempMatrix)
{
tempMatrix.SetValue(gradientValues);
tempMatrix.RowElementMultiplyWith(input1FunctionValues);
input0GradientValues += tempMatrix;

#if NANCHECK
input0GradientValues.HasNan("RowElementTimes");
#endif
}

//right (input 1) is a row vector
/*TODO: merge with call site*/void BackpropToRightS(Matrix<ElemType>& input0FunctionValues,
Matrix<ElemType>& input1GradientValues,
const Matrix<ElemType>& gradientValues,
Matrix<ElemType>& tempMatrix)
{
tempMatrix.AssignInnerProductOf(gradientValues, input0FunctionValues, true);
input1GradientValues += tempMatrix;

#if NANCHECK
input1GradientValues.HasNan("RowElementTimes");
#endif
}
void ForwardPropMap() // TODO: This is a stop-gap; in most cases, we should just be able to delete this (but need to review one by one)
{
ForwardPropS(Value(), Input(0)->Value(), Input(1)->Value());
}

virtual void /*ComputationNode::*/ForwardProp(const FrameRange & fr) override
{
//if (fr.IsAllFrames()) { ForwardPropMap(); return; }
Matrix<ElemType> sliceInput0Value = Input(0)->ValueFor(fr);
Matrix<ElemType> sliceInput1Value = Input(1)->ValueFor(fr);
Matrix<ElemType> sliceOutputValue = ValueFor(fr);

ForwardPropS(sliceOutputValue, sliceInput0Value, sliceInput1Value);
}

/*TODO: merge with call site*/void ForwardPropS(Matrix<ElemType>& functionValues, const Matrix<ElemType>& input0, const Matrix<ElemType>& input1)
{
functionValues.SetValue(input0);
functionValues.RowElementMultiplyWith(input1);

#if NANCHECK
functionValues.HasNan("RowElementTimes");
#endif
}

virtual void /*ComputationNodeBase::*/Validate(bool isFinalValidationPass) override
{
Base::Validate(isFinalValidationPass);
InferMBLayoutFromInputsForStandardCase();

size_t rows0 = Input(0)->GetNumRows(), cols0 = Input(0)->GetNumCols();
size_t rows1 = Input(1)->GetNumRows(), cols1 = Input(1)->GetNumCols(); rows0;
if (isFinalValidationPass && (cols0 != cols1 || rows1 != 1))
LogicError("RowElementTimes: Either the second operand is not a row vector or the number of columns of operands does not match.");

SetDims(Input(0));
}

//request matrices that are needed for gradient computation
virtual void RequestMatricesBeforeBackprop(MatrixPool& matrixPool)
{
Base::RequestMatricesBeforeBackprop(matrixPool);
RequestMatrixFromPool(m_tempMatrix, matrixPool);
}

//release gradient and temp matrices that are no longer needed after all the children's gradients are computed.
virtual void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool)
{
Base::ReleaseMatricesAfterBackprop(matrixPool);
ReleaseMatrixToPool(m_tempMatrix, matrixPool);
}

private:
shared_ptr<Matrix<ElemType>> m_tempMatrix;
};

template class RowElementTimesNode<float>;
template class RowElementTimesNode<double>;
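The row/column broadcast semantics above are easy to get backwards, so here is a tiny standalone illustration (plain C++, illustrative names only, not CNTK code) of what ForwardPropS and BackpropToRightS compute for a 2x3 matrix and a 1x3 row vector:

    #include <cstdio>

    // Hypothetical stand-alone sketch of RowElementTimes semantics:
    // output(i,j) = input0(i,j) * input1(0,j), i.e. column j is scaled by input1(0,j).
    int main()
    {
        const int R = 2, C = 3;
        double input0[R][C] = { { 1, 2, 3 }, { 4, 5, 6 } };
        double input1[C]    = { 10, 100, 1000 };     // the 1 x C row vector

        // forward: ForwardPropS = SetValue + RowElementMultiplyWith
        double output[R][C];
        for (int i = 0; i < R; i++)
            for (int j = 0; j < C; j++)
                output[i][j] = input0[i][j] * input1[j];

        // backward w.r.t. the row vector: BackpropToRightS reduces over rows,
        // grad1(0,j) += sum_i gradOut(i,j) * input0(i,j) (an inner product per column)
        double gradOut[R][C] = { { 1, 1, 1 }, { 1, 1, 1 } };
        double grad1[C] = { 0, 0, 0 };
        for (int j = 0; j < C; j++)
            for (int i = 0; i < R; i++)
                grad1[j] += gradOut[i][j] * input0[i][j];

        printf("output[1][2] = %g (expected 6000)\n", output[1][2]);
        printf("grad1[0] = %g (expected 1 + 4 = 5)\n", grad1[0]);
    }

ColumnElementTimes below is the transposed counterpart: the second operand is a column vector, row i is scaled by input1(i,0), and the vector gradient reduces over columns instead of rows.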

// -----------------------------------------------------------------------
// ColumnElementTimesNode (left, right) --TODO: what are left and right?
//
// TODO: This is subsumed by ElementTimes with tensor lib.
// -----------------------------------------------------------------------

template<class ElemType>
class ColumnElementTimesNode : public ComputationNode<ElemType>, public NumInputs<2>
{
typedef ComputationNode<ElemType> Base; UsingComputationNodeMembersBoilerplate;
static const std::wstring TypeName() { return L"ColumnElementTimes"; }
public:
DeclareConstructorFromConfigWithNumInputs(ColumnElementTimesNode);
ColumnElementTimesNode(DEVICEID_TYPE deviceId, const wstring & name) :
Base(deviceId, name)
{ }

void BackpropToMap(const size_t inputIndex)
{
if (inputIndex > 1)
InvalidArgument("ColumnElementTimes operation only takes two inputs.");

if (inputIndex == 0)
{
BackpropToLeftS(Input(1)->Value(), Input(0)->Gradient(), Gradient(), *m_tempMatrix);
}
else
{
BackpropToRightS(Input(0)->Value(), Input(1)->Gradient(), Gradient(), *m_tempMatrix);
}
}

virtual void /*ComputationNode::*/BackpropTo(const size_t inputIndex, const FrameRange & fr) override
{
if (fr.IsAllFrames()) { BackpropToMap(inputIndex); return; } // TODO: remove these one by one
Matrix<ElemType> sliceOutputGrad = GradientFor(fr);

if (inputIndex == 0)
{
Matrix<ElemType> sliceInput0Grad = Input(0)->GradientFor(fr);

BackpropToLeftS(Input(1)->Value(), sliceInput0Grad, sliceOutputGrad, *m_tempMatrix);
}
else
{
Matrix<ElemType> sliceInput0Value = Input(0)->ValueFor(fr);
BackpropToRightS(sliceInput0Value, Input(1)->Gradient(), sliceOutputGrad, *m_tempMatrix);
}
}

virtual bool OutputUsedInComputingInputNodesGradients() const override
{
// The ColumnElementTimesNode does not require its output value for computing
// the gradients of its input nodes
return false;
}

//left (input 0) is a matrix
/*TODO: merge with call site*/void BackpropToLeftS(Matrix<ElemType>& input1FunctionValues,
Matrix<ElemType>& input0GradientValues,
const Matrix<ElemType>& gradientValues,
Matrix<ElemType>& tempMatrix)
{
tempMatrix.SetValue(gradientValues);
tempMatrix.ColumnElementMultiplyWith(input1FunctionValues);
input0GradientValues += tempMatrix;

#if NANCHECK
input0GradientValues.HasNan("ColumnElementTimes");
#endif
}

//right (input 1) is a col vector
/*TODO: merge with call site*/void BackpropToRightS(Matrix<ElemType>& input0FunctionValues,
Matrix<ElemType>& input1GradientValues,
const Matrix<ElemType>& gradientValues,
Matrix<ElemType>& tempMatrix)
{
tempMatrix.AssignInnerProductOf(gradientValues, input0FunctionValues, false);
input1GradientValues += tempMatrix;

#if NANCHECK
input1GradientValues.HasNan("ColumnElementTimes");
#endif
}
void ForwardPropMap() // TODO: This is a stop-gap; in most cases, we should just be able to delete this (but need to review one by one)
{
ForwardPropS(Value(), Input(0)->Value(), Input(1)->Value());
}

virtual void /*ComputationNode::*/ForwardProp(const FrameRange & fr) override
{
//if (fr.IsAllFrames()) { ForwardPropMap(); return; }
Matrix<ElemType> sliceInput0Value = Input(0)->ValueFor(fr);
Matrix<ElemType> sliceOutputValue = ValueFor(fr);

ForwardPropS(sliceOutputValue, sliceInput0Value, Input(1)->Value());
}

/*TODO: merge with call site*/void ForwardPropS(Matrix<ElemType>& functionValues, const Matrix<ElemType>& input0, const Matrix<ElemType>& input1)
{
functionValues.SetValue(input0);
functionValues.ColumnElementMultiplyWith(input1);

#if NANCHECK
functionValues.HasNan("ColumnElementTimes");
#endif
}

virtual void /*ComputationNodeBase::*/Validate(bool isFinalValidationPass) override
{
Base::Validate(isFinalValidationPass);
InferMBLayoutFromInputsForStandardCase();

//derive number of rows if possible
for (size_t index = 0; index < 2; index++)
{
size_t rows = Input(index)->GetNumRows() == 0 ? Input(1 - index)->GetNumRows() : Input(index)->GetNumRows();
size_t cols = Input(index)->GetNumCols() == 0 ? Input(1 - index)->GetNumCols() : Input(index)->GetNumCols();
ValidateInferInputDims(index, rows, cols);
}

size_t rows0 = Input(0)->GetNumRows(), cols0 = Input(0)->GetNumCols();
size_t rows1 = Input(1)->GetNumRows(), cols1 = Input(1)->GetNumCols(); cols0;
if (isFinalValidationPass && (rows0 != rows1 || cols1 != 1))
LogicError("ColumnElementTimes: Either the second operand is not a column vector or the number of rows of operands does not match.");

SetDims(Input(0));
}

//request matrices that are needed for gradient computation
virtual void RequestMatricesBeforeBackprop(MatrixPool& matrixPool)
{
Base::RequestMatricesBeforeBackprop(matrixPool);
RequestMatrixFromPool(m_tempMatrix, matrixPool);
}

//release gradient and temp matrices that are no longer needed after all the children's gradients are computed.
virtual void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool)
{
Base::ReleaseMatricesAfterBackprop(matrixPool);
ReleaseMatrixToPool(m_tempMatrix, matrixPool);
}

private:
shared_ptr<Matrix<ElemType>> m_tempMatrix;
};

template class ColumnElementTimesNode<float>;
template class ColumnElementTimesNode<double>;
#endif

// -----------------------------------------------------------------------
// DiagTimesNode (vector representing the diagonal of a square matrix, data)
// -----------------------------------------------------------------------

@@ -1195,7 +811,6 @@ private:
{
Base::Validate(isFinalValidationPass);
m_pMBLayout = nullptr; // this node does not hold mini-batch data

SetDims(TensorShape(1), 1);
}
};

@@ -1207,6 +822,7 @@ private:
// SumColumnElementsNode (input)
// sums up each column of the input
// TODO: This should be deprecated, in favor of a reduce node.
// TODO: Implement this with the tensor library.
// -----------------------------------------------------------------------

template<class ElemType>

@@ -5,6 +5,11 @@
//
#pragma once

#include "Basics.h"
#include "ComputationNode.h"
#include "Matrix.h"
#include "TensorView.h"

#include <unordered_set>
#include <map>
#include <string>

@@ -18,27 +23,111 @@
#include <sstream>
#include <iostream>

#include "Basics.h"
#include "Matrix.h"
#include "ComputationNode.h"

namespace Microsoft { namespace MSR { namespace CNTK {

#ifdef ENABLE_TENSORVIEW

// -----------------------------------------------------------------------
// NonlinearityNodeBase (input) -- abstract base class that holds what's shared
// between non-linearity nodes like Sigmoid
// UnaryElementWiseWithOpCodeNodeBase (input) -- base for elementwise unary op
// where forward and backward are single ElementWiseOperator opcodes and
// only inputs (but not function values) are used.
// -----------------------------------------------------------------------

template<class ElemType, ElementWiseOperator opForward, ElementWiseOperator opBackward, bool gradientFromOutput>
class UnaryElementWiseWithOpCodeNodeBase : public ComputationNode<ElemType>, public NumInputs<1>
{
typedef ComputationNode<ElemType> Base; UsingComputationNodeMembers;
public:
UnaryElementWiseWithOpCodeNodeBase(DEVICEID_TYPE deviceId, const wstring & name) :
Base(deviceId, name)
{ }

virtual void /*ComputationNode::*/ForwardProp(const FrameRange & fr) override
{
static int c = 0; if (c++ == 0) { fprintf(stderr, "#NLop%d#\n", (int)opForward); }

size_t rank = DetermineElementwiseTensorRank();
auto result = ValueTensorFor(rank, fr);
auto input = Input(0)->ValueTensorFor(rank, fr);
result.DoUnaryOpOf(0, input, 1, opForward);
}

virtual void /*ComputationNode::*/BackpropTo(const size_t inputIndex, const FrameRange & fr) override
{
assert(inputIndex == 0); inputIndex;

// get the args
size_t rank = DetermineElementwiseTensorRank();
auto sliceOutputGrad = GradientTensorFor(rank, fr); // propagate from this one...
auto sliceInputGrad = Input(0)->GradientTensorFor(rank, fr); // ...to this one
auto sliceValue = gradientFromOutput ? ValueTensorFor(rank, fr) : // using input or output value
Input(0)->ValueTensorFor(rank, fr);
// If the gradient can be computed from the output rather than the input, then that's better for mem sharing (and faster in most cases).
// Not possible for Cos().
sliceInputGrad.DoBinaryOpOf(1, sliceOutputGrad, sliceValue, 1, opBackward);
}

virtual void /*ComputationNodeBase::*/Validate(bool isFinalValidationPass) override
{
ValidateUnaryMap(isFinalValidationPass);
}

// We don't need our output values in backprop.
virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; }
};

#define UnaryElementWiseWithOpCodeNodeBaseMembers UsingComputationNodeMembersBoilerplate;

// -----------------------------------------------------------------------
// SigmoidNode (input)
// TanhNode (input)
// RectifiedLinearNode (input)
// LogNode (input)
// ExpNode (input)
// CosineNode (input)
// These are all implemented by single-opcode functions and can thus be declared by a macro.
// -----------------------------------------------------------------------

#pragma push_macro("DeclareUnaryTensorOp")
#define DeclareUnaryElementWiseWithOpCodeNode(Name, Forward, Backward, gradientFromOutput) \
template<class ElemType> \
class Name ## Node : public UnaryElementWiseWithOpCodeNodeBase<ElemType, op ## Forward, op ## Backward, gradientFromOutput> \
{ \
typedef UnaryElementWiseWithOpCodeNodeBase<ElemType, op ## Forward, op ## Backward, gradientFromOutput> Base; UnaryElementWiseWithOpCodeNodeBaseMembers; \
static const std::wstring TypeName() { return L ## #Name; } \
public: \
DeclareConstructorFromConfigWithNumInputs(Name ## Node); \
Name ## Node(DEVICEID_TYPE deviceId, const wstring & Name) : \
Base(deviceId, Name) \
{ } \
}

// Name Forward and Backward opcodes
DeclareUnaryElementWiseWithOpCodeNode(Sigmoid, Sigmoid, ElementwiseProductWithSigmoidDerivativeFromOutput, true);
DeclareUnaryElementWiseWithOpCodeNode(Tanh, Tanh, ElementwiseProductWithTanhDerivativeFromOutput, true);
DeclareUnaryElementWiseWithOpCodeNode(RectifiedLinear, LinearRectifier, ElementwiseProductWithLinearRectifierDerivativeFromOutput, true);
DeclareUnaryElementWiseWithOpCodeNode(Log, Log, ElementwiseProductWithLogDerivativeFromOutput, true);
DeclareUnaryElementWiseWithOpCodeNode(Exp, Exp, ElementwiseProduct, true);
DeclareUnaryElementWiseWithOpCodeNode(Cosine, Cosine, ElementwiseProductWithCosDerivative, false);

#pragma pop_macro("DeclareUnaryTensorOp")
#endif
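A note on the gradientFromOutput flag in the declarations above: for most of these ops the derivative can be written in terms of the output y = f(x) alone, which lets the backward pass reuse the (possibly memory-shared) output tensor instead of keeping the input alive. A standalone sketch (plain C++ with illustrative names, not the opcode implementations themselves):

    #include <cmath>
    #include <cstdio>

    // Derivatives written in terms of the *output* y = f(x), as assumed by
    // gradientFromOutput = true above:
    double dSigmoid_fromOutput(double y) { return y * (1 - y); }        // f' = y(1-y)
    double dTanh_fromOutput(double y)    { return 1 - y * y; }          // f' = 1-y^2
    double dRelu_fromOutput(double y)    { return y > 0 ? 1.0 : 0.0; }  // the sign survives f
    double dExp_fromOutput(double y)     { return y; }                  // exp is its own derivative
    double dLog_fromOutput(double y)     { return std::exp(-y); }       // 1/x = exp(-log x)

    // Cosine is the odd one out: cos'(x) = -sin(x) cannot be recovered from
    // cos(x) alone (the sign of sin(x) is lost), hence gradientFromOutput = false.
    double dCos_fromInput(double x)      { return -std::sin(x); }

    int main()
    {
        double x = 0.3, eps = 1e-6;
        double y = 1 / (1 + std::exp(-x));                        // sigmoid forward
        double num = (1 / (1 + std::exp(-(x + eps))) - y) / eps;  // numeric check
        printf("sigmoid': analytic %g vs numeric %g\n", dSigmoid_fromOutput(y), num);
    }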

// -----------------------------------------------------------------------
// SoftmaxNodeBase (input) -- shared base of Softmax and LogSoftmax
// -----------------------------------------------------------------------

// shared base for all element-wise non-linearities
// What this adds over a ComputationNode<ElemType> is a member m_gradientTemp for temp use by derived classes.
// TODO: This was used more broadly, but no longer, so we may be able to simplify the signatures of the virtual functions.
template<class ElemType>
class NonlinearityNodeBase : public ComputationNode<ElemType>, public NumInputs<1>
class SoftmaxNodeBase : public ComputationNode<ElemType>, public NumInputs<1>
{
typedef ComputationNode<ElemType> Base; UsingComputationNodeMembers;
public:
//virtual ComputationNodeBase * NewThis(DEVICEID_TYPE deviceId, const wstring & name) = 0;
DeclareConstructorFromConfigWithNumInputs(NonlinearityNodeBase);
NonlinearityNodeBase(DEVICEID_TYPE deviceId, const wstring & name) :
DeclareConstructorFromConfigWithNumInputs(SoftmaxNodeBase);
SoftmaxNodeBase(DEVICEID_TYPE deviceId, const wstring & name) :
Base(deviceId, name)
{ }

@@ -54,7 +143,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
auto sliceOutputValue = OutputUsedInComputingInputNodesGradients() ? ValueFor(fr) : Matrix<ElemType>();

// do the actual operation
// TODO: Once all is unified then make the order of arguments more logical (in -> out)
BackpropToV(*m_gradientTemp, sliceInputValue, sliceInputGrad, sliceOutputGrad, sliceOutputValue);
}

@@ -80,7 +168,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Base::CopyTo(nodeP, newName, flags);
if (flags & CopyNodeFlags::copyNodeValue)
{
auto node = dynamic_pointer_cast<NonlinearityNodeBase<ElemType>>(nodeP);
auto node = dynamic_pointer_cast<SoftmaxNodeBase<ElemType>>(nodeP);
*node->m_gradientTemp = *m_gradientTemp;
}
}

@@ -102,296 +190,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
shared_ptr<Matrix<ElemType>> m_gradientTemp;
};

#define UsingNonlinearityNodeBaseMembers UsingComputationNodeMembersBoilerplate; using Base::m_gradientTemp

// -----------------------------------------------------------------------
// RectifiedLinearNode (input) -- ReLU non-linearity
// -----------------------------------------------------------------------

template<class ElemType>
class RectifiedLinearNode : public NonlinearityNodeBase<ElemType>
{
typedef NonlinearityNodeBase<ElemType> Base; UsingNonlinearityNodeBaseMembers;
static const std::wstring TypeName() { return L"RectifiedLinear"; }
public:
DeclareConstructorFromConfigWithNumInputs(RectifiedLinearNode);
RectifiedLinearNode(DEVICEID_TYPE deviceId, const wstring & name) :
NonlinearityNodeBase<ElemType>(deviceId, name)
{ }

void BackpropToV(Matrix<ElemType>& gradient, const Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues, const Matrix<ElemType>& functionValues) override
{
gradient.AssignLinearRectifierDerivativeOf(inputFunctionValues);
#if DUMPOUTPUT
inputGradientValues.Print("RectifiedLinearNode-Partial-in");
#endif
inputGradientValues.AddElementProductOf(gradientValues, gradient);
#if DUMPOUTPUT
inputGradientValues.Print("RectifiedLinearNode-Partial-out");
#endif
}

virtual bool OutputUsedInComputingInputNodesGradients() const override
{
// The ReLU node does not require its output value for computing
// the gradients of its input nodes
return false;
}

void ForwardPropV(Matrix<ElemType>& functionValues, const Matrix<ElemType>& inputFunctionValues) override
{
functionValues.AssignTruncateBottomOf(inputFunctionValues, 0);
#if DUMPOUTPUT
functionValues.Print("RectifiedLinearNode");
#endif
}
};

template class RectifiedLinearNode<float>;
template class RectifiedLinearNode<double>;

// -----------------------------------------------------------------------
// SigmoidNode (input) -- sigmoid non-linearity
// -----------------------------------------------------------------------

template<class ElemType>
class SigmoidNode : public NonlinearityNodeBase<ElemType>
{
typedef NonlinearityNodeBase<ElemType> Base; UsingNonlinearityNodeBaseMembers;
static const std::wstring TypeName() { return L"Sigmoid"; }
public:
DeclareConstructorFromConfigWithNumInputs(SigmoidNode);
SigmoidNode(DEVICEID_TYPE deviceId, const wstring & name) :
NonlinearityNodeBase<ElemType>(deviceId, name)
{ }

#ifdef ENABLE_TENSORVIEW
// TODO: Once tensor lib works, we will change all nodes in here to use it. Then move ForwardProp() and BackpropTo() from here into base.
virtual void /*ComputationNode::*/ForwardProp(const FrameRange & fr) override
{
size_t rank = DetermineElementwiseTensorRank();
auto result = ValueTensorFor(rank, fr);
auto input = Input(0)->ValueTensorFor(rank, fr);
ForwardPropV(input, result);
}

/*virtual*/ void ForwardPropV(const TensorView<ElemType>& input, TensorView<ElemType>& result) //override
{
result.AssignSigmoidOf(input);
}

virtual void /*IComputationNode::*/BeginBackprop() override // called before first iteration step of ComputeGradient()
{
m_gradientTemp->Resize(GetNumRows(), GetNumCols());
}

virtual void /*ComputationNode::*/BackpropTo(const size_t inputIndex, const FrameRange & fr) override
{
assert(inputIndex == 0); inputIndex;

// get the args
// Some do not consume input and/or output values. Don't touch those, pass dummies instead, since memshare may have taken them away already.
size_t rank = DetermineElementwiseTensorRank();
auto sliceOutputGrad = GradientTensorFor(rank, fr); // propagate from this one...
auto sliceInputGrad = Input(0)->GradientTensorFor(rank, fr); // ...to this one
auto sliceInputValue = InputUsedInComputingInputNodesGradients(0) ? Input(0)->ValueTensorFor(rank, fr) : TensorView<ElemType>();
auto sliceOutputValue = OutputUsedInComputingInputNodesGradients() ? ValueTensorFor(rank, fr) : TensorView<ElemType>();

// do the actual operation
// TODO: Once all is unified then make the order of arguments more logical (in -> out)
BackpropToV(DataTensorFor(*m_gradientTemp, rank, fr), sliceInputValue, sliceInputGrad, sliceOutputGrad, sliceOutputValue);
}

/*virtual*/ void BackpropToV(TensorView<ElemType> gradient, const TensorView<ElemType>& inputFunctionValues, TensorView<ElemType> inputGradientValues, const TensorView<ElemType>& gradientValues, const TensorView<ElemType>& functionValues)
{
gradient.AssignSigmoidDerivativeOf(inputFunctionValues);
inputGradientValues.AddElementwiseProductOf(gradientValues, gradient);
}

virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; }
#else
virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override
{
// The Sigmoid node does not require any of its inputs' values for computing
// the gradients of its input nodes
UNREFERENCED_PARAMETER(childIndex);
return false;
}
#endif

/*virtual*/ void BackpropToV(Matrix<ElemType>& gradient, const Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues, const Matrix<ElemType>& functionValues)
{
gradient.AssignSigmoidDerivativeOf(functionValues);
inputGradientValues.AddElementProductOf(gradientValues, gradient);
}

/*virtual*/ void ForwardPropV(Matrix<ElemType>& functionValues, const Matrix<ElemType>& inputFunctionValues) override
{
functionValues.AssignSigmoidOf(inputFunctionValues);
}
};

template class SigmoidNode<float>;
template class SigmoidNode<double>;

// -----------------------------------------------------------------------
// TanhNode (input) -- tanh non-linearity
// -----------------------------------------------------------------------

template<class ElemType>
class TanhNode : public NonlinearityNodeBase<ElemType>
{
typedef NonlinearityNodeBase<ElemType> Base; UsingNonlinearityNodeBaseMembers;
static const std::wstring TypeName() { return L"Tanh"; }
public:
DeclareConstructorFromConfigWithNumInputs(TanhNode);
TanhNode(DEVICEID_TYPE deviceId, const wstring & name) :
NonlinearityNodeBase<ElemType>(deviceId, name)
{ }

virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override
{
// The Tanh node does not require any of its inputs' values for computing
// the gradients of its input nodes
UNREFERENCED_PARAMETER(childIndex);
return false;
}

/*virtual*/ void BackpropToV(Matrix<ElemType>& gradient, const Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues, const Matrix<ElemType>& functionValues)
{
gradient.AssignElementProductOf(functionValues, functionValues); // v .* v
gradient.AssignDifferenceOf(1, gradient); // 1-v^2

inputGradientValues.AddElementProductOf(gradientValues, gradient); // += d .* (1 - v .* v)
}

/*virtual*/ void ForwardPropV(Matrix<ElemType>& functionValues, const Matrix<ElemType>& inputFunctionValues) override
{
functionValues.AssignTanhOf(inputFunctionValues);
}
};

template class TanhNode<float>;
template class TanhNode<double>;

// -----------------------------------------------------------------------
// LogNode (input) -- component-wise log() of input
// -----------------------------------------------------------------------

template<class ElemType>
class LogNode : public NonlinearityNodeBase<ElemType>
{
typedef NonlinearityNodeBase<ElemType> Base; UsingNonlinearityNodeBaseMembers;
static const std::wstring TypeName() { return L"Log"; }
public:
DeclareConstructorFromConfigWithNumInputs(LogNode);
LogNode(DEVICEID_TYPE deviceId, const wstring & name) :
NonlinearityNodeBase<ElemType>(deviceId, name)
{ }

virtual bool OutputUsedInComputingInputNodesGradients() const override
{
// The Log node does not require its output value for computing
// the gradients of its input nodes
return false;
}

/*virtual*/ void BackpropToV(Matrix<ElemType>& gradient, const Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues, const Matrix<ElemType>& functionValues)
{
gradient.AssignElementInverseOf(inputFunctionValues); // 1/x (x is input to log(x))
inputGradientValues.AddElementProductOf(gradientValues, gradient);
}

/*virtual*/ void ForwardPropV(Matrix<ElemType>& functionValues, const Matrix<ElemType>& inputFunctionValues) override
{
functionValues.AssignLogOf(inputFunctionValues);
}
};

template class LogNode<float>;
template class LogNode<double>;

// -----------------------------------------------------------------------
// ExpNode (input) -- component-wise exp() of input
// -----------------------------------------------------------------------

template<class ElemType>
class ExpNode : public NonlinearityNodeBase<ElemType>
{
typedef NonlinearityNodeBase<ElemType> Base; UsingNonlinearityNodeBaseMembers;
static const std::wstring TypeName() { return L"Exp"; }
public:
DeclareConstructorFromConfigWithNumInputs(ExpNode);
ExpNode(DEVICEID_TYPE deviceId, const wstring & name) :
NonlinearityNodeBase<ElemType>(deviceId, name)
{ }

virtual void /*ComputationNode::*/BackpropTo(const size_t inputIndex, const FrameRange & fr) override
{
assert(inputIndex == 0); inputIndex;

Matrix<ElemType> sliceInputGrad = Input(0)->GradientFor(fr);
Matrix<ElemType> sliceOutputGrad = GradientFor(fr);
Matrix<ElemType> sliceInputValue = Input(0)->ValueFor(fr);

m_gradientTemp->AssignExpOf(sliceInputValue); // Exp(x) is its own partial
sliceInputGrad.AddElementProductOf(sliceOutputGrad, *m_gradientTemp);
}

virtual bool OutputUsedInComputingInputNodesGradients() const override
{
// The ExpNode does not require its output value for computing
// the gradients of its input nodes
return false;
}

virtual void BackpropToV(Matrix<ElemType>& gradient, const Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues, const Matrix<ElemType>& functionValues) override { NOT_IMPLEMENTED; } // not needed

void ForwardPropV(Matrix<ElemType>& functionValues, const Matrix<ElemType>& inputFunctionValues) override
{
functionValues.AssignExpOf(inputFunctionValues);
}
};

template class ExpNode<float>;
template class ExpNode<double>;

// -----------------------------------------------------------------------
// CosineNode (input) -- component-wise cos() of input
// -----------------------------------------------------------------------

template<class ElemType>
class CosineNode : public NonlinearityNodeBase<ElemType>
{
typedef NonlinearityNodeBase<ElemType> Base; UsingNonlinearityNodeBaseMembers;
static const std::wstring TypeName() { return L"Cosine"; }
public:
DeclareConstructorFromConfigWithNumInputs(CosineNode);
CosineNode(DEVICEID_TYPE deviceId, const wstring & name) :
NonlinearityNodeBase<ElemType>(deviceId, name)
{ }

virtual bool OutputUsedInComputingInputNodesGradients() const override
{
// The CosineNode does not require its output value for computing
// the gradients of its input nodes
return false;
}

/*virtual*/ void BackpropToV(Matrix<ElemType>& gradient, const Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues, const Matrix<ElemType>& functionValues)
{
gradient.AssignNegativeSineOf(inputFunctionValues); // -sin(x) (x is input to Cosine(x))
inputGradientValues.AddElementProductOf(gradientValues, gradient);
}

/*virtual*/ void ForwardPropV(Matrix<ElemType>& functionValues, const Matrix<ElemType>& inputFunctionValues) override
{
functionValues.AssignCosineOf(inputFunctionValues);
}
};

template class CosineNode<float>;
template class CosineNode<double>;
#define UsingSoftmaxNodeBaseMembers UsingComputationNodeMembersBoilerplate; using Base::m_gradientTemp

// -----------------------------------------------------------------------
// SoftmaxNode (input) -- soft-max over input vector(s)
@@ -400,14 +199,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
//we assume it's column-wise by default
//the derivative will increase the Matrix<ElemType> size to the power of column size and should not be used.
template<class ElemType>
class SoftmaxNode : public NonlinearityNodeBase<ElemType>
class SoftmaxNode : public SoftmaxNodeBase<ElemType>
{
typedef NonlinearityNodeBase<ElemType> Base; UsingNonlinearityNodeBaseMembers;
typedef SoftmaxNodeBase<ElemType> Base; UsingSoftmaxNodeBaseMembers;
static const std::wstring TypeName() { return L"Softmax"; }
public:
DeclareConstructorFromConfigWithNumInputs(SoftmaxNode);
SoftmaxNode(DEVICEID_TYPE deviceId, const wstring & name) :
NonlinearityNodeBase<ElemType>(deviceId, name)
Base(deviceId, name)
{ }

virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override

@@ -467,14 +266,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// -----------------------------------------------------------------------

template<class ElemType>
class LogSoftmaxNode : public NonlinearityNodeBase<ElemType>
class LogSoftmaxNode : public SoftmaxNodeBase<ElemType>
{
typedef NonlinearityNodeBase<ElemType> Base; UsingNonlinearityNodeBaseMembers;
typedef SoftmaxNodeBase<ElemType> Base; UsingSoftmaxNodeBaseMembers;
static const std::wstring TypeName() { return L"LogSoftmax"; }
public:
DeclareConstructorFromConfigWithNumInputs(LogSoftmaxNode);
LogSoftmaxNode(DEVICEID_TYPE deviceId, const wstring & name) :
NonlinearityNodeBase<ElemType>(deviceId, name)
Base(deviceId, name)
{ }

virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override

@@ -1040,9 +839,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// this node is not differentiable and so cannot be used in the backpropagation
// TODO: make function value sparse?
template<class ElemType>
class HardmaxNode : public NonlinearityNodeBase/*ComputationNode*/<ElemType>
class HardmaxNode : public SoftmaxNodeBase/*ComputationNode*/<ElemType>
{
typedef NonlinearityNodeBase<ElemType> Base; UsingNonlinearityNodeBaseMembers;
typedef SoftmaxNodeBase<ElemType> Base; UsingSoftmaxNodeBaseMembers;
static const std::wstring TypeName() { return L"Hardmax"; }

public:

@@ -5,6 +5,11 @@
//
#pragma once

#include "Basics.h"
#include "Matrix.h"
#include "TensorShape.h"
#include "ComputationNode.h"

#include <unordered_set>
#include <map>
#include <string>

@@ -18,10 +23,6 @@
#include <sstream>
#include <iostream>

#include "Basics.h"
#include "Matrix.h"
#include "ComputationNode.h"

namespace Microsoft { namespace MSR { namespace CNTK {

// -----------------------------------------------------------------------

@@ -86,33 +87,31 @@ namespace Microsoft { namespace MSR { namespace CNTK {
typedef std::shared_ptr<DelayedValueNodeState<ElemType>> DelayedNodeStatePtr;
static const std::wstring TypeName() { return L"DelayedValue"; }
private:
void Init(size_t row_size, size_t col_size, ElemType initialActivationValue = (ElemType)DEFAULT_HIDDEN_ACTIVATION)
void Init(const TensorShape & sampleLayout, ElemType initialActivationValue)
{
m_initialActivationValue = initialActivationValue;
m_timeStep = 1;
CreateMatrixIfNull(m_value);
SetDims(TensorShape(row_size), col_size); // TODO: needed? Can we not infer it? How about setting a sample layout?
m_isHistoryCarryOverManagedExternally = false; // used for PairNetworkNode/PastValueNode combination
SetDims(sampleLayout, 0); // TODO: needed? Can we not infer it? How about setting a sample layout?
m_isHistoryCarryOverManagedExternally = false; // used for PairNetworkNode/PastValueNode combination, which is deprecated
m_value->SetValue(m_initialActivationValue); // is this needed?
}
protected:
DelayedValueNodeBase(DEVICEID_TYPE deviceId, const wstring & name) :
Base(deviceId, name),
m_delayedActivation(deviceId)
{
Init(1, 1);
Init(TensorShape(), (ElemType)DEFAULT_HIDDEN_ACTIVATION);
}
DelayedValueNodeBase(DEVICEID_TYPE deviceId, const wstring & name, ElemType initialActivationValue, size_t row_size, size_t col_size, size_t timeStep) :
DelayedValueNodeBase(DEVICEID_TYPE deviceId, const wstring & name, ElemType initialActivationValue, const TensorShape & sampleLayout, size_t timeStep) :
Base(deviceId, name),
m_delayedActivation(deviceId)
{
Init(row_size, col_size, initialActivationValue);

m_timeStep = (int)timeStep;

m_value->SetValue(m_initialActivationValue);
Init(sampleLayout, initialActivationValue);
m_timeStep = (int)timeStep; // TODO: pass this to Init() instead as well
}
DelayedValueNodeBase(const ScriptableObjects::IConfigRecordPtr configp) :
DelayedValueNodeBase(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"defaultHiddenActivation"), configp->Get(L"rows"), configp->Get(L"cols"), configp->Get(L"timeStep"))
DelayedValueNodeBase(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"defaultHiddenActivation"), configp->Get(L"shape"), configp->Get(L"timeStep"))
{
// We do NOT attach the inputs, as we cannot resolve them without causing a circular reference.
// Instead, we capture them in a lambda, which will be called by ComputationNetwork during the build process through LateAttachInputs() below.

@@ -593,8 +592,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
PastValueNode(DEVICEID_TYPE deviceId, const wstring & name) :
Base(deviceId, name)
{ }
PastValueNode(DEVICEID_TYPE deviceId, const wstring & name, ElemType initialActivationValue, size_t row_size, size_t col_size, size_t timeStep) :
Base(deviceId, name, initialActivationValue, row_size, col_size, timeStep)
PastValueNode(DEVICEID_TYPE deviceId, const wstring & name, ElemType initialActivationValue, const TensorShape & sampleLayout, size_t timeStep) :
Base(deviceId, name, initialActivationValue, sampleLayout, timeStep)
{ }
PastValueNode(DEVICEID_TYPE deviceId, const wstring & name, ElemType initialActivationValue, size_t numRows, size_t timeStep) :
PastValueNode(deviceId, name, initialActivationValue, TensorShape(numRows), timeStep)
{ }
PastValueNode(const ScriptableObjects::IConfigRecordPtr configp) :
Base(configp)

@@ -619,8 +621,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
FutureValueNode(DEVICEID_TYPE deviceId, const wstring & name) :
Base(deviceId, name)
{ }
FutureValueNode(DEVICEID_TYPE deviceId, const wstring & name, ElemType initialActivationValue, size_t row_size, size_t col_size, size_t timeStep) :
Base(deviceId, name, initialActivationValue, row_size, col_size, timeStep)
FutureValueNode(DEVICEID_TYPE deviceId, const wstring & name, ElemType initialActivationValue, const TensorShape & sampleLayout, size_t timeStep) :
Base(deviceId, name, initialActivationValue, sampleLayout, timeStep)
{ }
FutureValueNode(DEVICEID_TYPE deviceId, const wstring & name, ElemType initialActivationValue, size_t numRows, size_t timeStep) :
FutureValueNode(deviceId, name, initialActivationValue, TensorShape(numRows), timeStep)
{ }
FutureValueNode(const ScriptableObjects::IConfigRecordPtr configp) :
Base(configp)
@@ -126,8 +126,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {

#define UsingReinterpretNodeBaseMembers UsingComputationNodeMembersBoilerplate

// TODO: This ReshapeNode is currently not used. Its function will be taken over by Transpose and the Reshape that follows this one below.

// -----------------------------------------------------------------------
// ReshapeNode (input) -- reinterpret input matrix as having different dimensions
// DeprecatedReshapeNode (input) -- reinterpret input matrix as having different dimensions
// where the new row dimension is given, and the column dimension is inferred.
// Also optionally associate a different TensorShape with the data.
//

@@ -149,7 +151,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// E.g. ReinterpretRowStackAsSequence and ReinterpretSequenceAsRowStack.
// BUGBUG: This is not actually implemented yet. Instead, it goes from 1 to K steps or from K to 1 step. This is temporary/experimental, until the plumbing for nesting is there.
//
// Thirdly, ReshapeNode can also be used to update only the TensorShape. In that case, the MBLayout is kept as is.
// Thirdly, DeprecatedReshapeNode can also be used to update only the TensorShape. In that case, the MBLayout is kept as is.
//
// Note: The new row dimension must be a straight multiple or divisor of the current row dimension.
// To reshape to a non-multiple go to row dim 1 first.

@@ -159,19 +161,20 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// -----------------------------------------------------------------------

template<class ElemType>
class ReshapeNode : public ReinterpretNodeBase<ElemType>
class DeprecatedReshapeNode : public ReinterpretNodeBase<ElemType>
{
typedef ReinterpretNodeBase<ElemType> Base; UsingReinterpretNodeBaseMembers;
static const std::wstring TypeName() { return L"Reshape"; }
static const std::wstring TypeName() { return L"DeprecatedReshape"; }
public:
ReshapeNode(DEVICEID_TYPE deviceId, const wstring & name, size_t numRows = 0, const TensorShape & imageLayout = TensorShape()) :
DeprecatedReshapeNode(DEVICEID_TYPE deviceId, const wstring & name, size_t numRows = 0, const TensorShape & imageLayout = TensorShape()) :
Base(deviceId, name),
m_numTargetRows(numRows),
m_targetImageLayout(imageLayout)
{ }
ReshapeNode(const ScriptableObjects::IConfigRecordPtr configp) :
ReshapeNode(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"numRows"), ImageLayoutWHC(configp->Get(L"imageWidth"), configp->Get(L"imageHeight"), configp->Get(L"imageChannels")))
DeprecatedReshapeNode(const ScriptableObjects::IConfigRecordPtr configp) :
DeprecatedReshapeNode(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"numRows"), ImageDimensions::AsTensorShape(configp->Get(L"imageWidth"), configp->Get(L"imageHeight"), configp->Get(L"imageChannels"), ImageLayoutKind::HWC/*legacy*/))
{
// BUGBUG: We should not operate on image layouts here, but on a proper tensor layout.
AttachInputs(configp, this->GetExpectedNumInputs());
}

@@ -180,7 +183,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Base::CopyTo(nodeP, newName, flags);
if (flags & CopyNodeFlags::copyNodeValue)
{
auto node = dynamic_pointer_cast<ReshapeNode<ElemType>>(nodeP);
auto node = dynamic_pointer_cast<DeprecatedReshapeNode<ElemType>>(nodeP);
node->m_numTargetRows = m_numTargetRows;
node->m_targetImageLayout = m_targetImageLayout;
}

@@ -197,7 +200,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
Base::Load(fstream, modelVersion);
fstream >> m_numTargetRows;
m_targetImageLayout.Load(fstream);
m_targetImageLayout.Load(fstream, /*acceptLegacyFormat=*/true);
}

virtual void /*IComputationNode::*/PrintSelfBeforeValidation() const override

@@ -214,7 +217,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
else
fprintf(stderr, "%ls[%lu, %lu]", child->NodeName().c_str(), child->GetNumRows(), child->GetNumCols());
}
fprintf(stderr, ", NumOfRows=%lu, imageWidth=%lu, imageHeight=%lu, imageChannels=%lu)", m_numTargetRows, m_targetImageLayout.GetWidth(), m_targetImageLayout.GetHeight(), m_targetImageLayout.GetNumChannels());
fprintf(stderr, ", NumOfRows=%lu, imageWidth=%lu, imageHeight=%lu, imageChannels=%lu)", m_numTargetRows, m_targetImageLayout[1], m_targetImageLayout[2], m_targetImageLayout[0]);
// BUGBUG: This interpretation as image dims is only correct for the legacy format, not for cudnn.
}

virtual void /*ComputationNodeBase::*/Validate(bool isFinalValidationPass) override

@@ -247,7 +251,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {

// setting any dimension to 0 means lose the tensor, flatten to vector
// TODO: We can use 0 to indicate "infer". One value can be 0. It will be filled in to match row dim.
if (m_targetImageLayout.GetWidth() == 0 || m_targetImageLayout.GetHeight() == 0 || m_targetImageLayout.GetNumChannels() == 0)
if (m_targetImageLayout[1] == 0 || m_targetImageLayout[2] == 0 || m_targetImageLayout[0] == 0)
{
if (Input(0)->HasSampleLayout())
fprintf(stderr, "WARNING: Reshape operation cannot inherit image size information from its child. Image size info is lost.\n");

@@ -257,7 +261,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
else
{
if (m_numTargetRows != m_targetImageLayout.GetNumElements())
LogicError("ReshapeNode: InferTargetSampleLayout() computed a sample layout [%s] that mismatches m_numTargetRows %d.", string(m_targetImageLayout).c_str(), (int)m_numTargetRows);
LogicError("DeprecatedReshapeNode: InferTargetSampleLayout() computed a sample layout [%s] that mismatches m_numTargetRows %d.", string(m_targetImageLayout).c_str(), (int)m_numTargetRows);
SetDims(m_targetImageLayout, newCols);
}
}

@@ -289,7 +293,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
// going from many samples to one: layout entry will get no flags
if (Input(0)->GetNumTimeSteps() * Input(0)->GetNumRows() / m_numTargetRows != 1)
LogicError("ReshapeNode::BeginForwardProp() faking to remove a nested time dimension only works when going back to a single frame per sequence.");
LogicError("DeprecatedReshapeNode::BeginForwardProp() faking to remove a nested time dimension only works when going back to a single frame per sequence.");
// we are in frame mode now
m_pMBLayout->InitAsFrameMode(Input(0)->GetNumParallelSequences());
}

@@ -297,7 +301,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
// going from one sample to many: layout will get SentenceStart/SentenceEnd flags for the sequence we expand into
if (Input(0)->GetMBLayout()->GetNumTimeSteps() != 1)
LogicError("ReshapeNode::BeginForwardProp() faking to add a nested time dimension only works when coming from a single frame per sequence.");
LogicError("DeprecatedReshapeNode::BeginForwardProp() faking to add a nested time dimension only works when coming from a single frame per sequence.");
m_pMBLayout->Init(Input(0)->GetNumParallelSequences(), Input(0)->GetNumTimeSteps() * Input(0)->GetNumRows() / m_numTargetRows);
for (size_t s = 0; s < m_pMBLayout->GetNumParallelSequences(); s++)
m_pMBLayout->AddSequence(NEW_SEQUENCE_ID, s, 0, m_pMBLayout->GetNumTimeSteps());

@@ -325,7 +329,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// layout case: reshape semantics happens across parallel sequences, i.e. requiring data shuffling
else
{
// TODO: It does not make sense to run ReshapeNode frame-by-frame inside a loop, because it changes the time base.
// TODO: It does not make sense to run DeprecatedReshapeNode frame-by-frame inside a loop, because it changes the time base.
// However, in the future, we should be able to run inside an outer loop.
if (!fr.IsAllFrames())
InvalidArgument("%ls %ls operation cannot be run from inside a loop since it changes the time base.", NodeName().c_str(), OperationName().c_str());

@@ -358,14 +362,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {

virtual bool OutputUsedInComputingInputNodesGradients() const override
{
// The ReshapeNode does not require its output value for computing
// The DeprecatedReshapeNode does not require its output value for computing
// the gradients of its input nodes
return false;
}

virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override
{
// The ReshapeNode does not require any of its inputs' values for computing
// The DeprecatedReshapeNode does not require any of its inputs' values for computing
// the gradients of its input nodes
UNREFERENCED_PARAMETER(childIndex);
return false;

@@ -377,35 +381,39 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t factor() const { return m_numTargetRows > Input(0)->GetNumRows() ? m_numTargetRows / Input(0)->GetNumRows() : Input(0)->GetNumRows() / m_numTargetRows; } // factor by which we stack or unstack
TensorShape m_targetImageLayout;

// this patches up m_targetImageLayout according to some rules
// TODO: Say in one sentence what this logic does.
// This infers dimensions in m_targetImageLayout.
// Users are allowed to provide 2 (out of 3) image dimensions.
// One missing dimension can be inferred. If two dimensions are
// unspecified it throws a runtime error.
// TODO: Generalize this to any number of dimensions.
void InferTargetSampleLayout()
{
if (m_targetImageLayout.GetWidth() > 0)
// BUGBUG: Below is the result of refactoring and only works for rank-3 tensors. Generalize.
if (m_targetImageLayout[1] > 0)
{
if (m_targetImageLayout.GetHeight() > 0)
if (m_targetImageLayout[2] > 0)
{
if (m_targetImageLayout.GetNumChannels() > 0)
if (m_targetImageLayout[0] > 0)
{
if (m_targetImageLayout.GetNumElements() != m_numTargetRows)
RuntimeError("Image dimensions do not match row size.");
}
else
{
if (m_numTargetRows % (m_targetImageLayout.GetWidth() * m_targetImageLayout.GetHeight()) > 0)
if (m_numTargetRows % (m_targetImageLayout[1] * m_targetImageLayout[2]) > 0)
RuntimeError("Image row size is not a multiple of specified image dimensions.");
else
m_targetImageLayout = ImageLayoutWHC(m_targetImageLayout.GetWidth(), m_targetImageLayout.GetHeight(), m_numTargetRows / (m_targetImageLayout.GetWidth() * m_targetImageLayout.GetHeight()));
m_targetImageLayout = TensorShape(m_numTargetRows / (m_targetImageLayout[1] * m_targetImageLayout[2]), m_targetImageLayout[1], m_targetImageLayout[2]);
}
}
else
{
if (m_targetImageLayout.GetNumChannels() > 0)
if (m_targetImageLayout[0] > 0)
{
if (m_numTargetRows % (m_targetImageLayout.GetWidth() * m_targetImageLayout.GetNumChannels()) > 0)
if (m_numTargetRows % (m_targetImageLayout[1] * m_targetImageLayout[0]) > 0)
RuntimeError("Image row size is not a multiple of specified image dimensions.");
else
m_targetImageLayout = ImageLayoutWHC(m_targetImageLayout.GetWidth(), m_numTargetRows / (m_targetImageLayout.GetWidth() * m_targetImageLayout.GetNumChannels()), m_targetImageLayout.GetNumChannels());
m_targetImageLayout = TensorShape(m_targetImageLayout[0], m_targetImageLayout[1], m_numTargetRows / (m_targetImageLayout[1] * m_targetImageLayout[0]));
}
else
{

@@ -415,26 +423,173 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
else
{
if (m_targetImageLayout.GetHeight() > 0)
if (m_targetImageLayout[2] > 0)
{
if (m_targetImageLayout.GetNumChannels() > 0)
if (m_targetImageLayout[0] > 0)
{
if (m_numTargetRows % (m_targetImageLayout.GetHeight() * m_targetImageLayout.GetNumChannels()) > 0)
if (m_numTargetRows % (m_targetImageLayout[2] * m_targetImageLayout[0]) > 0)
RuntimeError("Image row size is not a multiple of specified image dimensions.");
else
m_targetImageLayout = ImageLayoutWHC(m_numTargetRows / (m_targetImageLayout.GetHeight() * m_targetImageLayout.GetNumChannels()), m_targetImageLayout.GetHeight(), m_targetImageLayout.GetNumChannels());
m_targetImageLayout = TensorShape(m_targetImageLayout[0], m_numTargetRows / (m_targetImageLayout[2] * m_targetImageLayout[0]), m_targetImageLayout[2]);
}
else
RuntimeError("At least two image dimensions must be specified.");
}
else if (m_targetImageLayout.GetNumChannels() > 0)
else if (m_targetImageLayout[0] > 0)
RuntimeError("At least two image dimensions must be specified.");
else
m_targetImageLayout = ImageLayoutWHC(m_numTargetRows, 1, 1);
m_targetImageLayout = TensorShape(1, m_numTargetRows, 1);
}
}
};

template class DeprecatedReshapeNode<float>;
template class DeprecatedReshapeNode<double>;
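The case analysis in InferTargetSampleLayout() boils down to one rule: given the total row count and two of the three image dimensions, infer the third, erroring out when the known dimensions do not divide the row count. A hypothetical distilled form (standalone C++, illustrative names only, not the node's code):

    #include <cstddef>
    #include <cstdio>

    // Given numRows and any two of [C, W, H] (0 = unspecified), infer the third.
    // Returns false when two dims are missing or the known dims do not divide numRows.
    bool InferThirdDim(size_t numRows, size_t& c, size_t& w, size_t& h)
    {
        size_t* dims[3] = { &c, &w, &h };
        size_t product = 1; size_t* missing = nullptr;
        for (auto d : dims)
        {
            if (*d == 0) { if (missing) return false; missing = d; } // two unknowns: give up
            else product *= *d;
        }
        if (!missing) return c * w * h == numRows;       // fully specified: just check
        if (numRows % product != 0) return false;        // not a clean multiple
        *missing = numRows / product;                    // fill in the inferred dimension
        return true;
    }

    int main()
    {
        size_t c = 0, w = 2, h = 3;                      // channels left to be inferred
        if (InferThirdDim(12, c, w, h))
            printf("inferred [C x W x H] = [%zu x %zu x %zu]\n", c, w, h); // 2 x 2 x 3
    }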

// -----------------------------------------------------------------------
// Reshape(x, tensorShape, beginDim=0, endDim=0) -- reinterpret input samples as having different tensor dimensions
// - just replaces metadata m_sampleLayout, does not change data values
// - one dimension may be specified as 0 and will be inferred
// - optional beginDim/endDim restrict the replacement to a sub-range of dims, for implementing ReshapeDimension() and FlattenRank()
// - may not be applied to time; use Permute() or Transpose()
//
// Derived operations:
//
// ReshapeDimension(x, dim, tensorShape) = Reshape(x, tensorShape, beginDim=dim, endDim=dim+1)
// - reinterprets one dimension as multiple, where the number of elements remains the same
// - one of the new dimensions may be specified as 0 and will be inferred
//
// FlattenDimensions(x, dim, num) = Reshape(x, 0, beginDim=dim, endDim=dim+num)
// - replace two or more consecutive dims by a single dim with the same number of elements
//
// SplitDimension(x, dim, N) = ReshapeDimension(x, dim, 0:N)
// - splits a dimension in two, injecting the split-off part as a new tensor dimension
// - to split stacked frames into a new time dimension:
// insert new time dim with ReshapeDimension(., -1, 0:1), SplitDimension(., dim, N), Transpose(., dim+1, -1), then Select(., dim+1, 0) away the new time dim
// This would make 4 copies presently. We may need a compound C++ node for now.
// - note: to split into multiple outputs (like tf.split()), use a BrainScript loop with Slice().
// -----------------------------------------------------------------------
|
||||
|
||||
template<class ElemType>
|
||||
class ReshapeNode : public UnaryElementWiseNode<ElemType>
|
||||
{
|
||||
typedef UnaryElementWiseNode<ElemType> Base; UsingUnaryElementwiseNodeBaseMembers;
|
||||
static const std::wstring TypeName() { return L"Reshape"; }
|
||||
public:
|
||||
ReshapeNode(DEVICEID_TYPE deviceId, const wstring & name, const TensorShape & replacementSampleLayout = TensorShape(), int beginDim = 1, int endDim = 0) :
|
||||
Base(deviceId, name),
|
||||
m_replacementSampleLayout(replacementSampleLayout), m_beginDimParameter(beginDim), m_endDimParameter(endDim)
|
||||
{ }
|
||||
ReshapeNode(const ScriptableObjects::IConfigRecordPtr configp) :
|
||||
ReshapeNode(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"shape"), configp->Get(L"beginDim"), configp->Get(L"endDim"))
|
||||
{
|
||||
AttachInputs(configp, this->GetExpectedNumInputs());
|
||||
}
|
||||
|
||||
virtual void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override
|
||||
{
|
||||
Base::CopyTo(nodeP, newName, flags);
|
||||
if (flags & CopyNodeFlags::copyNodeValue)
|
||||
{
|
||||
auto node = dynamic_pointer_cast<ReshapeNode<ElemType>>(nodeP);
|
||||
node->m_replacementSampleLayout = m_replacementSampleLayout;
|
||||
}
|
||||
}
|
||||
|
||||
virtual void Save(File& fstream) const override
|
||||
{
|
||||
Base::Save(fstream);
|
||||
fstream << m_beginDimParameter << m_endDimParameter;
|
||||
m_replacementSampleLayout.Save(fstream);
|
||||
}
|
||||
|
||||
virtual void Load(File& fstream, size_t modelVersion) override
|
||||
{
|
||||
Base::Load(fstream, modelVersion);
|
||||
fstream >> m_beginDimParameter >> m_endDimParameter;
|
||||
m_replacementSampleLayout.Load(fstream);
|
||||
}
|
||||
|
||||
virtual void /*ComputationNodeBase::*/Validate(bool isFinalValidationPass) override
|
||||
{
|
||||
Base::Validate(isFinalValidationPass);
|
||||
|
||||
// BUGBUG: For inputs without MBLayout, the sample layout should include the column dimension, but it does not currently. Needs to be fleshed out.
|
||||
const auto & inputSampleLayout = Input(0)->GetSampleLayout();
|
||||
const auto & inputDims = inputSampleLayout.GetDims();
|
||||
|
||||
auto replacementDims = m_replacementSampleLayout.GetDims();
|
||||
|
||||
size_t beginDim = m_beginDimParameter > 0 ? m_beginDimParameter - 1 : 0;
|
||||
size_t endDim = m_endDimParameter > 0 ? m_endDimParameter - 1 : inputDims.size();
|
||||
if (!isFinalValidationPass) // non-final: be tolerant, no errors
|
||||
{
|
||||
if (endDim > inputDims.size())
|
||||
endDim = inputDims.size();
|
||||
if (beginDim > endDim)
|
||||
beginDim = endDim;
|
||||
}
|
||||
|
||||
// TODO: We should allow to reduce to a 0-length tensor if the dimension is 0
|
||||
|
||||
// if a dimension is specified as zero then infer it, otherwise verify that total #elements matches
|
||||
size_t inputElements = 1; // get #elements in range to be replaced
|
||||
for (size_t k = beginDim; k < endDim; k++)
|
||||
inputElements *= inputDims[k];
|
||||
size_t targetElements = 1; // check/infer #elements to replace with
|
||||
size_t zeroIndex = SIZE_MAX;
|
||||
for (size_t k = 0; k < replacementDims.size(); k++)
|
||||
{
|
||||
if (replacementDims[k] != 0)
|
||||
targetElements *= replacementDims[k];
|
||||
else if (zeroIndex == SIZE_MAX)
|
||||
zeroIndex = k;
|
||||
else
|
||||
InvalidArgument("%ls %ls operation: More than one dimension was specified as zero in the replacement (sub-)dimensions [%s]", NodeName().c_str(), OperationName().c_str(), string(m_replacementSampleLayout).c_str());
|
||||
}
|
||||
if (zeroIndex != SIZE_MAX)
|
||||
replacementDims[zeroIndex] = inputElements / targetElements; // infer the number (ignore errors at this point)
|
||||
|
||||
// assemble actual full dimension vector
|
||||
SmallVector<size_t> dims;
|
||||
dims.append(inputDims.begin(), inputDims.begin() + beginDim);
|
||||
dims.append(replacementDims.begin(), replacementDims.end());
|
||||
dims.append(inputDims.begin() + endDim, inputDims.end());
|
||||
auto sampleLayout = TensorShape(dims);
|
||||
|
||||
// validate total dimension
|
||||
if (isFinalValidationPass && inputSampleLayout.GetNumElements() != sampleLayout.GetNumElements())
|
||||
{
|
||||
auto subShape = TensorShape(std::vector<size_t>(inputDims.begin() + beginDim, inputDims.begin() + endDim));
|
||||
InvalidArgument("%ls %ls operation: Input (sub-)dimensions [%s] incompatible with desired (sub-)dimensions [%s]. Number of elements %s.",
|
||||
NodeName().c_str(), OperationName().c_str(),
|
||||
string(subShape).c_str(), string(m_replacementSampleLayout).c_str(),
|
||||
zeroIndex == SIZE_MAX ? "must be the same" : "is not an integer multiple of the non-0 dimensions");
|
||||
}
|
||||
|
||||
// that's it
|
||||
SetDims(sampleLayout, 0); // BUGBUG: This is incorrect if we have no MBLayout, e.g. reshaping a bias vector into a different tensor dimension
|
||||
}
|
||||
|
||||
virtual void /*ComputationNode::*/ForwardProp(const FrameRange & fr) override
|
||||
{
|
||||
ValueFor(fr).SetValue(Input(0)->ValueFor(fr));
|
||||
}
|
||||
|
||||
virtual void /*ComputationNode::*/BackpropTo(const size_t inputIndex, const FrameRange & fr) override
|
||||
{
|
||||
Input(inputIndex)->GradientFor(fr).SetValue(GradientFor(fr));
|
||||
}
|
||||
|
||||
virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; }
|
||||
virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; }
|
||||
|
||||
private:
|
||||
TensorShape m_replacementSampleLayout; // user-specified dimensions to replace dimensions [beginDim, endDim]
|
||||
int m_beginDimParameter; // 1-based index range as specified
|
||||
int m_endDimParameter;
|
||||
};
|
||||
|
||||
template class ReshapeNode<float>;
|
||||
template class ReshapeNode<double>;
|
||||
|
||||
|
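To make Validate()'s zero-dimension inference concrete, here is a standalone sketch (made-up dimensions; my own illustration of the same arithmetic, not node code): replacing the dims of a [2 x 3 x 4] layout with 0:6 infers the 0 as 24 / 6 = 4.

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

int main()
{
    std::vector<size_t> inputDims = { 2, 3, 4 };    // input sample layout [2 x 3 x 4]
    std::vector<size_t> replacementDims = { 0, 6 }; // one dim given as 0 -> to be inferred
    size_t beginDim = 0, endDim = inputDims.size(); // replace the full range here

    size_t inputElements = 1;                       // #elements in the range being replaced
    for (size_t k = beginDim; k < endDim; k++)
        inputElements *= inputDims[k];

    size_t targetElements = 1, zeroIndex = SIZE_MAX; // product of the non-zero replacement dims
    for (size_t k = 0; k < replacementDims.size(); k++)
    {
        if (replacementDims[k] != 0)
            targetElements *= replacementDims[k];
        else
            zeroIndex = k;                           // (the node rejects more than one zero)
    }
    if (zeroIndex != SIZE_MAX)
        replacementDims[zeroIndex] = inputElements / targetElements; // 24 / 6 = 4
    printf("[%zu x %zu]\n", replacementDims[0], replacementDims[1]); // -> [4 x 6]
    return 0;
}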
@@ -811,4 +966,196 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    template class RowRepeatNode<float>;
    template class RowRepeatNode<double>;

    /*

    notes on tensor operations
    ==========================

    reshaping
    ---------

     - on dimension index 'dim' and 'tensorShape'
        - tensorShape: a vector of dimensions, e.g. 640:480:3:30 could describe a 1-second RGB video of VGA dimensions at 30 fps
        - 'dim' specifies a specific tensor index
           - dim > 0 is a regular sample index. E.g. for a matrix, dim=1 would be the row dimension, and dim=2 in the above example has dimension 480.
           - dim < 0 denotes time indices (recurrent loops); dim=-1 is the innermost time index.
           - dim = 0 denotes the index of the parallel sequence
              - Since all operations logically operate on a single sequence, parallel sequences generally cannot be indexed by the user.
              - Exceptions: training criteria, BatchNormalization, ...WithNegativeSamples (we should not need this)
        - I don't like that 'dim' refers to the index of the dimension as well as the number of elements in that dimension. Axis (numpy)?

     - Reshaping:   --these are all implemented in C++ by DeprecatedReshapeNode
        - Reshape(x, tensorShape, beginDim=0, endDim=0)
           - just replaces metadata m_sampleLayout
           - one dimension may be specified as 0 and will be inferred
           - optional beginDim/endDim restrict the replacement to a sub-range of dims, for implementing ReshapeDimension() and FlattenDimensions()
           - may not be applied to time; use Permute() or Transpose()
        - ReshapeDimension(x, dim, tensorShape) = Reshape(x, tensorShape, beginDim=dim, endDim=dim+1)
           - reinterprets one dimension as multiple, where the number of elements remains the same
           - one of the new dimensions may be specified as 0 and will be inferred
        - FlattenDimensions(x, dim, num) = Reshape(x, 0, beginDim=dim, endDim=dim+num)
           - replaces two or more consecutive dims by a single dim with the same number of elements
        - SplitDimension(x, dim, N) = ReshapeDimension(x, dim, 0:N)
           - splits a dimension into two, moving the second factor into a new tensor dimension
           - to split stacked frames into a new time dimension:
             insert a new time dim with ReshapeDimension(., -1, 0:1), SplitDimension(., dim, N), Transpose(., dim+1, -1), then Select(., dim+1, 0) away the new time dim
             This would make 4 copies presently. We may need a compound C++ node for now.
        - note: to split into multiple outputs (like tf.split()), use a BrainScript loop with Slice().
     - Slicing   --all implemented in C++ by SliceNode
        - Slice(x, dim, begin, end, stride=1, phase=0)
           - reduces a dim to index range [begin,end)
           - negative bounds specify "from end" (end=0 means end if stride>0, and begin=0 means end if stride<0) (a worked sketch of this bound arithmetic follows this notes block)
           - also applies to time, e.g.:
              - pick last frame of a sequence (for s2s): Slice(x, -1, -1, 0)  // first -1 is dim and means the time index
              - trim first and last 3 frames of a sequence: Slice(x, -1, 3, -3)  // 3 means begin at frame 3, -3 means end is 3rd frame from the end
              - this will update the MBLayout
           - the optional stride and phase parameters are for implementing downsampling (stride>1) and reversing (begin=-1, stride=-1)
           - multiple slice operations can be combined by concatenating the spec vector, e.g. Slice(x, dim1:dim2, begin1:begin2, end1:end2)
           - today's RowSlice(begin, num, x) = Slice(x, 1, begin, begin + num)
           - like torch.narrow()
           - can implement TF unpack() and Torch split() as a BrainScript loop with multiple Slice() operations
           - internally implemented by tensor lib opCopy with manipulated m_strides/m_offset
        - Select(x, dim, index) = FlattenDimensions(Slice(x, dim, index, index+1), index > 1 ? index-1 : index, index > 1 ? index : index+1)
           - narrow dim to a single index, then drop the dim. Result will have one dim less.
           - like torch.select()
           - can implement squeezing a dim-1 dim: Select(x, dim, 0)
        - Squeeze(x, dim) = Select(x, dim, 0)
     - Splicing:   --all implemented in C++ by SpliceNode
        - Splice(inputs, dim)
           - splice multiple inputs inputs[0]:inputs[1]:... along given dim (=RowStack for vectors)
           - inputs must have identical dimensions except for:
              - the specified dim
              - broadcasting dimensions (e.g. used to implement Pad())
           - one can splice in time
              - e.g. prepend a vector to a time sequence
              - this will create a new MBLayout
           - like tf.concat()
        - Pack(inputs, dim) = ReshapeDimension(Splice(inputs, dim), dim, (0:Length(inputs)) )
           - like splice but inserts a new dim of dimension Length(inputs)
           - inputs must have identical dimensions for all dims (except for broadcasting)
           - dim can be a time dimension; then a new inner-most time dimension will be inserted
           - like tf.pack()
        - Pad(x, dim, howManyBefore, howManyAfter, with=0) = Splice(Constant(with, tensorShape=1*(dim-1):howManyBefore), x, Constant(with, tensorShape=1*(dim-1):howManyAfter), dim)
           - inverse of slice, pad with a constant value
           - dimensions are specified relative; can pad at start and end
           - in time: pad neighbor frames
        - Repeat(x, dim, numRepeats) = Splice(x*numRepeats, dim)
           - generalizes CNTK RowRepeat(x, numRepeats) = Repeat(x, 1, numRepeats)
           - to repeat multiple, specify vectors, e.g. Repeat(x, dim1:dim2, numRepeats1:numRepeats2)
           - like tf.tile() and Matlab's repmat()
     - Transposition (permuting dims):   --implemented in C++ by PermuteDimensionsNode
        - PermuteDimensionsOf(x, dim1:dim2:...:dimN)
           - dims are rotated to dim2:dim3:...:dimN:dim1; other dims remain untouched
             To rotate the other way round, specify them in opposite order.
             We specify it this way to be able to reference the time dimension without having to know the rank of the m_sampleLayout.
           - time dims must have a constant duration for all items in the minibatch
           - internally implemented with tensor lib by shuffling dimensions with their strides   --TODO: check if TensorShape optimization is still correct
        - Transpose(x, dim1, dim2) = PermuteDimensions(x, dim1:dim2)
           - any two dimensions; including time (must have constant duration)
           - like torch.transpose()
     - Re-indexing:   --implemented by ReindexRankNode and SliceNode
        - ReindexDimension(x, dim, indexVector)
           - splice x[..., indexVector[0], ...], x[..., indexVector[1], ...], etc. with indexVector[.] at given dim
           - indexVector must be invertible if it is intended to backpropagate through this node
        - DownsampleDimension(x, dim, n, phase=0) = Slice(x, dim, 0, 0, stride=n)
           - select every n-th element, starting with index 'phase'
           - time dims allowed. Phase is then a modulus w.r.t. where a sequence is inside the minibatch (may require a ReconcileLayout() before to match layouts)
        - ReverseDimension(x, dim) = Slice(x, dim, -1, 0, stride=-1)
           - reverses the direction of a dim
           - when applied to time dims, this creates a new layout (which is also flipped)

     - misc.:
        - note: much would look more natural if we had OO syntax, e.g. x.Slice(dim, begin, end).FlattenDimensions(...)
          Could be done by exposing all methods on ComputationNode... not currently feasible with BrainScript, but e.g. with Python bindings
        - torch.unfold (dim, size, step)
           - create a convolution matrix (stride magic)
        - CyclicallyPermuteRank(x, dim, step)
           - rotates indices
           - also applies to time dimensions
        - duplicate elements
        - Gather
           - from Torch and TF
        - TF also has:
           - 'gather': reindexing
           - 'dynamic_partition', 'dynamic_stitch'
        - Torch:
           - expand (dim, range): broadcasts dimension 'dim' as a new dimension with 'range'. Not needed I think.
           - repeatTensor: like tile but with weird reshaping
           - squeeze: removes all singleton dimensions, or a specific one. We can remove a specific one with Select().
        - TODO:
           - give names to dimensions?
           - do we want to allow time offsets in layouts?

    reductions
    ----------

     - ReduceSum
        - sum over all elements of a dimension, or over time
     - ReduceMax
        - max
     - ReduceMean
        - average
     - ArgMax, ArgMin
        - we already have that somewhere, for evaluation
     - All, Any
        - logical test   --must be done over sequences
     - TF also has:
        - reduce_prod, reduce_min
        - segment_sum etc.; we use sequences
        - listdiff
        - where: indices of 'true' values -> 2D tensor of coordinates
        - unique (1D only)
        - edit_distance
        - invert_permutation: invert a permutation index vector
        - top_k

    convolutions
    ------------

     - convolution
        - convolution with filter
     - max pool (=convolution with weights 1 and max reduction)
     - av pool (=convolution with uniform filter)
     - also in time: by specifying more filter dimensions [TODO]
     - tricky bit: boundaries; may need expansion or reduction of sequences

    element-wise operations
    -----------------------

     - PlusNode, MinusNode, ElementTimes
        - with broadcasting, these implement:
           - PlusNode with bias, PlusNode for images
           - 1-x
           - ScaleNode, RowElementTimes, ColumnElementTimes
     - elementwise nonlinearities as usual [TODO: complete them]
     - logical ops (can be done by comparison ops actually)
     - Clamp
        - bounds are passed as 'Const'
     - TF: in_top_k
     - Torch performs these ops (e.g. add) as vector ops, without broadcasting
        - e.g. max reduces, while cmax does not. Our solution is better... really? How to specify reduce?

    gradient operations
    -------------------

     - TF: these are nodes, e.g. clip_by_value
        - inputs should be parameters as well, so they can be computed
        - need a node to stop gradient propagation?
        - can we use nodes to specify things like AdaGrad and momentum?

    debugging
    ---------

     - node that prints activations
     - node that prints mean/var of gradients

    other
    -----

     - per-node learning rate: can we specify an additional parameter for each node? Maybe fold with updateLearnableParameter?
     - give dimensions a name?
     - can we interleave variable-length ones? Concat into a single dimension, using strides?

    */

}}}
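To make the Slice() bound conventions in the notes above concrete, a small standalone sketch (my own illustration, not SliceNode code) of how negative begin/end resolve against a dimension of length 10, following the "from end" rule:

#include <cstdio>

// resolve a possibly-negative slice bound against a dimension of length dim
// (end = 0 means "up to the end" when stride > 0, per the notes above)
static int ResolveBound(int bound, int dim, bool isEnd)
{
    if (bound < 0)
        return bound + dim;  // "from end"
    if (isEnd && bound == 0)
        return dim;          // end=0 -> dim (for stride > 0)
    return bound;
}

int main()
{
    int dim = 10;
    // trim first and last 3 frames: Slice(x, -1, 3, -3) -> index range [3, 7)
    printf("[%d, %d)\n", ResolveBound(3, dim, false), ResolveBound(-3, dim, true));
    // pick the last frame: Slice(x, -1, -1, 0) -> index range [9, 10)
    printf("[%d, %d)\n", ResolveBound(-1, dim, false), ResolveBound(0, dim, true));
    return 0;
}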
@@ -1367,6 +1367,16 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        RequestMatrixFromPool(m_softmaxOfRight, matrixPool);
        RequestMatrixFromPool(m_gammaFromLattice, matrixPool);
    }

    // Release gradient and temp matrices that are no longer needed after all the children's gradients are computed.
    virtual void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool)
    {
        Base::ReleaseMatricesAfterBackprop(matrixPool);
        ReleaseMatrixToPool(m_logSoftmaxOfRight, matrixPool);
        ReleaseMatrixToPool(m_softmaxOfRight, matrixPool);
        ReleaseMatrixToPool(m_gammaFromLattice, matrixPool);
    }

    // TODO: method names should be CamelCase
    std::vector<shared_ptr<const msra::dbn::latticepair>> * getLatticePtr()
    {
@@ -39,7 +39,7 @@
MATH_API DEVICEID_TYPE EnforceOneGPUOnly(DEVICEID_TYPE requestedDeviceId);

namespace Microsoft { namespace MSR { namespace CNTK {

    // -----------------------------------------------------------------------
    // ElementWiseOperator -- This enum represents which function to apply.
@@ -48,41 +48,52 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    enum ElementWiseOperator
    {
        // nullary
        opConstOne,
        // unary (or binary with constant parameter)
        opCopy,
        opNegate, opNot,
        opAbs,
        opSigmoid, opSigmoidDerivative, opTanh, opSqrt, opExp, opLog, opLinearRectifierDerivative, opCosine, opNegativeSine,
        // these are not implemented yet:
        opSaturateBetaAlpha, opSumAlpha, opSubDifferenceToAlpha, opSubDifferenceFromAlpha,
        opSigmoid, opTanh, opSqrt, opExp, opLog, opLinearRectifier, opCosine,
        // unary ops for use by Matrix class only (there is no TensorView implementation)
        opSigmoidDerivative, opLinearRectifierDerivative, opNegativeSine,
        // binary
        opSum, opDifference, opElementwiseProduct, opElementwiseQuotient,
        opLogSum, opMax, opMin,
        opEQ, opNE, opGT, opLT, opGE, opLE,
        opAnd, opOr, opXor,
        opMaskNegative,
        opElementwiseProductWithSigmoidDerivativeFromOutput, opElementwiseProductWithTanhDerivativeFromOutput,
        opElementwiseProductWithLinearRectifierDerivativeFromOutput, opElementwiseProductWithLogDerivativeFromOutput, opElementwiseProductWithCosDerivative,
        // binary ops for indexing
        //opIndex,
        // ternary
        opCond
        // Note: not all of the above are actually implemented at present; and not all that's implemented has an opcode.
        opCond/*a ? b : c*/, opClip/*clip a within interval b..c*/
        // Note: not all that's implemented in CNTK ComputationNodes has an opcode yet.
    };

    // helper to apply a C macro for all operations of each kind
#define ForAllNullaryOps(Macro) \
    Macro(ConstOne);

#define ForAllUnaryOps(Macro) \
    Macro(Copy); \
    Macro(Negate); Macro(Not); \
    Macro(Abs); \
    Macro(Sigmoid); Macro(SigmoidDerivative); Macro(Tanh); Macro(Sqrt); Macro(Exp); Macro(Log); Macro(LinearRectifierDerivative); Macro(Cosine); Macro(NegativeSine);

#define ForAllParameterizedUnaryOps(Macro) \
    Macro(SaturateBetaAlpha); Macro(SumAlpha); Macro(SubDifferenceToAlpha); Macro(SubDifferenceFromAlpha);
    Macro(Sigmoid); Macro(Tanh); Macro(Sqrt); Macro(Exp); Macro(Log); Macro(LinearRectifier); Macro(Cosine);

#define ForAllBinaryOps(Macro) \
    Macro(Sum); Macro(Difference); Macro(ElementwiseProduct); Macro(ElementwiseQuotient); \
    Macro(LogSum); Macro(Max); Macro(Min); \
    Macro(EQ); Macro(NE); Macro(GT); Macro(LT); Macro(GE); Macro(LE); \
    Macro(MaskNegative);
    Macro(And); Macro(Or); Macro(Xor); \
    Macro(MaskNegative); \
    Macro(ElementwiseProductWithSigmoidDerivativeFromOutput); Macro(ElementwiseProductWithTanhDerivativeFromOutput); \
    Macro(ElementwiseProductWithLinearRectifierDerivativeFromOutput); Macro(ElementwiseProductWithLogDerivativeFromOutput); Macro(ElementwiseProductWithCosDerivative); \
    //Macro(Index);

#define ForAllTernaryOps(Macro) \
    Macro(Cond);
    Macro(Cond); Macro(Clip);

    // -----------------------------------------------------------------------
    // various enums to describe
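For readers unfamiliar with this idiom, a tiny self-contained sketch (hypothetical Op functions; mirrors the CaseUnaryTensorOp pattern used by the tensor kernels later in this change) of how the ForAll...Ops macros expand into switch cases:

#include <cstdio>

enum Op { opCopy, opNegate };
static double OpCopy(double a)   { return a; }
static double OpNegate(double a) { return -a; }

#define ForAllDemoOps(Macro) Macro(Copy); Macro(Negate);
#define CaseDemoOp(oper) case op ## oper: return Op ## oper(a)

static double Compute(double a, Op op)
{
    switch (op)
    {
    ForAllDemoOps(CaseDemoOp); // expands to: case opCopy: return OpCopy(a); case opNegate: return OpNegate(a);
    default: return 0;
    }
}

int main() { printf("%g\n", Compute(3.0, opNegate)); return 0; } // prints -3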
@@ -51,6 +51,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    // GPU and 1-dimensional image
    bool gpuSparse1D = (inT.h() == 1 &&
                        in.GetCurrentMatrixLocation() == CurrentDataLocation::GPU &&
                        convDesc.wStride() == 1 &&
                        !convDesc.padding() &&
                        in.GetMatrixType() == MatrixType::SPARSE);

    out.SwitchToMatrixType(MatrixType::DENSE, MatrixFormat::matrixFormatDense, false);

@@ -67,8 +69,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        size_t startSampleId = i * subBatchSize;
        size_t endSampleId = min(batchSize, startSampleId + subBatchSize);
        size_t smallBatchSize = endSampleId - startSampleId;

        workspace.Resize(packedInputRows, packedInputColsPerSample * smallBatchSize);
        Mat inputSubBatch;

        // We optimize for three different scenarios here by handling them slightly differently.
@@ -78,10 +78,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        if (in.GetMatrixType() == MatrixType::DENSE)
            inputSubBatch = in.ColumnSlice(startSampleId, smallBatchSize);
        else
        {
            inputSubBatch.SetValue(in.ColumnSlice(startSampleId, smallBatchSize), in.GetFormat());
            inputSubBatch.SwitchToMatrixType(MatrixType::DENSE, MatrixFormat::matrixFormatDense, true);
        }

        if (gpuSparse1D)
        {
@@ -94,6 +91,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        }
        else
        {
            inputSubBatch.SwitchToMatrixType(MatrixType::DENSE, MatrixFormat::matrixFormatDense, true);
            workspace.AssignPackedConvolutionInput(inputSubBatch,
                                                   inT.w(), inT.h(), inT.c(),
                                                   outT.w(), outT.h(), outT.c(),
@@ -101,6 +99,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                                                   convDesc.padding());

            Mat outputSubBatch = out.ColumnSlice(outputSizePerChannel * startSampleId, outputSizePerChannel * smallBatchSize);

            workspace.Resize(packedInputRows, packedInputColsPerSample * smallBatchSize);
            Mat::Multiply(filter, false, workspace, false, outputSubBatch);
        }
    }
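The sub-batching arithmetic above is worth spelling out once (a standalone sketch with made-up sizes, not engine code): with batchSize = 100 and subBatchSize = 32, the loop visits [0,32), [32,64), [64,96), [96,100), where min() clamps the final slice.

#include <algorithm>
#include <cstddef>
#include <cstdio>

int main()
{
    size_t batchSize = 100, subBatchSize = 32; // made-up sizes
    size_t numSubBatches = (batchSize + subBatchSize - 1) / subBatchSize;
    for (size_t i = 0; i < numSubBatches; i++)
    {
        size_t startSampleId = i * subBatchSize;
        size_t endSampleId = std::min(batchSize, startSampleId + subBatchSize);
        printf("[%zu, %zu) -> %zu samples\n", startSampleId, endSampleId, endSampleId - startSampleId);
    }
    return 0;
}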
@@ -197,6 +197,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    // GPU and 1-dimensional image
    bool gpuSparse1D = (inT.h() == 1 &&
                        in.GetCurrentMatrixLocation() == CurrentDataLocation::GPU &&
                        convDesc.wStride() == 1 &&
                        !convDesc.padding() &&
                        in.GetMatrixType() == MatrixType::SPARSE);

    if (numSubBatches == 1 && allowReuse && !gpuSparse1D) // reuse packed input from evaluation step if it's not changed by either subbatch or recurrent steps.
@@ -209,18 +211,40 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        size_t startSampleID = i * subBatchSize;
        size_t endSampleID = min(batchSize, startSampleID + subBatchSize);
        size_t smallBatchSize = endSampleID - startSampleID;

        workspace.Resize(packedInputRows, packedInputColsPerSample * smallBatchSize);
        Matrix<ElemType> inputSubBatch = in.ColumnSlice(startSampleID, smallBatchSize);
        inputSubBatch.SwitchToMatrixType(MatrixType::DENSE, inputSubBatch.GetFormat(), true);
        workspace.AssignPackedConvolutionInput(inputSubBatch,
                                               inT.w(), inT.h(), inT.c(),
                                               srcGradT.w(), srcGradT.h(), srcGradT.c(),
                                               filterT.w(), filterT.h(), convDesc.wStride(), convDesc.hStride(),
                                               convDesc.padding());

        Matrix<ElemType> outputGradientSubBatch = srcGradTmp.ColumnSlice(startSampleID * outputSizePerChannel, smallBatchSize * outputSizePerChannel);
        Matrix<ElemType>::MultiplyAndAdd(outputGradientSubBatch, false, workspace, true, filter);

        // We optimize for three different scenarios here by handling them slightly differently.
        // [Scenario 1] Dense: Unroll using AssignPackedConvolutionInput and multiply.
        // [Scenario 2] Sparse 1-D convolution on GPU: for text scenarios we have a specific kernel.
        // [Scenario 3] Sparse all others: convert to dense. Temporary work-around - allocating/de-allocating memory is costly!
        if (gpuSparse1D)
        {
            Matrix<ElemType> inputSubBatch;
            inputSubBatch.SetValue(in.ColumnSlice(startSampleID, smallBatchSize));
            inputSubBatch.Reshape(inT.c(), smallBatchSize * inT.w());
            Matrix<ElemType> inputSubBatchSparseReordered(inputSubBatch.GetNumCols(), inputSubBatch.GetNumRows(), inputSubBatch.GetDeviceId(), MatrixType::SPARSE, MatrixFormat::matrixFormatSparseCSC);
            Matrix<ElemType>::TensorShuffleScaleAndAdd(0.0f, inputSubBatch.Transpose(), 1, inT.w(), 1, smallBatchSize, inT.c(), 1.0f, inputSubBatchSparseReordered, inputSubBatchSparseReordered);

            Matrix<ElemType> outputGradientSubBatchReordered = Matrix<ElemType>::Zeros(smallBatchSize * srcGradT.w(), srcGradT.c(), outputGradientSubBatch.GetDeviceId());
            Matrix<ElemType>::TensorShuffleScaleAndAdd(0.0f, outputGradientSubBatch.Transpose(), 1, srcGradT.w(), 1, smallBatchSize, srcGradT.c(), 1.0f, outputGradientSubBatchReordered, outputGradientSubBatchReordered);

            filter.Reshape(srcGradT.c() * filterT.w(), inT.c());
            Matrix<ElemType>::ConvolveAndWeightedAdd(1, outputGradientSubBatchReordered, true, inputSubBatchSparseReordered, false, 1, filter, smallBatchSize, convDesc.wStride(), convDesc.padding(), false);
            filter.Reshape(srcGradT.c(), inT.c() * filterT.w());
        }
        else
        {
            workspace.Resize(packedInputRows, packedInputColsPerSample * smallBatchSize);
            Matrix<ElemType> inputSubBatch = in.ColumnSlice(startSampleID, smallBatchSize);
            inputSubBatch.SwitchToMatrixType(MatrixType::DENSE, inputSubBatch.GetFormat(), true);
            workspace.AssignPackedConvolutionInput(inputSubBatch,
                                                   inT.w(), inT.h(), inT.c(),
                                                   srcGradT.w(), srcGradT.h(), srcGradT.c(),
                                                   filterT.w(), filterT.h(), convDesc.wStride(), convDesc.hStride(),
                                                   convDesc.padding());

            Matrix<ElemType>::MultiplyAndAdd(outputGradientSubBatch, false, workspace, true, filter);
        }
    }
}
@@ -239,7 +263,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    assert(outT.w() * outT.h() * outT.c() == out.GetNumRows());
    assert(outT.n() == out.GetNumCols());

    Mat o = out.ColumnSlice(0, out.GetNumCols());
    Mat o = out.ColumnSlice(0, out.GetNumCols()); // same as .AsReference()
    Mat d = dst.Reshaped(biasT.c(), outT.w() * outT.h() * outT.n());
    d.AssignSumOf(o.Reshaped(biasT.c(), outT.w() * outT.h() * outT.n()), bias);
}
@@ -410,23 +434,30 @@ namespace Microsoft { namespace MSR { namespace CNTK {
};

template<class ElemType>
std::unique_ptr<ConvolutionEngineFactory<ElemType>> ConvolutionEngineFactory<ElemType>::Create(DEVICEID_TYPE deviceId, EngineType engType)
std::unique_ptr<ConvolutionEngineFactory<ElemType>> ConvolutionEngineFactory<ElemType>::Create(DEVICEID_TYPE deviceId, EngineType engType, ImageLayoutKind imageLayoutKind)
{
    if (engType == EngineType::Auto)
    {
        // REVIEW alexeyk: make cuDNN default when running on GPU and compiled with cuDNN, add config parameter to enable runtime switch between implementations.
        if (deviceId >= 0 && CuDnnConvolutionEngineFactory<ElemType>::IsSupported(deviceId))
            return std::make_unique<CuDnnConvolutionEngineFactory<ElemType>>();
        return std::make_unique<DefaultConvolutionEngineFactory<ElemType>>();
        if (deviceId >= 0 && CuDnnConvolutionEngineFactory<ElemType>::IsSupported(deviceId) && imageLayoutKind == ImageLayoutKind::CHW)
            return Create(deviceId, EngineType::CuDnn, imageLayoutKind);
        else
            return Create(deviceId, EngineType::Legacy, imageLayoutKind);
    }
    else if (engType == EngineType::CuDnn)
    {
        if (imageLayoutKind != ImageLayoutKind::CHW)
            InvalidArgument("ConvolutionEngineFactory: ImageLayout '%s' is not compatible with the cuDNN engine.", ToString(imageLayoutKind).c_str());
        if (deviceId >= 0 && CuDnnConvolutionEngineFactory<ElemType>::IsSupported(deviceId))
            return std::make_unique<CuDnnConvolutionEngineFactory<ElemType>>();
        RuntimeError("cuDNN convolution engine is not supported, check the device id and whether the code was compiled with cuDNN.");
    }
    else if (engType == EngineType::Legacy)
    {
        if (imageLayoutKind != ImageLayoutKind::HWC)
            InvalidArgument("ConvolutionEngineFactory: ImageLayout '%s' is not compatible with the legacy convolution engine.", ToString(imageLayoutKind).c_str());
        return std::make_unique<DefaultConvolutionEngineFactory<ElemType>>();
    }

    RuntimeError("Not supported convolution engine type: %d.", engType);
}
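A hedged usage sketch of the selection logic above (illustrative only; assumes the surrounding CNTK types are in scope): Auto resolves to the cuDNN engine only on a capable GPU with CHW layout and otherwise falls back to the legacy HWC engine, while an explicit engine type fails fast on a layout mismatch.

// illustrative only -- relies on ConvolutionEngineFactory as declared in this change
std::unique_ptr<ConvolutionEngineFactory<float>> MakeConvFactory(DEVICEID_TYPE deviceId, ImageLayoutKind layout)
{
    // Auto: cuDNN for GPU + CHW, else legacy; the CuDnn/Legacy choices instead InvalidArgument on a wrong layout
    return ConvolutionEngineFactory<float>::Create(deviceId, ConvolutionEngineFactory<float>::EngineType::Auto, layout);
}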
@@ -18,6 +18,7 @@
#endif

#include "Matrix.h"
#include "TensorShape.h" // for ImageLayoutKind

namespace Microsoft { namespace MSR { namespace CNTK {
@@ -252,7 +253,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    virtual PoolEnginePtr CreatePoolEngine(DEVICEID_TYPE deviceId) = 0;

    enum class EngineType { Auto, CuDnn, Legacy };
    static std::unique_ptr<ConvolutionEngineFactory<ElemType>> Create(DEVICEID_TYPE deviceId, EngineType engType = EngineType::Auto);
    static std::unique_ptr<ConvolutionEngineFactory<ElemType>> Create(DEVICEID_TYPE deviceId, EngineType engType, ImageLayoutKind imageLayoutKind);

public:
    ConvolutionEngineFactory(const ConvolutionEngineFactory&) = delete;
@@ -10,11 +10,7 @@
#ifdef USE_CUDNN
#include <cudnn.h>

template<> const char* CudaErrString(cudnnStatus_t x)
{
    return cudnnGetErrorString(x);
}
#define CUDNN_CALL(expr) (CudaCall((expr), #expr, "cuDNN", CUDNN_STATUS_SUCCESS))
template<> const char* CudaErrString<cudnnStatus_t>(cudnnStatus_t x) { return cudnnGetErrorString(x); }

// A note on the formats: CNTK originally used NHWC for input/output tensors and CHWN for filters.
// Such formats have very limited support in cuDNN and are not used in other frameworks.
@@ -5,25 +5,27 @@
//

#include "stdafx.h"
#include "Basics.h"
#include "BestGpu.h"
#include "DebugUtil.h"

#ifndef CPUONLY

#include "cublas_v2.h"
#include "Basics.h"
#include "GPUMatrix.h"
#include "GPUMatrixCUDAKernels.cuh"
#include "GPUSparseMatrix.h"
#include "GPUTensor.h"
#include "CommonMatrix.h"
#define TENSOR_OPS_DECL __device__ __host__
#include "TensorOps.h"
#include "device_launch_parameters.h"
#include <assert.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <curand.h>
#include <curand_kernel.h>
#include "cublas_v2.h"
#include <assert.h>
#include <memory>

#pragma comment (lib, "cudart.lib") // instruct linker to reference these libs
#pragma comment (lib, "cublas.lib")
@@ -47,8 +49,6 @@ bool do_sync = true;
#ifdef _WIN32
// thread local storage to access the current stream, initialize to default stream
__declspec (thread)
#else
static
#endif
cudaStream_t t_stream = cudaStreamDefault;
@@ -78,9 +78,9 @@ cudaStream_t MATH_API GetStream()
        performElementWiseFunction(ElementWiseOperator::op##f, a.m_pArray); \
        return *this; }

static const char * CudaErrString(cudaError_t x) { cudaDeviceSynchronize(); return cudaGetErrorString(x); }
static const char * CudaErrString(cublasStatus_t) { cudaDeviceSynchronize(); return "(see cublas_api.h & look for cublasStatus_t or CUBLAS_STATUS_xxx)"; }
static const char * CudaErrString(curandStatus) { cudaDeviceSynchronize(); return "(see curand.h & look for curandStatus or CURAND_STATUS_xxx)"; }
template<> const char * CudaErrString<cudaError_t>(cudaError_t x) { cudaDeviceSynchronize(); return cudaGetErrorString(x); }
template<> const char * CudaErrString<cublasStatus_t>(cublasStatus_t) { cudaDeviceSynchronize(); return "(see cublas_api.h & look for cublasStatus_t or CUBLAS_STATUS_xxx)"; }
template<> const char * CudaErrString<curandStatus>(curandStatus) { cudaDeviceSynchronize(); return "(see curand.h & look for curandStatus or CURAND_STATUS_xxx)"; }

namespace Microsoft { namespace MSR { namespace CNTK {
@@ -384,7 +384,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
#pragma region Constructors and Destructor

    //should only be used by constructors.
    // should only be used by constructors
    template<class ElemType>
    void GPUMatrix<ElemType>::ZeroInit(int deviceId)
    {
@@ -449,13 +449,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        m_numRows = moveFrom.m_numRows;
        m_numCols = moveFrom.m_numCols;
        m_computeDevice = moveFrom.m_computeDevice;
        m_pArray = moveFrom.m_pArray; //shallow copy the pointer
        m_pArray = moveFrom.m_pArray; // shallow copy the pointer
        m_matrixName=moveFrom.m_matrixName;
        m_elemSizeAllocated = moveFrom.m_elemSizeAllocated;
        m_format = moveFrom.m_format;
        m_externalBuffer = moveFrom.m_externalBuffer;

        //release the pointer from the source object so that the destructor won't release it twice
        // release the pointer from the source object so that the destructor won't release it twice
        moveFrom.ZeroInit(0);
    }
@@ -477,10 +477,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    {
        if (this != &moveFrom)
        {
            if (OwnBuffer() && m_pArray!=NULL)
            {
            if (OwnBuffer() && m_pArray)
                CUDA_CALL(cudaFree(m_pArray));
            }

            m_numRows = moveFrom.m_numRows;
            m_numCols = moveFrom.m_numCols;
@@ -500,8 +498,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    GPUMatrix<ElemType>::~GPUMatrix(void)
    {
        Clear();
        if (m_workspace != nullptr)
            delete m_workspace;
        delete m_workspace;
    }

    template<class ElemType>
@@ -3259,6 +3256,16 @@ namespace Microsoft { namespace MSR { namespace CNTK {
#pragma endregion Other helper functions

#pragma region Static BLAS Functions
    // float/double overloads of cublasSgemm()/cublasDgemm()
    static cublasStatus_t cublas_gemm(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const float *alpha, const float *A, int lda, const float *B, int ldb, const float *beta, float *C, int ldc)
    {
        return cublasSgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
    }
    static cublasStatus_t cublas_gemm(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const double *alpha, const double *A, int lda, const double *B, int ldb, const double *beta, double *C, int ldc)
    {
        return cublasDgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
    }

    template<class ElemType>
    void GPUMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const GPUMatrix<ElemType>& a, const bool transposeA, const GPUMatrix<ElemType>& b, const bool transposeB,
                                                     ElemType beta, GPUMatrix<ElemType>& c)
@@ -3278,28 +3285,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        if (beta == 0)
            c.Resize(m,n);
        else
            c.VerifySize(m, n); // Can't resize if beta != 0

        if (!(m>0 && k>0 && l>0 && n>0))
        {
            RuntimeError("!(m>0 && k>0 && l>0 && n>0)"); // converting from size_t to int may cause overflow
        }
        if (k!=l)
        {
            RuntimeError("matrix dim mismatch in MultiplyAndWeightedAdd");
        }
        if (sizeof(ElemType)==sizeof(float))
        {
            CUBLAS_CALL(cublasSgemm(cuHandle,transA,transB,m,n,k,reinterpret_cast<float*>(&alpha),reinterpret_cast<float*>(a.m_pArray),(int)a.m_numRows,reinterpret_cast<float*>(b.m_pArray),(int)b.m_numRows,reinterpret_cast<float*>(&beta),reinterpret_cast<float*>(c.m_pArray),(int)c.m_numRows));
        }
        else if (sizeof(ElemType)==sizeof(double))
        {
            CUBLAS_CALL(cublasDgemm(cuHandle,transA,transB,m,n,k,reinterpret_cast<double*>(&alpha),reinterpret_cast<double*>(a.m_pArray),(int)a.m_numRows,reinterpret_cast<double*>(b.m_pArray),(int)b.m_numRows,reinterpret_cast<double*>(&beta),reinterpret_cast<double*>(c.m_pArray),(int)c.m_numRows));
        }
        else
        {
            RuntimeError("Unsupported template argument in GPUMatrix");
        }
        CUBLAS_CALL(cublas_gemm(cuHandle, transA, transB, m, n, k, &alpha, a.m_pArray, (int)a.m_numRows, b.m_pArray, (int)b.m_numRows, &beta, c.m_pArray, (int)c.m_numRows));
        c.m_numRows=m;
        c.m_numCols=n;
    }
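The same overload trick extends to other BLAS entry points; a sketch of my own (not part of this change), following the cublas_gemm() pattern above, for cublasSaxpy()/cublasDaxpy():

    // float/double overloads of cublasSaxpy()/cublasDaxpy(), in the style of cublas_gemm() above
    static cublasStatus_t cublas_axpy(cublasHandle_t handle, int n, const float *alpha, const float *x, int incx, float *y, int incy)
    {
        return cublasSaxpy(handle, n, alpha, x, incx, y, incy);
    }
    static cublasStatus_t cublas_axpy(cublasHandle_t handle, int n, const double *alpha, const double *x, int incx, double *y, int incy)
    {
        return cublasDaxpy(handle, n, alpha, x, incx, y, incy);
    }
    // usage inside a template<class ElemType> member: CUBLAS_CALL(cublas_axpy(cuHandle, n, &alpha, a.m_pArray, 1, c.m_pArray, 1));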
@@ -4436,396 +4428,29 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        CUDA_CALL(cudaFree(d_zeta));
    };

    // =======================================================================
    // TensorView support
    // =======================================================================

    // BUGBUG: This is a stub that currently is just the CPU code. This is not functional yet.

    // To save time, this makes extensive use of templates and macros.

    // -----------------------------------------------------------------------
    // simple fixed-size arrays for passing dimension information by value
    // since CUDA can't just take our std::array and std::vector
    // -----------------------------------------------------------------------

    template<typename T, size_t N>
    struct FixedArray
    {
        T m_data[N];
        __device__ __host__ size_t size() const { return N; }
        __device__ __host__ T & operator[](size_t n) { return m_data[n]; }
        __device__ __host__ T operator[](size_t n) const { return m_data[n]; }
        template<class VEC> FixedArray(const VEC & data) // construct from CPU-side STL array or vector
        {
            assert(data.size() == N);
            for (size_t n = 0; n < N; n++)
            {
                m_data[n] = (T)data[n];
                if (m_data[n] != data[n]) // overflow check
                    InvalidArgument("FixedArray: Dimensions out of range, too few bits.");
            }
        }
    };
    template<typename T> // specialized version for 0 elements
    struct FixedArray<T, 0>
    {
        __device__ __host__ size_t size() const { return 0; }
        template<class VEC> FixedArray(const VEC & data) { assert(data.size() == 0); UNUSED(data); }
    };

    template<typename T, size_t N, size_t K> // N = which input/output; K = index depth
    struct FixedMatrix
    {
        T m_data[N][K];
        __device__ __host__ size_t getNumRows() const { return N; }
        __device__ __host__ size_t getNumCols() const { return K; }
        __device__ __host__ T & operator()(size_t n, size_t k) { return m_data[n][k]; }
        __device__ __host__ T operator()(size_t n, size_t k) const { return m_data[n][k]; }
        template<typename U> FixedMatrix(const array<SmallVector<U>, N> & data) // construct from CPU-side array of vectors
        {
            assert(data.size() == N);
            for (size_t n = 0; n < N; n++)
            {
                assert(data[n].size() == K);
                for (size_t k = 0; k < K; k++)
                {
                    m_data[n][k] = (T)data[n][k];
                    if (m_data[n][k] != data[n][k]) // overflow check
                        InvalidArgument("FixedMatrix: Dimensions out of range, too few bits.");
                }
            }
        }
    };
    template<typename T, size_t N> // specialized version for 0 elements
    struct FixedMatrix<T, N, 0>
    {
        __device__ __host__ size_t getNumRows() const { return N; }
        __device__ __host__ size_t getNumCols() const { return 0; }
        template<typename U> FixedMatrix(const array<SmallVector<U>, N> & data) { assert(data.size() == N); for (size_t n = 0; n < N; n++) assert(data[n].size() == 0); UNUSED(data); }
    };
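A small usage sketch (my own, with made-up dimensions) of the checked converting constructors above, which let host-side STL containers be passed into kernels by value:

    // illustrative only: host-side containers convert into by-value kernel arguments
    static void FixedArrayUsageExample()
    {
        std::vector<size_t> dims = { 640, 480, 3 };  // e.g. a VGA RGB tensor (made-up)
        FixedArray<unsigned int, 3> opDims(dims);    // element-wise copy with overflow check
        assert(opDims.size() == 3 && opDims[1] == 480);
        // FixedArray<unsigned int, 2> bad(dims);    // would trip the assert: size mismatch
    }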
    // -----------------------------------------------------------------------
    // TensorView entry points from Matrix.cpp
    // -----------------------------------------------------------------------

    // helper to provide a vector of ones of at least the given number of elements
    // TODO: Use this to implement ComputationNode::ConstOnes? Or do we even need that anymore?
    template<class ElemType>
    static shared_ptr<GPUMatrix<ElemType>> GetOnesVector(size_t N, DEVICEID_TYPE deviceId)
    {
        // using an array of shared_ptrs because those are thread-safe. The objects themselves are immutable.
        // And using a plain array so this will never get freed, avoiding free-after-DLL-unload issues.
        static shared_ptr<GPUMatrix<ElemType>> onesCache[32]; // cache of objects
        if (deviceId >= _countof(onesCache))
            LogicError("GetOnesVector: onesCache[] too small (%d entries), increase (you need %d) and recompile.", (int)_countof(onesCache), (int)deviceId+1);
        auto p = onesCache[deviceId];
        if (!p || p->GetNumRows() < N) // must (re-)allocate
        {
            p = make_shared<GPUMatrix<ElemType>>(GPUMatrix<ElemType>::Ones(N, 1, deviceId));
            onesCache[deviceId] = p; // this will replace the pointer thread-safely (although weird race conditions may happen where a larger entry is overwritten by a smaller one; will still run correctly)
        }
        return p;
    }

    // -----------------------------------------------------------------------
    // function to actually compute a function of (N-1) inputs based on the opcode
    // -----------------------------------------------------------------------

    template<class ElemType>
    struct TensorOps
    {
        static __device__ ElemType Compute(const FixedArray<ElemType*, 2> & pointers, ElementWiseOperator op)
        {
            ElemType a = *(pointers[0]);
#define CaseUnaryTensorOp(oper) case ElementWiseOperator::op ## oper: return Op ## oper(a)
            switch (op)
            {
            ForAllUnaryOps(CaseUnaryTensorOp);
            default: return 0; // (failure)
            }
        }
        static __device__ ElemType Compute(const FixedArray<ElemType*, 3> & pointers, ElementWiseOperator op)
        {
            ElemType a = *(pointers[0]);
            ElemType b = *(pointers[1]);
#define CaseBinaryTensorOp(oper) case ElementWiseOperator::op ## oper: return Op ## oper(a,b)
            switch (op)
            {
            ForAllBinaryOps(CaseBinaryTensorOp); // note: this costs about 6% compared to having only a single case
            default: return 0; // (failure)
            }
        }
        static __device__ ElemType Compute(const FixedArray<ElemType*, 4> & pointers, ElementWiseOperator op)
        {
            ElemType a = *(pointers[0]);
            ElemType b = *(pointers[1]);
            ElemType c = *(pointers[2]);
#define CaseTernaryTensorOp(oper) case ElementWiseOperator::op ## oper: return Op ## oper(a,b,c)
            switch (op)
            {
            ForAllTernaryOps(CaseTernaryTensorOp);
            default: return 0; // (failure)
            }
        }
    };
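A usage sketch for the cache above (my own illustration, not part of this change): repeated requests on the same device share the cached column unless a longer one is required.

    // illustrative only: the cached ones-column is reused across calls on the same device
    static void GetOnesVectorUsageExample()
    {
        auto ones1 = GetOnesVector<float>(1024, /*deviceId=*/0); // allocates a [1024 x 1] column of ones
        auto ones2 = GetOnesVector<float>( 512, /*deviceId=*/0); // shorter request: reuses the 1024-row column
        assert(ones1.get() == ones2.get());
        auto ones3 = GetOnesVector<float>(4096, /*deviceId=*/0); // longer: re-allocates and replaces the cache entry
        assert(ones3->GetNumRows() == 4096);
    }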
    // -----------------------------------------------------------------------
    // function to compute the value for a given output location (perform reduction if needed)
    // -----------------------------------------------------------------------

#define C_size_t CUDA_LONG
#define C_int CUDA_LONG
#define C_unsigned_int CUDA_LONG

    template<class ElemType, C_size_t N, C_int M, C_int m>
    struct TensorOpReduce
    {
        // this version for m >= 0
        static __device__ ElemType Compute(FixedArray<ElemType*, N> pointers, ElementWiseOperator op,
                                           const FixedArray<C_unsigned_int, M> & reducingOpDims, const FixedMatrix<C_int, N, M> & reducingStrides)
        {
            // start with index 0
            // Using 'double' since we are memory-bound anyway.
            double/*ElemType*/ aggregate = TensorOpReduce<ElemType, N, M, m - 1>::Compute(pointers, op, reducingOpDims, reducingStrides);
            // apply this index to the pointers
            C_size_t dim = reducingOpDims[m];
            for (C_size_t k = 1/*done with k=0 already*/; k < dim; k++)
            {
                // bump the pointers
                for (C_size_t i = 0; i < N; i++)
                    pointers[i] += reducingStrides(i,(C_size_t)m);
                ElemType val = TensorOpReduce<ElemType, N, M, m - 1>::Compute(pointers, op, reducingOpDims, reducingStrides);
                aggregate += val;
            }
            return (ElemType)aggregate;
        }
    };

    // this one terminates the template recursion over reduction dimensions
    // The pointers are pointing to the input element.
    template<class ElemType, C_size_t N, C_int M>
    struct TensorOpReduce<ElemType, N, M, /*m=*/-1>
    {
        // this version for m = -1
        // the pointers are pointing to the right location(s) to take the operation over
        static __device__ ElemType Compute(FixedArray<ElemType*, N> pointers, ElementWiseOperator op,
                                           const FixedArray<C_unsigned_int, M> & /*reducingOpDims*/, const FixedMatrix<C_int, N, M> & /*reducingStrides*/)
        {
            return TensorOps<ElemType>::Compute(pointers, op); // finally computing something!
        }
    };
    // -----------------------------------------------------------------------
    // perform loop over regular index k for N-nary operations (N counting the output)
    // -----------------------------------------------------------------------

    // The canonical case, vector op without reduction, is this PTX function:
    // _ZN9Microsoft3MSR4CNTK15_launchTensorOpIfLi3ELi0ELi1EEEvT_NS1_10FixedArrayIPS3_XT0_EEES3_NS1_19ElementWiseOperatorENS4_IiXT2_EEENS1_11FixedMatrixIiXT0_EXT2_EEENS4_IiXT1_EEENS9_IiXT0_EXT1_EEEi
    //                                      float ^      ^ aggregate loop
    //                                      args? ^       ^ input dims
    // _ZN9Microsoft3MSR4CNTK15_launchTensorOpIfLi2ELi0ELi1EEEvT_NS1_10FixedArrayIPS3_XT0_EEES3_NS1_19ElementWiseOperatorENS4_IiXT2_EEENS1_11FixedMatrixIiXT0_EXT2_EEENS4_IiXT1_EEENS9_IiXT0_EXT1_EEEi

    // increment a pointer by a number of elements
    // This will later change into pre-scaled strides.
    template<class ElemType>
    static __device__ void IncPtr(ElemType * &p, C_int index, C_int stride)
    {
        //p = (ElemType*)(byteOffset + (char *)p);
        p = p + index * stride;
    }

    // The 'pointers' only refer to a single element, so we will bump them in-place to perform indexing.
    template<class ElemType, C_size_t N, C_int M, C_int K, C_int k>
    struct TensorOpElement
    {
        // template-recursive version loops over indices
        static __device__ void Compute(CUDA_LONG id, ElemType beta, FixedArray<ElemType*, N> & pointers, ElemType alpha, ElementWiseOperator op,
                                       const FixedArray<C_unsigned_int, K> & regularOpStrides, const FixedMatrix<C_int, N, K> & regularStrides,
                                       const FixedArray<C_unsigned_int, M> & reducingOpDims, const FixedMatrix<C_int, N, M> & reducingStrides)
        {
            // map id (location on grid) to index[k]
            C_size_t stride = regularOpStrides[(C_size_t)k];
            C_size_t index = id / stride; // this dimension
            id = id % stride;             // remaining dimensions inside this
            // apply this index to the pointers
            for (C_size_t i = 0; i < N; i++)
                pointers[i] += index * regularStrides(i,(C_size_t)k); // now this dimension is taken care of
            // process the previous index
            TensorOpElement<ElemType, N, M, K, k - 1>::Compute(id, beta, pointers, alpha, op, regularOpStrides, regularStrides, reducingOpDims, reducingStrides);
        }
    };

    // specialization for k=0 where stride is guaranteed to be 1
    template<class ElemType, C_size_t N, C_int M, C_int K>
    struct TensorOpElement<ElemType, N, M, K, /*k=*/0>
    {
        // template-recursive version loops over indices
        static __device__ void Compute(CUDA_LONG id, ElemType beta, FixedArray<ElemType*, N> & pointers, ElemType alpha, ElementWiseOperator op,
                                       const FixedArray<C_unsigned_int, K> & regularOpStrides, const FixedMatrix<C_int, N, K> & regularStrides,
                                       const FixedArray<C_unsigned_int, M> & reducingOpDims, const FixedMatrix<C_int, N, M> & reducingStrides)
        {
            // map id (location on grid) to index[k]
            C_size_t index = id; // this dimension
            // apply this index to the pointers
            for (C_size_t i = 0; i < N; i++)
                pointers[i] += index * regularStrides(i,0); // now this dimension is taken care of
            // process the previous index
            TensorOpElement<ElemType, N, M, K, -1>::Compute(/*id*/0, beta, pointers, alpha, op, regularOpStrides, regularStrides, reducingOpDims, reducingStrides);
        }
    };

    // specialization for k = -1 terminates the template recursion
    template<class ElemType, C_size_t N, C_int M, C_int K>
    struct TensorOpElement<ElemType, N, M, K, /*k=*/-1>
    {
        // template-recursion-terminating version computes the actual value for this output location
        // now the pointers point to the right element
        static __device__ void Compute(CUDA_LONG /*id*/, ElemType beta, FixedArray<ElemType*, N> & pointers, ElemType alpha, ElementWiseOperator op,
                                       const FixedArray<C_unsigned_int, K> & /*regularOpStrides*/, const FixedMatrix<C_int, N, K> & /*regularStrides*/,
                                       const FixedArray<C_unsigned_int, M> & reducingOpDims, const FixedMatrix<C_int, N, M> & reducingStrides)
        {
            // compute the operation for this output coordinate
            // This may still involve a reduction over inverse-broadcasting dimensions.
            ElemType val = TensorOpReduce<ElemType, N, M, M - 1>::Compute(pointers, op, reducingOpDims, reducingStrides);
            // scale
            val *= alpha;
            // combine with previous value in target matrix, then write it out
            auto * pout = pointers[N - 1];
            if (beta != 0)
                val += beta * *pout;
            // save
            *pout = val;
        }
    };
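To make the index recursion above concrete, a worked example (my own, with made-up dimensions):

    // worked example (made-up dims): output tensor [4 x 3], so regularOpStrides = { 1, 4 }
    // and id runs over 0..11. For id = 7, the k=1 step computes index = 7 / 4 = 1 and
    // leaves id = 7 % 4 = 3; the k=0 step then takes index = 3 (its stride is known to be 1).
    // Thread 7 thus addresses element (3, 1), each pointer having been bumped by
    // 3 * regularStrides(i, 0) + 1 * regularStrides(i, 1) elements.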
    // -----------------------------------------------------------------------
    // kernel and launch
    // -----------------------------------------------------------------------

    // the top-level kernel
    template<class ElemType, C_size_t N, C_int M, C_int K>
    __global__ void _launchTensorOp(ElemType beta, FixedArray<ElemType*, N> pointers, ElemType alpha, ElementWiseOperator op,
                                    FixedArray<C_unsigned_int, K> regularOpStrides, FixedMatrix<C_int, N, K> regularStrides,
                                    FixedArray<C_unsigned_int, M> reducingOpDims, FixedMatrix<C_int, N, M> reducingStrides, CUDA_LONG numElements)
    {
        CUDA_LONG id = GridDim::GetLinearThreadId();
        if (id >= numElements)
            return;
        TensorOpElement<ElemType, N, M, K, K - 1>::Compute(id, beta, pointers, alpha, op, regularOpStrides, regularStrides, reducingOpDims, reducingStrides);
    }

    // launch tensor op with CUDA
    // All dimensions (N-ariness, number of input dimensions K and number of reduction dimensions M) are bound to template parameters now.
    template<class ElemType, C_size_t N, C_int M, C_int K>
    static void LaunchTensorOp(ElemType beta, array<ElemType*, N> pointerVector, ElemType alpha, ElementWiseOperator op,
                               const SmallVector<size_t> & regularOpDims, const array<SmallVector<ptrdiff_t>, N> & regularStrideVectors,
                               const SmallVector<size_t> & reducingOpDimVector, const array<SmallVector<ptrdiff_t>, N> & reducingStrideVectors)
    {
        // copy all parameters to CUDA-compatible data structures
        FixedArray<ElemType*, N> pointers(pointerVector);
        SmallVector<C_size_t> regularOpStrideVector; // kernel needs the strides for converting thread index back to multi-dimensional tensor index
        C_size_t numElements = 1;
        for (C_size_t k = 0; k < regularOpDims.size(); k++)
        {
            regularOpStrideVector.push_back(numElements);
            numElements *= (C_size_t)regularOpDims[k];
        }
        FixedArray<C_unsigned_int, K> regularOpStrides(regularOpStrideVector);
        FixedMatrix<C_int, N, K> regularStrides(regularStrideVectors);
        FixedArray<C_unsigned_int, M> reducingOpDims(reducingOpDimVector);
        FixedMatrix<C_int, N, M> reducingStrides(reducingStrideVectors);

        CUDA_LONG NN = (CUDA_LONG)numElements;
        cudaEvent_t done = nullptr;
        if (do_sync) CUDA_CALL(cudaEventCreate(&done));
        GridDim grid(NN);
        _launchTensorOp<ElemType, N, M, K> << <grid.m_blocksPerGrid, grid.m_threadsPerBlock, 0, t_stream >> >(beta, pointers, alpha, op, regularOpStrides, regularStrides, reducingOpDims, reducingStrides, NN);
        if (do_sync) CUDA_CALL(cudaEventRecord(done));
        if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
        if (do_sync) CUDA_CALL(cudaEventDestroy(done));
    }
    // for linear unary ops, we need to define a functor for every function for use as a template parameter (lambda syntax doesn't work in CUDA 7)
#define DefineUnaryTensorFunctor(oper) \
    struct Functor ## oper { template<class ElemType> static __device__ ElemType f(ElemType a) { return Op ## oper(a); } };
    ForAllUnaryOps(DefineUnaryTensorFunctor);

    // the top-level kernel for linear unary ops
    // Note: If we have a beta, we have 2 memory accesses, so this optimization may no longer be needed as we are memory-bound.
    template<class ElemType, class FN>
    __global__ void _launchUnaryTensorOp(ElemType beta, const ElemType * pa, ElemType * pb, ElemType alpha, CUDA_LONG numElements)
    {
        CUDA_LONG id = GridDim::GetLinearThreadId();
        if (id >= numElements)
            return;
        ElemType a = pa[id];
        ElemType val = FN::f(a);
        val *= alpha;
        if (beta != 0)
            val += beta * pb[id];
        pb[id] = val;
    }
    // version without beta and alpha
    template<class ElemType, class FN>
    __global__ void _launchUnaryTensorOp(const ElemType * pa, ElemType * pb, CUDA_LONG numElements)
    {
        CUDA_LONG id = GridDim::GetLinearThreadId();
        if (id >= numElements)
            return;
        ElemType a = pa[id];
        ElemType val = FN::f(a);
        pb[id] = val;
    }

    // special case of linear unary operation
    template<class ElemType>
    static void LaunchUnaryTensorOp(ElemType beta, const ElemType * pa, ElemType * pb, ElemType alpha, ElementWiseOperator op, size_t regularOpDim)
    {
        CUDA_LONG NN = (CUDA_LONG)regularOpDim;

#define CaseLaunchUnaryTensorOp(oper) case ElementWiseOperator::op ## oper: \
        if (beta == 0 && alpha == 1) \
            return _launchUnaryTensorOp<ElemType,Functor ## oper> << <grid.m_blocksPerGrid, grid.m_threadsPerBlock, 0, t_stream >> >(pa, pb, NN); \
        else \
            return _launchUnaryTensorOp<ElemType,Functor ## oper> << <grid.m_blocksPerGrid, grid.m_threadsPerBlock, 0, t_stream >> >(beta, pa, pb, alpha, NN);

        cudaEvent_t done = nullptr;
        if (do_sync) CUDA_CALL(cudaEventCreate(&done));
        GridDim grid(NN);
        switch (op)
        {
        ForAllUnaryOps(CaseLaunchUnaryTensorOp);
        default: LogicError("LaunchTensorOp1: Unknown op code %d.", (int)op);
        }
        if (do_sync) CUDA_CALL(cudaEventRecord(done));
        if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
        if (do_sync) CUDA_CALL(cudaEventDestroy(done));
    }
// -----------------------------------------------------------------------
|
||||
// map runtime parameters N to template parameters
|
||||
// -----------------------------------------------------------------------
|
||||
|
||||
// tensor operation with k+1 dimensions (-1 means scalar)
|
||||
template<class ElemType, C_size_t N, C_int K>
|
||||
static void TensorOpWithRegularLoop(ElemType beta, const array<ElemType*, N> & pointers, ElemType alpha, ElementWiseOperator op,
|
||||
const SmallVector<size_t> & regularOpDims, const array<SmallVector<ptrdiff_t>, N> & regularStrides,
|
||||
const SmallVector<size_t> & reducingOpDims, const array<SmallVector<ptrdiff_t>, N> & reducingStrides)
|
||||
{
|
||||
size_t dims = reducingOpDims.size();
|
||||
switch (dims)
|
||||
{
|
||||
case 2: return LaunchTensorOp<ElemType, N, 2, K>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
|
||||
case 1: return LaunchTensorOp<ElemType, N, 1, K>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
|
||||
case 0: return LaunchTensorOp<ElemType, N, 0, K>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
|
||||
default: LogicError("TensorOp: %d non-flattened reduction dimensions are not supported.", (C_int)dims);
|
||||
}
|
||||
}
|
||||
|
||||
// tensor operation, generalized in number of arguments
|
||||
// This function now expands into different k. It also eliminates the offsets by adding them to the pointers.
|
||||
template<class ElemType, C_size_t N>
|
||||
static void TensorOpN(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, ElementWiseOperator op,
|
||||
const array<size_t, N> & offsets,
|
||||
const SmallVector<size_t> & regularOpDims, const array<SmallVector<ptrdiff_t>, N> & regularStrides,
|
||||
const SmallVector<size_t> & reducingOpDims, const array<SmallVector<ptrdiff_t>, N> & reducingStrides)
|
||||
{
|
||||
for (C_size_t i = 0; i < N; i++) // N = a small constant, this will be unrolled
|
||||
pointers[i] += offsets[i];
|
||||
size_t dims = regularOpDims.size();
|
||||
switch (dims)
|
||||
{
|
||||
case 4: return TensorOpWithRegularLoop<ElemType, N, 4>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
|
||||
case 3: return TensorOpWithRegularLoop<ElemType, N, 3>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
|
||||
case 2: return TensorOpWithRegularLoop<ElemType, N, 2>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
|
||||
case 1: return TensorOpWithRegularLoop<ElemType, N, 1>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
|
||||
case 0: return TensorOpWithRegularLoop<ElemType, N, 0>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
|
||||
default: LogicError("TensorOp: %d non-flattened input dimensions are not supported.", (C_int)dims);
|
||||
}
|
||||
}
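// [Editor's note -- worked example, not part of the original commit.] A broadcast copy,
// e.g. replicating a (100 x 1) column 'a' into each column of a (100 x 20) 'this' with
// op = opCopy, would arrive here (N = 2, counting the output) with something like
//     regularOpDims  = (100, 20)              // the output shape, flattened where possible
//     regularStrides = { (1, 0), (1, 100) }   // input stride 0 over columns = broadcast
//     reducingOpDims = ()                     // no reduction
// and dispatch to TensorOpWithRegularLoop<ElemType, 2, 2>, then LaunchTensorOp<ElemType, 2, 0, 2>.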

// -----------------------------------------------------------------------
// entry points from Matrix.cpp
// -----------------------------------------------------------------------

// perform unary operation 'op' on 'a', giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides
// This binds the N-ariness to a template parameter N, and gets the data pointers out from the matrix objects.
template<class ElemType>

@ -4844,6 +4469,30 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    if (regularOpDims.size() == 1 && regularStrides[0][0] == 1 && regularStrides[1][0] == 1 && reducingOpDims.size() == 0)
        return LaunchUnaryTensorOp<ElemType>(beta, a.m_pArray + offsets[0], m_pArray + offsets[1], alpha, op, regularOpDims[0]);

    // special case: reducing a matrix onto a column vector; can be done with SGEMM
    // Note: A minor risk is that with this, our own reduction function will rarely be used.
    // That function was tested to give the same results with 'double', and nearly the same with 'float' (different summation order matters).
    else if (op == ElementWiseOperator::opCopy &&   // we are just adding to target without any further operation
#ifdef _DEBUG
        sizeof(ElemType) == sizeof(float) &&        // in debug don't shortcut 'double' so we have some test of our own codepath
#endif
        regularOpDims.size() == 1 && regularStrides[0][0] == 1 && regularStrides[1][0] == 1 &&   // we are processing a column
        reducingOpDims.size() == 1 && reducingStrides[0][0] >= (ptrdiff_t)regularOpDims[0])      // reducing across columns and no overlap
    {
        assert(reducingStrides[1][0] == 0);
        auto ARows = regularOpDims[0];      // vertical steps
        auto ACols = reducingOpDims[0];     // horizontal steps (reduction)
        auto ALd = reducingStrides[0][0];   // horizontal step width through matrix
        cublasHandle_t cuHandle = GetCublasHandle(a.GetComputeDeviceId());
        CUBLAS_CALL(cublas_gemm(cuHandle, CUBLAS_OP_N, CUBLAS_OP_N, (int)/*CRows=*/ARows, /*CCols=*/1, (int)ACols, &alpha,
                                /*A00=*/a.m_pArray + offsets[0], (int)ALd,
                                /*B00=*/GetOnesVector<ElemType>(ACols, a.GetComputeDeviceId())->m_pArray, (int)/*BRows=*/ACols, &beta,
                                /*C00=*/m_pArray + offsets[1], (int)/*CRows=*/ARows));
        return;
    }
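    // [Editor's note -- not part of the original commit.] The SGEMM shortcut above uses the
    // identity colsum(A) = A * ones(ACols x 1): multiplying the (ARows x ACols) view of 'a'
    // (leading dimension ALd) by a ones vector reduces across columns in a single cuBLAS call,
    // with alpha/beta supplying the usual scale-and-accumulate semantics.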

    // TODO: Add a special case for tensor bias reduction. cudnn is ~7% faster on Image/QuickE2E.

    // regular case
    else
        return TensorOpN<ElemType, 2>(beta, array<ElemType*, 2> { a.m_pArray, m_pArray }, alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);

@ -4859,6 +4508,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    a.PrepareDevice();
    if (a.GetComputeDeviceId() != GetComputeDeviceId() || b.GetComputeDeviceId() != GetComputeDeviceId())
        InvalidArgument("All matrices must be on the same GPU");

    return TensorOpN<ElemType, 3>(beta, array<ElemType*, 3> { a.m_pArray, b.m_pArray, m_pArray }, alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
}

@ -4875,7 +4525,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    return TensorOpN<ElemType, 4>(beta, array<ElemType*, 4> { a.m_pArray, b.m_pArray, c.m_pArray, m_pArray }, alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
}

// =======================================================================
// explicit instantiations business
// =======================================================================

@ -4886,10 +4535,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template class DeviceBoundNumber<double>;

template<class ElemType>
cublasHandle_t GPUMatrix<ElemType>::s_cuHandle[GPUMatrix<ElemType>::MaxGpus]={0};
cublasHandle_t GPUMatrix<ElemType>::s_cuHandle[GPUMatrix<ElemType>::MaxGpus] = { 0 };

template<class ElemType>
void* GPUMatrix<ElemType>::s_curandGenerator=NULL;
void* GPUMatrix<ElemType>::s_curandGenerator = NULL;

// We use Matrix<char> as the backing store for QuantizedMatrix
// Let's explicitly instantiate the methods we need for that purpose

@ -9,7 +9,7 @@
#include "File.h"
#include "Helpers.h"
#include "CommonMatrix.h"
#include "DataTensor.h"   // only for SmallVector; I was hoping to keep this out
#include "TensorShape.h"  // only for SmallVector; I was hoping to keep this out
#include "DebugUtil.h"
#include "BestGpu.h"      // for CPUONLY macro
#include "ConcStack.h"

@ -47,9 +47,7 @@ typedef struct CUstream_st *cudaStream_t;
void MATH_API SetStream(cudaStream_t stream);
cudaStream_t MATH_API GetStream();

namespace Microsoft {
    namespace MSR {
        namespace CNTK {
namespace Microsoft { namespace MSR { namespace CNTK {

// -----------------------------------------------------------------------
// DeviceBoundNumber -- This class represents a number which resides on a particular device. Use it to avoid unnecessary transfers between CPU and GPU

@ -506,7 +504,7 @@ namespace Microsoft {
}}}

// Error handling
template<typename ERRTYPE> static const char * CudaErrString(ERRTYPE x);
template<typename ERRTYPE> const char * CudaErrString(ERRTYPE x);   // actual error function is defined inside .cu files
template<typename ERRTYPE> static void CudaCall(ERRTYPE retCode, const char * exprString, const char * libName, ERRTYPE successCode)
{
    if (retCode != successCode)

@ -523,7 +521,9 @@ template<typename ERRTYPE> static void CudaCall(ERRTYPE retCode, const char * ex
    }
}

#define CUDA_CALL(expr)     (CudaCall((expr), #expr, "CUDA", cudaSuccess))
#define CUBLAS_CALL(expr)   (CudaCall((expr), #expr, "CUBLAS", CUBLAS_STATUS_SUCCESS))
#define CUSPARSE_CALL(expr) (CudaCall((expr), #expr, "CUSPARSE", CUSPARSE_STATUS_SUCCESS))
#define CURAND_CALL(expr)   (CudaCall((expr), #expr, "CURAND", CURAND_STATUS_SUCCESS))
#define CUDNN_CALL(expr)    (CudaCall((expr), #expr, "cuDNN", CUDNN_STATUS_SUCCESS))

@ -4,15 +4,22 @@
// </copyright>
//

#pragma once

#include "BestGpu.h"

#ifndef CPUONLY

#include <float.h>
#include <cuda_runtime.h>
#pragma push_macro("TENSOR_OPS_DECL")
#define TENSOR_OPS_DECL __device__ __host__
#include "CommonMatrix.h"
#include "GPUMatrix.h"
#include "TensorOps.h"    // for exp_() etc.
#include "device_functions.h"
#include <cuda_runtime.h>
#include <assert.h>
#include <float.h>
#pragma pop_macro("TENSOR_OPS_DECL")

// REVIEW alexeyk: disable warnings properly for GCC/clang
#ifdef _MSC_VER

@ -36,38 +43,116 @@

#define IDX2C(i,j,ld) (((j)*(ld))+(i)) // 0 based indexing

// CUDA atomicAdd() only exists for 'float'. This is the 'double' version.
static __inline__ __device__ double atomicAdd(double* address, double val)
{
    unsigned long long int* address_as_ull = (unsigned long long int*)address;
    unsigned long long int old = *address_as_ull, assumed;
    do {
        assumed = old;
        old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
    } while (assumed != old);
    return __longlong_as_double(old);
}
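// [Editor's note -- not part of the original commit.] The CAS loop above is the standard
// compare-and-swap emulation of a double-precision atomic add: each iteration reads the
// current bit pattern, computes old + val in double, and tries to install the result; if
// another thread updated the address in between, atomicCAS returns the new bits and the
// loop retries. A typical caller looks like:
//     __global__ void accumulate(double* sum, const double* x, int n)
//     { int i = blockIdx.x * blockDim.x + threadIdx.x; if (i < n) atomicAdd(sum, x[i]); }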

// TODO: replace this with TensorOps.h LogAdd(). It differs in using ElemType throughout, while this one seems to use 'double' versions of exp() and log().
// The 'k' in the name is to avoid naming conflicts with various versions of logadd() that are defined throughout the codebase.
template<class ElemType>
static inline __device__ __host__ ElemType logaddk(ElemType x, ElemType y)
{
    ElemType temp, diff, z;

    if (x < y)
    {
        temp = x; x = y; y = temp;
    }
    diff = y - x;
    if (diff < MINLOGEXP)
    {
        return (x < LSMALL) ? LZERO : x;
    }
    else
    {
        z = exp(diff);
        return x + log(1.0 + z);
    }
}
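// [Editor's note -- numeric check, not part of the original commit.] logaddk computes
// log(exp(x) + exp(y)) without overflow by factoring out the larger argument:
//     logaddk(0.0, 0.0)    == 0.0 + log(1 + exp(0)) == log(2) ~= 0.6931
//     logaddk(1000.0, 0.0) == 1000.0   (the negligible term is dropped via the MINLOGEXP cutoff)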

namespace Microsoft { namespace MSR { namespace CNTK {

// ---------------------------------------------------------------------------
// GridDim -- helper to choose the CUDA grid dimensions
// ---------------------------------------------------------------------------

// TODO: move the computation of 'id' here as well
template<class INT, class INT2>
static INT CeilDiv(INT a, INT2 b)   // ceil(a/b)
{
    return (INT)(((size_t)a + (size_t)b - 1) / (size_t)b);  // these size_t casts are necessary since b may be INT_MAX (for maxGridSize[])
}

struct GridDim
{
    static const CUDA_LONG maxThreadsPerBlock = 512;    // use this many threads per block
    static const CUDA_LONG minBlocksPerGrid = 48;       // use at least that many blocks --TODO: base this on actual hardware
    static const CUDA_LONG maxWarpsPerBlock = 16;       // use this many warps per block

    // use these for launching
    //   GridDim grid(NN);
    //   kernel<<<grid.m_blocksPerGrid, grid.m_threadsPerBlock, ...>>>(...)
    int m_blocksPerGrid, m_threadsPerBlock; // (these may in the future be extended to multi-dimensional ones)
    CUDA_LONG m_N;

    GridDim(CUDA_LONG N)    // linear grid
    {
        m_N = N;
        if (N == 0)     // CUDA will fail to launch with 0 blocks
            N = 1;
        m_threadsPerBlock = GridDim::maxThreadsPerBlock;
        m_blocksPerGrid = (N + m_threadsPerBlock - 1) / m_threadsPerBlock;
        if (m_blocksPerGrid < minBlocksPerGrid)

        // get device information
        const auto & props = GetDeviceProps();
        CUDA_LONG numProcs = props.multiProcessorCount;
        CUDA_LONG warpSize = props.warpSize;

        // distribute warps evenly over processors
        CUDA_LONG warpsPerProc = CeilDiv(N, numProcs * warpSize);

        // if too many warps per block then reduce #warps
        if (warpsPerProc > maxWarpsPerBlock)
        {
            // we cannot fill all blocks -> use fewer threads
            m_threadsPerBlock = (N + minBlocksPerGrid - 1) / minBlocksPerGrid;
            // round to multiples of 32 (warp size) for efficient memory access
            m_threadsPerBlock = (m_threadsPerBlock + 31) / 32 * 32;
            m_blocksPerGrid = (N + m_threadsPerBlock - 1) / m_threadsPerBlock;
            CUDA_LONG overBy = CeilDiv(warpsPerProc, maxWarpsPerBlock);     // we are over by this factor
            warpsPerProc = CeilDiv(warpsPerProc, overBy);
        }

        // put it back together
        m_threadsPerBlock = warpsPerProc * warpSize;
        m_blocksPerGrid = CeilDiv(N, m_threadsPerBlock);
        if (m_blocksPerGrid == 1)
            m_threadsPerBlock = N;  // don't launch more than necessary --TODO: Does this make a difference at all?
        assert(m_blocksPerGrid * m_threadsPerBlock >= N);
    }
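    // [Editor's note -- worked example under the new code path above, not part of the
    // original commit.] Assuming N = 10000 on a device with 8 SMs and warpSize 32:
    //     warpsPerProc = CeilDiv(10000, 8 * 32) = 40  > maxWarpsPerBlock (16)
    //     overBy       = CeilDiv(40, 16) = 3,  warpsPerProc = CeilDiv(40, 3) = 14
    //     m_threadsPerBlock = 14 * 32 = 448,  m_blocksPerGrid = CeilDiv(10000, 448) = 23
    // and 23 * 448 = 10304 >= 10000, satisfying the assert.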

    static std::vector<cudaDeviceProp> CacheDeviceProps()
    {
        int numDevices;
        CUDA_CALL(cudaGetDeviceCount(&numDevices));
        std::vector<cudaDeviceProp> props(numDevices);
        for (int i = 0; i < numDevices; i++)
            CUDA_CALL(cudaGetDeviceProperties(&props[i], i));
#if 1   // on Linux, maxGridSize[0] gets reported as 0
        for (int i = 0; i < numDevices; i++)
            fprintf(stderr, "%d procs %d warps %d %d %d max grid on %s\n", (int)props[i].multiProcessorCount, (int)props[i].warpSize, (int)props[i].maxGridSize[0], (int)props[i].maxGridSize[1], (int)props[i].maxGridSize[2], props[i].name);
#endif
        return props;
    }

    // get device properties of current device
    static const cudaDeviceProp & GetDeviceProps()
    {
        static std::vector<cudaDeviceProp> props = CacheDeviceProps();  // thread-safe according to C++ standard
        int deviceId;
        cudaGetDevice(&deviceId);
        return props[deviceId];
    }

    // compute our location on the grid
    static __device__ CUDA_LONG GetLinearThreadId()
    {

@ -83,9 +168,6 @@ struct GridDim
#define UNUSED_FUNCTION_ATTRIBUTE
#endif

// Predefine this for later.
static __inline__ __device__ double atomicAdd(double* address, double val) UNUSED_FUNCTION_ATTRIBUTE;

// ===========================================================================
// CUDA kernels follow, lots of them
// ===========================================================================

@ -97,18 +179,6 @@ static __inline__ __device__ double atomicAdd(double* address, double val) UNUSE
// (ElemType *res, CUDA_LONG N), a pointer and length of the output block. Each thread computes a function
// of the inputs for one value in the output.

// This macro overloads _x() with float and double arguments, and inlines the correct library function. This simplifies templated kernel code.
// TODO: merge with similar definition in TensorOps.h
#define DEF_ELEMENT_PRIMITIVE(x) __device__ __forceinline__ float _##x(float f) { return x##f(f); } __device__ __forceinline__ double _##x(double f) { return x(f); }

DEF_ELEMENT_PRIMITIVE(exp)
DEF_ELEMENT_PRIMITIVE(log)
DEF_ELEMENT_PRIMITIVE(tanh)
DEF_ELEMENT_PRIMITIVE(sqrt)
DEF_ELEMENT_PRIMITIVE(fabs)
DEF_ELEMENT_PRIMITIVE(cos)
DEF_ELEMENT_PRIMITIVE(sin)
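// [Editor's note -- expansion sketch, not part of the original commit; these overloads are
// the ones this diff retires in favor of the TensorOps.h helpers.] For x = exp the macro yields:
//     __device__ __forceinline__ float  _exp(float f)  { return expf(f); }
//     __device__ __forceinline__ double _exp(double f) { return exp(f); }
// so templated kernels could call _exp(a[id]) for either ElemType.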

template<class ElemType>
__global__ void _elementWisePowerOnCuda(
    const ElemType alpha,

@ -147,6 +217,7 @@ __global__ void _elementWisePowerOnCuda(
};

// Note that this code is inefficient on CUDA due to diverging code paths.
// Use Sigmoid() in TensorOps.h instead, which solves this problem.
template<class ElemType>
__global__ void _elementWiseSigmoidOnCuda(
    const ElemType *a,

@ -159,12 +230,12 @@ __global__ void _elementWiseSigmoidOnCuda(
#else
    if (a[id] >= 0)
    {
        ElemType e = _exp(-a[id]);
        ElemType e = exp_(-a[id]);
        res[id] = 1 / (1 + e);
    }
    else
    {
        ElemType e = _exp(a[id]);
        ElemType e = exp_(a[id]);
        res[id] = e / (1 + e);
    }
#endif

@ -186,7 +257,7 @@ __global__ void _assignSigmoidOf(
    res[id] = Microsoft::MSR::CNTK::Sigmoid(a[id]);
#else
    ElemType negElem = -a[id];
    ElemType e = _exp(negElem);
    ElemType e = exp_(negElem);

    res[id] = 1 / (e + 1);
#endif

@ -219,7 +290,7 @@ __global__ void _elementWiseTanhOnCuda(
    const CUDA_LONG N)
{
    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id,N);
    res[id] = _tanh(a[id]);
    res[id] = tanh_(a[id]);
};

//to prevent negative values caused by floating operations, we force inputs to be >=0

@ -231,7 +302,7 @@ __global__ void _elementWiseSqrtOnCuda(
    const CUDA_LONG N)
{
    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id,N);
    res[id] = _sqrt(max((ElemType)0, a[id]));
    res[id] = sqrt_(max((ElemType)0, a[id]));
};

template<class ElemType>

@ -241,7 +312,7 @@ __global__ void _elementWiseExpOnCuda(
    const CUDA_LONG N)
{
    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id,N);
    res[id] = _exp(a[id]);
    res[id] = exp_(a[id]);
};

template<class ElemType>

@ -251,7 +322,7 @@ __global__ void _elementWiseLogOnCuda(
    const CUDA_LONG N)
{
    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id,N);
    res[id] = (a[id] < EPS_IN_LOG) ? LOG_OF_EPS_IN_LOG : _log(a[id]);
    res[id] = (a[id] < EPS_IN_LOG) ? LOG_OF_EPS_IN_LOG : log_(a[id]);
};

template<class ElemType>

@ -261,7 +332,7 @@ __global__ void _elementWiseAbsOnCuda(
    const CUDA_LONG N)
{
    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id,N);
    res[id] = _fabs(a[id]);
    res[id] = fabs_(a[id]);
};

template<class ElemType>

@ -271,7 +342,7 @@ __global__ void _elementWiseCosineOnCuda(
    const CUDA_LONG N)
{
    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id,N);
    res[id] = _cos(a[id]);
    res[id] = cos_(a[id]);
};

template<class ElemType>

@ -281,7 +352,7 @@ __global__ void _elementWiseNegativeSineOnCuda(
    const CUDA_LONG N)
{
    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id,N);
    res[id] = -_sin(a[id]);
    res[id] = -sin_(a[id]);
};

template<class ElemType>

@ -1210,42 +1281,60 @@ __global__ void _tensorShuffleScaleAndAddRowSparse(
    ElemType* cnzValues,    //target nz values
    GPUSPARSE_INDEX_TYPE* cRowIndex,
    GPUSPARSE_INDEX_TYPE* cColCSCIndex,
    size_t D, size_t S, size_t M, size_t K, size_t T)
    size_t D, size_t S, size_t M, size_t K, size_t T,
    size_t nz)
{
    CUDA_LONG col = blockDim.x * blockIdx.x + threadIdx.x;   // input tensor of dimension (D x S x M x K x T)
    if (col >= T)
    CUDA_LONG N = blockDim.x * blockIdx.x + threadIdx.x;     // input tensor of dimension (D x S x M x K x T)
    if (N >= nz || N < aColCSCIndex[0])
        return;

    size_t N = D * S * M * K;
    size_t col;
    for (col = 0; col < T; col++)
    {
        if (aColCSCIndex[col + 1] > N)
            break;
    }

    size_t na = aRowIndex[N];
    int start = aColCSCIndex[col];
    int end = aColCSCIndex[col + 1];
    int current = start;

    for (size_t nc = 0; nc < N; nc++)
    // recover the 5 indices from the loop counter
    size_t d = (na            ) % D;
    size_t s = (na / D        ) % S;
    size_t m = (na / D / S    ) % M;
    size_t k = (na / D / S / M) % K;

    // compute index for the a and b/c tensors
    size_t nc = ((s * M + m) * K + k) * D + d;   // output tensor of dimension (D x K x M x S): k/K and s/S swapped

    int rowIdx = start;
    for (size_t na_i = start; na_i < end; na_i++)
    {
        // recover the 5 indices from the loop counter
        size_t d = (nc            ) % D;
        size_t s = (nc / D        ) % S;
        size_t m = (nc / D / S    ) % M;
        size_t k = (nc / D / S / M) % K;
        size_t d_i = (na_i            ) % D;
        size_t s_i = (na_i / D        ) % S;
        size_t m_i = (na_i / D / S    ) % M;
        size_t k_i = (na_i / D / S / M) % K;

        // compute index for the a and b/c tensors
        size_t na = ((s * M + m) * K + k) * D + d;   // output tensor of dimension (D x K x M x S): k/K and s/S swapped

        for (size_t j = start; j < end; j++)
        size_t nc_i = ((s_i * M + m_i) * K + k_i) * D + d_i;   // output tensor of dimension (D x K x M x S): k/K and s/S swapped
        if (nc_i < nc)
        {
            if (aRowIndex[j] == na)
            {
                cnzValues[current] = anzValues[j];
                cRowIndex[current] = nc;
                current++;
                break;
            }
            rowIdx++;
        }
    }

    cColCSCIndex[col] = start;
    cColCSCIndex[col + 1] = end;
    cnzValues[rowIdx] = anzValues[N];
    cRowIndex[rowIdx] = nc;

    if (N == nz - 1)
    {
        for (int i = 0; i <= T; i++)
        {
            cColCSCIndex[i] = aColCSCIndex[i];
        }
    }
}
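// [Editor's note -- index-permutation check, not part of the original commit.] The kernel
// above maps the flat index of element (d, s, m, k) within a (D x S x M x K x T) column to
// the flat index of (d, k, m, s) in the shuffled (D x K x M x S x T) layout. For example,
// with D=2, S=3, M=1, K=4 and na = 10:
//     d = 10 % 2 = 0,  s = (10/2) % 3 = 2,  m = 0,  k = (10/6) % 4 = 1
//     nc = ((s*M + m)*K + k)*D + d = ((2*1 + 0)*4 + 1)*2 + 0 = 18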

template<class ElemType>

@ -2688,25 +2777,82 @@ __global__ void _sparseCSRElemMulDense(
    }
}

template<class ElemType>
__global__ void _isValid(
    const GPUSPARSE_INDEX_TYPE* rowIndex,
    const GPUSPARSE_INDEX_TYPE* colCSCIndex,
    const int rows,
    const int cols,
    const int nz,
    long* d_res
    )
{
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= cols)
        return;

    int start = colCSCIndex[id];
    int end = colCSCIndex[id + 1];
    d_res[0] = 1;

    if (start > end)
    {
        d_res[0] = -1;
        d_res[1] = start;
        d_res[2] = end;
    }
    else if (end > nz)
    {
        d_res[0] = -2;
        d_res[1] = end;
        d_res[2] = nz;
    }
    else
    {
        for (int j = start; j < end; j++) //j points to the value
        {
            if (rowIndex[j] > rows)
            {
                d_res[0] = -3;
                d_res[1] = rowIndex[j];
                d_res[2] = rows;
                break;
            }
        }
    }
}
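// [Editor's note -- illustrative data, not part of the original commit.] For a valid 3 x 2
// CSC matrix with nz = 3, e.g.
//     colCSCIndex = { 0, 2, 3 }   // monotonically non-decreasing, final entry == nz
//     rowIndex    = { 0, 2, 1 }   // row ids within bounds
// every column passes the start <= end, end <= nz, and row-bound checks above, so d_res[0]
// stays 1; any violation records an error code in d_res[0] and the offending pair in d_res[1..2].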

template<class ElemType>
__global__ void _shiftColCSCIndexFromSliceViewToAbsolute(
    GPUSPARSE_INDEX_TYPE* colCSCIndex,
    const int cols
    )
{
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= cols)
        return;

    colCSCIndex[id] = colCSCIndex[id] - colCSCIndex[0];
}
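// [Editor's note -- not part of the original commit; semantics inferred from the code above.]
// Subtracting colCSCIndex[0] rebases a column-index array so that it starts at zero: a slice
// view whose columns span absolute nz positions { 5, 8, 9 } becomes { 0, 3, 4 }, matching nz
// values that are copied out starting at the slice's offset.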

//c = alpha * op(a) * op(b) + beta*c
// TODO: This function can be further improved by loading the kernel in shared memory
template<class ElemType>
__global__ void _dense1DConvMultSparseCSCAndWeightedAddToDense(
    int m,                  // rowDense
    int k,                  // colDense
    int n,                  // colSparse
    int numChannels,        // input num channels
    int numSteps,           // convolution num steps
    int horizontalSubsample,// convolution step size
    bool channelwise,       // pixelwise for normal multiplication and channelwise for convolution operation
    ElemType alpha,
    const int m,                  // rowDense
    const int k,                  // colDense
    const int n,                  // colSparse
    const int numChannels,        // input num channels
    const int numSteps,           // convolution num steps
    const int horizontalSubsample,// convolution step size
    const bool channelwise,       // pixelwise for normal multiplication and channelwise for convolution operation
    const ElemType alpha,
    const ElemType* a,            //dense
    bool transposeA,
    const bool transposeA,
    const ElemType* bnzValues,    //sparse nz values
    const GPUSPARSE_INDEX_TYPE* rowIndex,
    const GPUSPARSE_INDEX_TYPE* colCSCIndex,
    ElemType beta,
    const ElemType beta,
    ElemType* c                   //dense target
    )
{

@ -2828,15 +2974,15 @@ __global__ void _reshape(

    int currentCol = id;
    int oldColLower = (newNumRows * currentCol) / oldNumRows;
    int oldColUpper = (newNumRows * (currentCol + 1)) / oldNumRows;

    // initialize to the end and then scan in the right direction in the for-loop
    int currentColStart = oldColumnIndex[oldNumCols];

    for (int oldCol = oldColLower; oldCol <= min(oldColUpper, oldNumCols); oldCol++)
    for (int oldCol = oldColLower; oldCol <= oldNumCols; oldCol++)
    {
        int start = oldColumnIndex[oldCol];
        int end = (oldCol < oldNumCols) ? oldColumnIndex[oldCol + 1] : oldColumnIndex[oldNumCols] + 1;
        bool done = false;

        for (int j = start; j < end; j++) //j points to the value
        {

@ -2845,11 +2991,21 @@ __global__ void _reshape(
            int newCol = index / newNumRows;
            int newRow = index % newNumRows;

            newRowIndex[j] = newRow;
            if (newCol == currentCol)
                newRowIndex[j] = newRow;

            if (newCol >= currentCol && currentColStart > j)
                currentColStart = j;

            if (newCol > currentCol)
            {
                done = true;
                break;
            }
        }

        if (done)
            break;
    }

    newColumnIndex[currentCol] = currentColStart;

@ -3423,7 +3579,7 @@ __global__ void _assignNoiseContrastiveEstimation(
    if (positive)
        prob = -prob;
    ElemType score_noise = log_num_noise_samples + prob;
    ElemType z = logadd(tmp[i], score_noise);
    ElemType z = logaddk(tmp[i], score_noise);
    ElemType logprob = tmp[i] - z;
    ElemType logprob_noise = score_noise - z;
    tmp[i] = -exp(logprob);

@ -3715,40 +3871,6 @@ __global__ void _normalGradForSparseBlock(
    lhsValues[index] = rhs[IDX2C(row, col, numRows)];
}

static __inline__ __device__ double atomicAdd(double* address, double val)
{
    unsigned long long int* address_as_ull = (unsigned long long int*)address;
    unsigned long long int old = *address_as_ull, assumed;

    do {
        assumed = old;
        old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
    } while (assumed != old);

    return __longlong_as_double(old);
}

template<class ElemType>
static __inline__ __device__ ElemType logadd(ElemType x, ElemType y)
{
    ElemType temp, diff, z;

    if (x < y)
    {
        temp = x; x = y; y = temp;
    }
    diff = y - x;
    if (diff < MINLOGEXP)
    {
        return (x < LSMALL) ? LZERO : x;
    }
    else
    {
        z = exp(diff);
        return x + log(1.0 + z);
    }
}

//This function should be called with 1024 threads per block and 1 block
//THIS IS NOT THE MOST EFFICIENT IMPLEMENTATION!!!
template<class ElemType>

@ -4513,7 +4635,7 @@ __global__ void _rcrfBackwardCompute(
    fSum = LZERO;
    for (int j = 0; j < iNumLab; j++)
    {
        fSum = logadd(fSum, alpha[IDX2C(j, t, iNumLab)]);
        fSum = logaddk(fSum, alpha[IDX2C(j, t, iNumLab)]);
    }

    fTmp = alpha[IDX2C(id, t, iNumLab)] - fSum;

@ -4525,10 +4647,10 @@ __global__ void _rcrfBackwardCompute(
    fSum = LZERO;
    for (int m = 0; m < iNumLab; m++)
    {
        fSum = logadd(fSum, alpha[IDX2C(m, t, iNumLab)] + pair_scores[IDX2C(j, m, iNumLab)]);
        fSum = logaddk(fSum, alpha[IDX2C(m, t, iNumLab)] + pair_scores[IDX2C(j, m, iNumLab)]);
    }

    fTmp = logadd(fTmp, beta[IDX2C(j, t + 1, iNumLab)] + alpha[IDX2C(id, t, iNumLab)] + pair_scores[IDX2C(j, id, iNumLab)] - fSum);
    fTmp = logaddk(fTmp, beta[IDX2C(j, t + 1, iNumLab)] + alpha[IDX2C(id, t, iNumLab)] + pair_scores[IDX2C(j, id, iNumLab)] - fSum);
}
}

@ -4589,7 +4711,7 @@ __global__ void _rcrfBackwardCompute(
{
    for (int j = 0; j < iNumLab; j++)
    {
        fTmp = logadd(fTmp, beta_t1[j] + alpha[id] + pair_scores[j] - zeta[j]);
        fTmp = logaddk(fTmp, beta_t1[j] + alpha[id] + pair_scores[j] - zeta[j]);
    }
}

@ -4630,9 +4752,9 @@ __global__ void _rcrfBackwardComputeZeta(
    for (int m = 0; m < iNumLab; m++)
    {
        if (t == iNumPos - 1)
            fSum = logadd(fSum, alpha[IDX2C(m, 0, iNumLab)]);
            fSum = logaddk(fSum, alpha[IDX2C(m, 0, iNumLab)]);
        else
            fSum = logadd(fSum, alpha[IDX2C(m, 0, iNumLab)] + pair_scores[m]);
            fSum = logaddk(fSum, alpha[IDX2C(m, 0, iNumLab)] + pair_scores[m]);
    }

    gzeta[id] = fSum;

@ -4684,7 +4806,7 @@ __global__ void _rcrfTransGrdComputeZeta(
    else
        fTmp = alpha[m];

    fSum = logadd(fSum, pair_scores[m] + fTmp);
    fSum = logaddk(fSum, pair_scores[m] + fTmp);
}

    gzeta[id] = fSum;

@ -4787,7 +4909,7 @@ __global__ void _reductionLogAddSum(
{
    ElemType lSum = LZERO;
    if (tid < s){
        lSum = logadd(partialLogAddSum[tid], partialLogAddSum[tid + s]);
        lSum = logaddk(partialLogAddSum[tid], partialLogAddSum[tid + s]);
        partialLogAddSum[tid] = lSum;
    }
}

@ -4912,4 +5034,6 @@ __global__ void _maskColumnsValue(ElemType *a, const char *columnsMask, CUDA_LON
}
}

}}}

#endif // !CPUONLY

@ -34,11 +34,7 @@ static
#endif
cudaStream_t t_stream;

// support for CudaCall() function template
static const char * CudaErrString(cudaError_t x)    { cudaDeviceSynchronize(); return cudaGetErrorString(x); }
static const char * CudaErrString(cublasStatus_t)   { cudaDeviceSynchronize(); return "(see cublas_api.h & look for cublasStatus_t or CUBLAS_STATUS_xxx)"; }
static const char * CudaErrString(cusparseStatus_t) { cudaDeviceSynchronize(); return "(see cusparse.h & look for cusparseStatus_t or CUSPARSE_STATUS_xxx)"; }
template<> const char * CudaErrString<cusparseStatus_t>(cusparseStatus_t) { cudaDeviceSynchronize(); return "(see cusparse.h & look for cusparseStatus_t or CUSPARSE_STATUS_xxx)"; }

namespace Microsoft { namespace MSR { namespace CNTK {

@ -137,14 +133,29 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    ChangeDeviceTo(deepCopy.m_computeDevice);
    deepCopy.PrepareDevice();

    Resize(deepCopy.m_numRows, deepCopy.m_numCols, deepCopy.m_elemSizeAllocated, deepCopy.m_format, true, false);
    Resize(deepCopy.m_numRows, deepCopy.m_numCols, deepCopy.GetNumNZElements(), deepCopy.m_format, true, false);
    m_nz = deepCopy.m_nz;
    m_sliceViewOffset = 0; // reset to zero as we only start copying starting from the offset in the source matrix
    m_sliceViewOffset = 0; // reset to zero as we only start copying the indices starting from the offset in the source matrix

    CUDA_CALL(cudaMemcpy(BufferPointer(), deepCopy.BufferPointer(), GetSizeElemAllocated(), cudaMemcpyDeviceToDevice));
    CUDA_CALL(cudaMemcpy(MajorIndexLocation(), deepCopy.MajorIndexLocation(), MajorIndexSize(), cudaMemcpyDeviceToDevice));
    CUDA_CALL(cudaMemcpy(BufferPointer(), deepCopy.NzValues(), NzSize(), cudaMemcpyDeviceToDevice));
    CUDA_CALL(cudaMemcpy(MajorIndexLocation(), deepCopy.MajorIndexLocationWithSliceViewOffset(), MajorIndexSize(), cudaMemcpyDeviceToDevice));
    CUDA_CALL(cudaMemcpy(SecondaryIndexLocation(), deepCopy.SecondaryIndexLocation(), SecondaryIndexSize(), cudaMemcpyDeviceToDevice));

    if (deepCopy.m_sliceViewOffset > 0)
    {
        int blocksPerGrid = (int)ceil(1.0*SecondaryIndexCount() / GridDim::maxThreadsPerBlock);
        cudaEvent_t done = nullptr;
        if (do_sync) CUDA_CALL(cudaEventCreate(&done));
        _shiftColCSCIndexFromSliceViewToAbsolute<ElemType><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(
            SecondaryIndexLocation(),
            SecondaryIndexCount()
            );

        if (do_sync) CUDA_CALL(cudaEventRecord(done));
        if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
        if (do_sync) CUDA_CALL(cudaEventDestroy(done));
    }

    m_externalBuffer = false;
    SetMatrixName(deepCopy.m_matrixName);

@ -1002,7 +1013,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {

    template<class ElemType>
    void GPUSparseMatrix<ElemType>::ConvolveAndWeightedAdd(ElemType alpha, const GPUMatrix<ElemType>& lhs, const bool transposeA,
        const GPUSparseMatrix<ElemType>& rhs, const bool transposeB, ElemType beta, GPUMatrix<ElemType>& c, int numChannels, size_t horizontalSubsample, bool padding, bool channelwise)
        const GPUSparseMatrix<ElemType>& rhs, const bool transposeB, ElemType beta, GPUMatrix<ElemType>& c, size_t numChannels, size_t horizontalSubsample, bool padding, bool channelwise)
    {
        if (lhs.GetComputeDeviceId() != rhs.GetComputeDeviceId() || (lhs.GetComputeDeviceId() != c.GetComputeDeviceId()))
            RuntimeError("GPUSparseMatrix<ElemType>::ConvolveAndWeightedAdd: All matrices must be on the same GPU");

@ -1133,7 +1144,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        c.PrepareDevice();
        cudaEvent_t done = nullptr;
        if (do_sync) CUDA_CALL(cudaEventCreate(&done));
        CUDA_LONG N = (CUDA_LONG)c.GetNumCols();
        CUDA_LONG N = (CUDA_LONG)c.GetNumNZElements();
        int blocksPerGrid = (int)ceil(1.0*N / GridDim::maxThreadsPerBlock);
        _tensorShuffleScaleAndAddRowSparse<ElemType><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(
            reinterpret_cast<const ElemType*>(a.BufferPointer()),  // source nz values

@ -1142,7 +1153,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            reinterpret_cast<ElemType*>(c.BufferPointer()),  // target nz values
            c.RowLocation(),
            c.ColLocation(),
            D, S, M, K, T);
            D, S, M, K, T,
            c.GetNumNZElements());
        if (do_sync) CUDA_CALL(cudaEventRecord(done));
        if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
        if (do_sync) CUDA_CALL(cudaEventDestroy(done));

@ -1936,6 +1948,37 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        return GPUSparseMatrix<ElemType>::InnerProductOfMatrices(b,a);
    }

    template<class ElemType>
    bool GPUSparseMatrix<ElemType>::IsValid() const
    {
        if (m_format != MatrixFormat::matrixFormatSparseCSC)
            NOT_IMPLEMENTED;

        PrepareDevice();
        long *res = new long[3];
        res[0] = 1;
        res[1] = 0;
        res[2] = 0;
        long *d_res = nullptr;
        CUDA_CALL(cudaMalloc((void**)&d_res, sizeof(long) * 3));
        CUDA_CALL(cudaMemcpy(d_res, res, sizeof(long) * 3, cudaMemcpyHostToDevice));

        cudaEvent_t done = nullptr;
        if (do_sync) CUDA_CALL(cudaEventCreate(&done));
        int blocksPerGrid = (int)ceil((1.0*SecondaryIndexSize()) / GridDim::maxThreadsPerBlock);
        _isValid<ElemType><<<blocksPerGrid, GridDim::maxThreadsPerBlock>>>(MajorIndexLocation(), SecondaryIndexLocation(), GetNumRows(), GetNumCols(), GetNumElemAllocated(), d_res);
        if (do_sync) CUDA_CALL(cudaEventRecord(done));
        if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
        if (do_sync) CUDA_CALL(cudaEventDestroy(done));

        CUDA_CALL(cudaMemcpy(res, d_res, sizeof(long) * 3, cudaMemcpyDeviceToHost));

        if (res[0] == 1)
            return true;
        else
            return false;
    }
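    // [Editor's note -- usage sketch, not part of the original commit.] IsValid() is meant as
    // a debug-time structural check after operations that rewrite the CSC arrays, e.g.:
    //     GPUSparseMatrix<float> c(/*...device, CSC format...*/);
    //     // ... kernel that fills c's values and row/column indices ...
    //     assert(c.IsValid());
    // The error triple comes back through d_res: d_res[0] is 1 on success or a negative code,
    // with the offending values in d_res[1] and d_res[2].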

    template<class ElemType>
    bool GPUSparseMatrix<ElemType>::AreEqual(const GPUSparseMatrix<ElemType>& a, const GPUSparseMatrix<ElemType>& b,
        const ElemType threshold)

@ -73,18 +73,23 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    // Special Note: the matrix may be a read-only column slice view of another
    // matrix (only supported for CSC format today) and hence the NzValues needs
    // to be offset accordingly.
    inline const ElemType* NzValues() const { return m_format != matrixFormatSparseCSC ? m_pArray : m_pArray + SecondaryIndexValueAt(m_sliceViewOffset); }
    inline ElemType* NzValues() { return m_format != matrixFormatSparseCSC ? m_pArray : m_pArray + SecondaryIndexValueAt(m_sliceViewOffset); }
    inline const ElemType* NzValues() const { return m_format != matrixFormatSparseCSC ? m_pArray : m_pArray + SecondaryIndexValueAt(0); }
    inline ElemType* NzValues() { return m_format != matrixFormatSparseCSC ? m_pArray : m_pArray + SecondaryIndexValueAt(0); }
    inline size_t NzSize() const { return sizeof(ElemType)*m_nz; } // actual number of element bytes in use

    GPUSPARSE_INDEX_TYPE* MajorIndexLocation() const //row/col ids in CSC/CSR format, blockId2col/blockId2row in BlockCol/BlockRow format
    {
        return (GPUSPARSE_INDEX_TYPE*)(m_pArray + m_elemSizeAllocated);
    }
    }

    GPUSPARSE_INDEX_TYPE* MajorIndexLocationWithSliceViewOffset() const
    {
        return (MajorIndexLocation() + (m_format == matrixFormatSparseCSC ? SecondaryIndexValueAt(0) : 0));
    }

    size_t MajorIndexCount() const
    {
        return MajorIndexCount(m_numRows, m_numCols, m_nz, m_format);
        return MajorIndexCount(m_numRows, m_numCols, m_elemSizeAllocated, m_format);
    }
    size_t MajorIndexCount(const size_t numRows, const size_t numCols, const size_t numNZ, const MatrixFormat format) const
    {

@ -98,7 +103,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    size_t MajorIndexSize() const // actual number of major index bytes in use
    {
        return sizeof(GPUSPARSE_INDEX_TYPE)*MajorIndexCount();
    }
    }

    GPUSPARSE_INDEX_TYPE* SecondaryIndexLocation() const //compressed index, col/row in CSC/CSR format, col2blockId/row2blockId in BlockCol/BlockRow format
    {

@ -239,6 +244,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    void ConvertToSparseFormat(MatrixFormat newFormat);
    void ConvertToSparseFormat(MatrixFormat newFormat, GPUSparseMatrix<ElemType>& outMatrix) const;

    bool IsValid() const;

public:
    GPUSparseMatrix<ElemType>& ElementInverse ();
    GPUSparseMatrix<ElemType>& AssignElementInverseOf (const GPUSparseMatrix<ElemType>& a);

@ -290,7 +297,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        const bool transposeB, GPUSparseMatrix<ElemType>& c);
    static void ScaleAndAdd(const ElemType alpha, const GPUSparseMatrix<ElemType>& lhs, GPUMatrix<ElemType>& c);
    static void ConvolveAndWeightedAdd(ElemType alpha, const GPUMatrix<ElemType>& lhs, const bool transposeA, const GPUSparseMatrix<ElemType>& rhs,
        const bool transposeB, ElemType beta, GPUMatrix<ElemType>& c, int numChannels, size_t horizontalSubsample, bool padding, bool channelwise);
        const bool transposeB, ElemType beta, GPUMatrix<ElemType>& c, size_t numChannels, size_t horizontalSubsample, bool padding, bool channelwise);
    static void TensorShuffleScaleAndAdd(ElemType keepWeight, const GPUSparseMatrix<ElemType>& a, size_t D, size_t S, size_t M, size_t K, size_t T, ElemType scaleFactor, const GPUSparseMatrix<ElemType>& b, GPUSparseMatrix<ElemType>& c);

    void NormalGrad(GPUMatrix<ElemType>& c, const ElemType momentum);

@ -0,0 +1,693 @@
//
// <copyright file="GPUMatrix.cu" company="Microsoft">
//     Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//

#include "stdafx.h"
#include "Basics.h"
#include "BestGpu.h"

#ifndef CPUONLY

#include "GPUTensor.h"
#include "GPUMatrix.h"
#include "GPUMatrixCUDAKernels.cuh"
#include "CommonMatrix.h"
#define TENSOR_OPS_DECL __device__ __host__
#include "TensorOps.h"
#include <cuda.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"
#include <assert.h>

#ifndef let
#define let const auto
#endif

#pragma comment (lib, "cudart.lib")     // instruct linker to reference these libs
#pragma comment (lib, "cublas.lib")

#pragma warning (disable: 4267) // conversion from 'size_t' to 'unsigned int'; happens in CUDA <<<a,b>>> syntax if a and b are size_t
#pragma warning (disable: 4127) // conditional expression is constant; "if (sizeof(ElemType)==sizeof(float))" triggers this
#pragma warning (disable: 4702) // unreachable code; triggered for unknown reasons

extern bool do_sync;

#ifdef _WIN32
// thread local storage to access the current stream, initialize to default stream
__declspec (thread)
#endif
extern cudaStream_t t_stream;

namespace Microsoft { namespace MSR { namespace CNTK {

// =======================================================================
// TensorView support
// =======================================================================

// To save time, this makes extensive use of templates and macros.

// -----------------------------------------------------------------------
// simple fixed-size arrays for passing dimension information by value
// since CUDA can't just take our std::array and std::vector
// -----------------------------------------------------------------------

template<typename T, size_t N>
struct FixedArray
{
    T m_data[N];
    __device__ __host__ size_t size() const { return N; }
    __device__ __host__ T & operator[](size_t n)       { return m_data[n]; }
    __device__ __host__ T   operator[](size_t n) const { return m_data[n]; }
    template<class VEC> FixedArray(const VEC & data)    // construct from CPU-side STL array or vector
    {
        assert(data.size() == N);
        for (size_t n = 0; n < N; n++)
        {
            m_data[n] = (T)data[n];
            if (m_data[n] != data[n])   // overflow check
                InvalidArgument("FixedArray: Dimensions out of range, too few bits.");
        }
    }
};
template<typename T>    // specialized version for 0 elements
struct FixedArray<T, 0>
{
    __device__ __host__ size_t size() const { return 0; }
    template<class VEC> FixedArray(const VEC & data) { assert(data.size() == 0); UNUSED(data); }
    FixedArray() { }
};

template<typename T, size_t N, size_t K>    // N = which input/output; K = index depth
struct FixedMatrix
{
    T m_data[N][K];
    __device__ __host__ size_t getNumRows() const { return N; }
    __device__ __host__ size_t getNumCols() const { return K; }
    __device__ __host__ T & operator()(size_t n, size_t k)       { return m_data[n][k]; }
    __device__ __host__ T   operator()(size_t n, size_t k) const { return m_data[n][k]; }
    template<typename U> FixedMatrix(const array<SmallVector<U>, N> & data)    // construct from CPU-side array of vectors
    {
        assert(data.size() == N);
        for (size_t n = 0; n < N; n++)
        {
            assert(data[n].size() == K);
            for (size_t k = 0; k < K; k++)
            {
                m_data[n][k] = (T)data[n][k];
                if (m_data[n][k] != data[n][k])   // overflow check
                    InvalidArgument("FixedArray: Dimensions out of range, too few bits.");
            }
        }
    }
};
template<typename T, size_t N>    // specialized version for 0 elements
struct FixedMatrix<T, N, 0>
{
    __device__ __host__ size_t getNumRows() const { return N; }
    __device__ __host__ size_t getNumCols() const { return 0; }
    template<typename U> FixedMatrix(const array<SmallVector<U>, N> & data) { assert(data.size() == N); for (size_t n = 0; n < N; n++) assert(data[n].size() == 0); UNUSED(data); }
    FixedMatrix() { }
};
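// [Editor's note -- usage sketch, not part of the original commit.] These types exist so that
// shape/stride data can be passed to kernels by value in the kernel-argument buffer, e.g.:
//     std::array<size_t, 2> dims = { 13, 42 };
//     FixedArray<C_unsigned_int, 2> fdims(dims);   // copies and range-checks on the host
//     someKernel<<<grid, block>>>(fdims);          // no separate cudaMemcpy of a descriptor
// The 0-element specializations are needed because a zero-length array member would be
// ill-formed C++.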

// -----------------------------------------------------------------------
// function to actually compute a function of (N-1) inputs based on the opcode
// -----------------------------------------------------------------------

template<class ElemType>
struct TensorOps
{
    static __device__ ElemType Compute(const FixedArray<ElemType*, 1> & pointers, ElementWiseOperator op)
    {
#define CaseNullaryTensorOp(oper) case ElementWiseOperator::op ## oper: return Op ## oper<ElemType>()
        switch (op)
        {
        ForAllNullaryOps(CaseNullaryTensorOp);
        default: return OpConstOne<ElemType>();   // (failure--we only have one nullary op, so use the same, maybe it will eliminate the switch altogether)
        }
    }
    static __device__ ElemType Compute(const FixedArray<ElemType*, 2> & pointers, ElementWiseOperator op)
    {
        ElemType a = *(pointers[0]);
#define CaseUnaryTensorOp(oper) case ElementWiseOperator::op ## oper: return Op ## oper(a)
        switch (op)
        {
        ForAllUnaryOps(CaseUnaryTensorOp);
        default: return 0;  // (failure)
        }
    }
    static __device__ ElemType Compute(const FixedArray<ElemType*, 3> & pointers, ElementWiseOperator op)
    {
        //const ElemType & a = *(pointers[0]);    // const & for opIndex--costs quite some code bloat
        ElemType a = *(pointers[0]);
        ElemType b = *(pointers[1]);
#define CaseBinaryTensorOp(oper) case ElementWiseOperator::op ## oper: return Op ## oper(a,b)
        switch (op)
        {
        ForAllBinaryOps(CaseBinaryTensorOp);   // note: this costs about 6% compared to having only a single case
        default: return 0;  // (failure)
        }
    }
    static __device__ ElemType Compute(const FixedArray<ElemType*, 4> & pointers, ElementWiseOperator op)
    {
        ElemType a = *(pointers[0]);
        ElemType b = *(pointers[1]);
        ElemType c = *(pointers[2]);
#define CaseTernaryTensorOp(oper) case ElementWiseOperator::op ## oper: return Op ## oper(a,b,c)
        switch (op)
        {
        ForAllTernaryOps(CaseTernaryTensorOp);
        default: return 0;  // (failure)
        }
    }
};

// -----------------------------------------------------------------------
// function to compute the value for a given output location (this version performs reduction if needed)
// -----------------------------------------------------------------------

//#define ReduceElemType double
#define ReduceElemType ElemType

template<class ElemType, C_size_t N, C_int M, C_int m>
struct TensorOpReduce
{
    // this version for m >= 0
    static __device__ ElemType Compute(FixedArray<ElemType*, N> pointers, ElementWiseOperator op,
                                       const FixedArray<C_unsigned_int, M> & reducingOpDims, const FixedMatrix<C_int, N, M> & reducingStrides)
    {
        // start with index 0
        // We may use 'double' since we are memory-bound anyway.
        ReduceElemType aggregate = TensorOpReduce<ElemType, N, M, m - 1>::Compute(pointers, op, reducingOpDims, reducingStrides);
        // apply this index to the pointers
        C_size_t dim = reducingOpDims[m];
        for (C_size_t k = 1/*done with k=0 already*/; k < dim; k++)
        {
            // bump the pointers
            for (C_size_t i = 0; i < N - 1; i++)    // N-1 because output is not used here
                pointers[i] += reducingStrides(i,(C_size_t)m);
            ElemType val = TensorOpReduce<ElemType, N, M, m - 1>::Compute(pointers, op, reducingOpDims, reducingStrides);
            aggregate += val;
        }
        return (ElemType)aggregate;
    }
};

// this one terminates the template recursion over reduction dimensions
// The pointers are pointing to the input element.
template<class ElemType, C_size_t N, C_int M>
struct TensorOpReduce<ElemType, N, M, /*m=*/-1>
{
    // this version for m = -1
    // the pointers are pointing to the right location(s) to take the operation over
    static __device__ ElemType Compute(FixedArray<ElemType*, N> pointers, ElementWiseOperator op,
                                       const FixedArray<C_unsigned_int, M> & /*reducingOpDims*/, const FixedMatrix<C_int, N, M> & /*reducingStrides*/)
    {
        return TensorOps<ElemType>::Compute(pointers, op);  // finally computing something!
    }
};
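// [Editor's note -- expansion sketch, not part of the original commit.] For M = 2 the
// recursion above unrolls at compile time into two nested loops; conceptually:
//     ReduceElemType aggregate = <element at reduction index (0, 0)>;
//     for (k1 = 0; k1 < reducingOpDims[1]; k1++)
//         for (k0 = 0; k0 < reducingOpDims[0]; k0++)
//             aggregate += op(inputs at pointers bumped by k0/k1 times reducingStrides);
// with the m = -1 specialization supplying the innermost per-element computation.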
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// function to compute one constituent of the value for a given output location (this version has reduction done outside)
|
||||
// -----------------------------------------------------------------------
|
||||
|
||||
template<class ElemType, C_size_t N, C_int M, C_int m>
|
||||
struct TensorOpParallelReduce
|
||||
{
|
||||
// this version for m >= 0
|
||||
static __device__ ElemType Compute(CUDA_LONG id, FixedArray<ElemType*, N> pointers, ElementWiseOperator op,
|
||||
const FixedArray<C_unsigned_int, M> & reducingOpDims, const FixedMatrix<C_int, N, M> & reducingStrides)
|
||||
{
|
||||
// map id (location on grid) to index[k]
|
||||
C_size_t stride = 1; // compute the stride. This seems expensive, but since we we only currently support M <= 2, this is just compile-time selection between 1 and reducingOpDims[0].
|
||||
for (int i = 0; i < m; i++)
|
||||
stride *= reducingOpDims[(C_size_t)i];
|
||||
C_size_t index = id / stride; // this dimension. For m=0, the stride is 1 and hence the division will be removed at compile time.
|
||||
id = id % stride; // remaining dimensions inside this. For m=0 this value is ignored and hence not even computed.
|
||||
// apply this index to the pointers
|
||||
for (C_size_t i = 0; i < N - 1; i++)
|
||||
pointers[i] += index * reducingStrides(i, (C_size_t)m); // now this dimension is taken care of
|
||||
return TensorOpParallelReduce<ElemType, N, M, m - 1>::Compute(id, pointers, op, reducingOpDims, reducingStrides);
|
||||
}
|
||||
};
|
||||
|
||||
// this one terminates the template recursion over reduction dimensions
|
||||
// The pointers are pointing to the input element.
|
||||
template<class ElemType, C_size_t N, C_int M>
|
||||
struct TensorOpParallelReduce<ElemType, N, M, /*m=*/-1>
|
||||
{
|
||||
// this version for m = -1
|
||||
// the pointers are pointing to the right location(s) to take the operation over
|
||||
static __device__ ElemType Compute(CUDA_LONG /*id*/, FixedArray<ElemType*, N> pointers, ElementWiseOperator op,
|
||||
const FixedArray<C_unsigned_int, M> & /*reducingOpDims*/, const FixedMatrix<C_int, N, M> & /*reducingStrides*/)
|
||||
{
|
||||
return TensorOps<ElemType>::Compute(pointers, op); // finally computing something!
|
||||
}
|
||||
};
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// perform loop over regular index k for N-nary operations (N counting the output)
|
||||
// -----------------------------------------------------------------------
|
||||
|
||||
// The canonical case, vector op without reduction, is this PTX function:
|
||||
// _ZN9Microsoft3MSR4CNTK15_launchTensorOpIfLi3ELi0ELi1EEEvT_NS1_10FixedArrayIPS3_XT0_EEES3_NS1_19ElementWiseOperatorENS4_IiXT2_EEENS1_11FixedMatrixIiXT0_EXT2_EEENS4_IiXT1_EEENS9_IiXT0_EXT1_EEEi
|
||||
// float ^ ^ aggregate loop
|
||||
// args? ^ ^ input dims
|
||||
// _ZN9Microsoft3MSR4CNTK15_launchTensorOpIfLi2ELi0ELi1EEEvT_NS1_10FixedArrayIPS3_XT0_EEES3_NS1_19ElementWiseOperatorENS4_IiXT2_EEENS1_11FixedMatrixIiXT0_EXT2_EEENS4_IiXT1_EEENS9_IiXT0_EXT1_EEEi
|
||||
|
||||
// The 'pointers' only refer to a single element, so we will bump them in-place to perform indexing.
|
||||
template<class ElemType, C_size_t N, C_int M, C_int K, bool parallelReduce, C_int k>
|
||||
struct TensorOpElement
|
||||
{
|
||||
// template-recursive version loops over indices
|
||||
static __device__ void Compute(CUDA_LONG id, ElemType beta, FixedArray<ElemType*, N> & pointers, ElemType alpha, ElementWiseOperator op,
|
||||
const FixedArray<C_unsigned_int, K> & regularOpStrides, const FixedMatrix<C_int, N, K> & regularStrides,
|
||||
const FixedArray<C_unsigned_int, M> & reducingOpDims, const FixedMatrix<C_int, N, M> & reducingStrides,
|
||||
CUDA_LONG reductionBegin, CUDA_LONG reductionChunkSize)
|
||||
{
|
||||
// map id (location on grid) to index[k]
|
||||
C_size_t stride = regularOpStrides[(C_size_t)k];
|
||||
C_size_t index = id / stride; // this dimension
|
||||
id = id % stride; // remaining dimensions inside this
|
||||
// apply this index to the pointers
|
||||
for (C_size_t i = 0; i < N; i++)
|
||||
pointers[i] += index * regularStrides(i,(C_size_t)k); // now this dimension is taken care of
|
||||
// process the previous index
|
||||
TensorOpElement<ElemType, N, M, K, parallelReduce, k - 1>::Compute(id, beta, pointers, alpha, op, regularOpStrides, regularStrides, reducingOpDims, reducingStrides, reductionBegin, reductionChunkSize);
|
||||
}
|
||||
};
|
||||
|
||||
// specialization for k=0 where op stride is guaranteed to be 1
|
||||
template<class ElemType, C_size_t N, C_int M, C_int K, bool parallelReduce>
|
||||
struct TensorOpElement<ElemType, N, M, K, parallelReduce, /*k=*/0>
|
||||
{
|
||||
// template-recursive version loops over indices
|
||||
static __device__ void Compute(CUDA_LONG id, ElemType beta, FixedArray<ElemType*, N> & pointers, ElemType alpha, ElementWiseOperator op,
|
||||
const FixedArray<C_unsigned_int, K> & regularOpStrides, const FixedMatrix<C_int, N, K> & regularStrides,
|
||||
const FixedArray<C_unsigned_int, M> & reducingOpDims, const FixedMatrix<C_int, N, M> & reducingStrides,
|
||||
CUDA_LONG reductionBegin, CUDA_LONG reductionChunkSize)
|
||||
{
|
||||
// map id (location on grid) to index[k]
|
||||
C_size_t index = id; // this dimension
|
||||
// apply this index to the pointers
|
||||
for (C_size_t i = 0; i < N; i++)
|
||||
pointers[i] += index * regularStrides(i,0); // now this dimension is taken care of
|
||||
// process the previous index
|
||||
TensorOpElement<ElemType, N, M, K, parallelReduce, -1>::Compute(/*id*/0, beta, pointers, alpha, op, regularOpStrides, regularStrides, reducingOpDims, reducingStrides, reductionBegin, reductionChunkSize);
|
||||
}
|
||||
};

//// apply beta and alpha and save
//template<class ElemType, class PointersType>
//static __device__ void SetFinalValue(ElemType val, ElemType beta, const PointersType & pointers, ElemType alpha)
//{
//    // scale
//    val *= alpha;
//    // combine with previous value in target matrix, then write it out
//    auto * pout = pointers[pointers.size() - 1];
//    if (beta != 0)
//        val += beta * *pout;
//    // save
//    *pout = val;
//}

// specialization for k = -1 terminates the template recursion, and computes reductions in a for loop
template<class ElemType, C_size_t N, C_int M, C_int K>
struct TensorOpElement<ElemType, N, M, K, /*parallelReduce=*/false, /*k=*/-1>
{
    // template-recursion-terminating version computes the actual value for this output location
    // now the output pointers point to the right element (input pointers may still iterate for reduction)
    static __device__ void Compute(CUDA_LONG /*id*/, ElemType beta, FixedArray<ElemType*, N> & pointers, ElemType alpha, ElementWiseOperator op,
                                   const FixedArray<C_unsigned_int, K> & /*regularOpStrides*/, const FixedMatrix<C_int, N, K> & /*regularStrides*/,
                                   const FixedArray<C_unsigned_int, M> & reducingOpDims, const FixedMatrix<C_int, N, M> & reducingStrides, CUDA_LONG /*reductionBegin*/, CUDA_LONG /*reductionChunkSize*/)
    {
        // compute the operation for this output coordinate
        // This may still involve a reduction over inverse-broadcasting dimensions.
        ElemType val = TensorOpReduce<ElemType, N, M, M - 1>::Compute(pointers, op, reducingOpDims, reducingStrides);
        // scale
        val *= alpha;
        // combine with previous value in target matrix, then write it out
        auto * pout = pointers[pointers.size() - 1];
        if (beta != 0)
            val += beta * *pout;
        // save
        *pout = val;
    }
};

// specialization for k = -1 terminates the template recursion, and computes reductions in parallel
template<class ElemType, C_size_t N, C_int M, C_int K>
struct TensorOpElement<ElemType, N, M, K, /*parallelReduce=*/true, /*k=*/-1>
{
    // template-recursion-terminating version computes the actual value for this output location
    // now the output pointers point to the right element (input pointers may still iterate for reduction)
    static __device__ void Compute(CUDA_LONG /*id*/, ElemType beta, FixedArray<ElemType*, N> & pointers, ElemType alpha, ElementWiseOperator op,
                                   const FixedArray<C_unsigned_int, K> & /*regularOpStrides*/, const FixedMatrix<C_int, N, K> & /*regularStrides*/,
                                   const FixedArray<C_unsigned_int, M> & reducingOpDims, const FixedMatrix<C_int, N, M> & reducingStrides, CUDA_LONG reductionBegin, CUDA_LONG reductionChunkSize)
    {
        CUDA_LONG reductionBlock = blockIdx.z;  // block index --larger reductions are split into blocks
        CUDA_LONG reductionBlocks = gridDim.z;  // number of blocks
        CUDA_LONG tid = threadIdx.x;            // thread index
        CUDA_LONG tids = blockDim.x;            // out of how many threads --note: last block is partial

        // determine our range --this is a single int mul, we can stomach it (we could alternatively pass in yet another parameter)
        CUDA_LONG reductionDim = (CUDA_LONG)reducingOpDims[0];
        for (C_size_t i = 1; i < reducingOpDims.size(); i++)
            reductionDim *= reducingOpDims[i];

        // determine the redId range that we operate on
        // Each thread takes a stride tid + (multiples of tids) within this range.
        reductionBegin += reductionChunkSize * reductionBlock;
        CUDA_LONG reductionEnd = min(reductionBegin + reductionChunkSize, reductionDim);

        // compute the operation for this input coordinate
        ReduceElemType sum = 0;
        for (CUDA_LONG redId = reductionBegin + tid; redId < reductionEnd; redId += tids)
        {
            auto val = TensorOpParallelReduce<ElemType, N, M, M - 1>::Compute(redId, pointers, op, reducingOpDims, reducingStrides);
            sum += val;
        }

        // reduce --cf https://docs.nvidia.com/cuda/samples/6_Advanced/reduction/doc/reduction.pdf
        __shared__ ReduceElemType accumulators[GridDim::maxThreadsPerBlock/*tids*/];
        accumulators[tid] = sum;
        __syncthreads();
        static_assert(GridDim::maxThreadsPerBlock <= 512, "GridDim::maxThreadsPerBlock too large, need to add manually unrolled steps");
        for (CUDA_LONG i = 256; i; i >>= 1)
        {
            if (tid < i && tid + i < tids) accumulators[tid] += accumulators[tid + i];
            if (0 + i < tids) __syncthreads();  // sync if the condition is true for at least one thread
            // TODO: use volatile* and then we can skip the __syncthreads() for the last 32 values
        }

        // now set the final value at the output coordinate
        if (tid == 0)
        {
            ElemType val = (ElemType)accumulators[0];
            // scale
            val *= alpha;
            // combine with previous value in target matrix, then write it out
            auto * pout = pointers[pointers.size() - 1];
            if (reductionBlocks > 1)    // multiple blocks: need to use atomicAdd()
            {
                // in this case, the outer calling code must pass beta = 1
                val = atomicAdd(pout, val);
            }
            else
            {
                if (beta != 0)
                    val += beta * *pout;
                // save
                *pout = val;
            }
        }
    }
};
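
// Illustration (not part of the source): the loop above is a standard shared-memory
// tree reduction. A plain C++ model of the same halving schedule, assuming tids = 6
// active threads (so the last block is partial, as the comment above notes):
//
//     float acc[6] = { 1, 2, 3, 4, 5, 6 };
//     int tids = 6;
//     for (int i = 256; i; i >>= 1)            // 256 suffices for up to 512 threads
//         for (int tid = 0; tid < tids; tid++) // on the GPU, all tids run in parallel
//             if (tid < i && tid + i < tids)
//                 acc[tid] += acc[tid + i];
//     // acc[0] == 21, the sum of all six partial sums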

// -----------------------------------------------------------------------
// kernel and launch --no reduction
// -----------------------------------------------------------------------

// launch tensor op with CUDA
template<class ElemType, C_size_t N, C_int M, C_int K>
__global__ void _launchTensorOp(ElemType beta, FixedArray<ElemType*, N> pointers, ElemType alpha, ElementWiseOperator op,
                                FixedArray<C_unsigned_int, K> regularOpStrides, FixedMatrix<C_int, N, K> regularStrides, CUDA_LONG numElements,
                                FixedArray<C_unsigned_int, M> reducingOpDims, FixedMatrix<C_int, N, M> reducingStrides)
{
    CUDA_LONG id = GridDim::GetLinearThreadId();
    if (id < numElements)   // note: there are no __syncthreads() calls inside
        TensorOpElement<ElemType, N, M, K, false, K - 1>::Compute(id, beta, pointers, alpha, op, regularOpStrides, regularStrides, reducingOpDims, reducingStrides, 0, 0);
}

template<class ElemType, C_size_t N, C_int K>
static void LaunchTensorOp(ElemType beta, array<ElemType*, N> pointerVector, ElemType alpha, ElementWiseOperator op,
                           const SmallVector<size_t> & regularOpDims, const array<SmallVector<ptrdiff_t>, N> & regularStrideVectors)
{
    // copy all parameters to CUDA-compatible data structures
    FixedArray<ElemType*, N> pointers(pointerVector);
    SmallVector<C_size_t> regularOpStrideVector;    // kernel needs the strides for converting a thread index back to a multi-dimensional tensor index
    C_size_t numElements = 1;
    for (C_size_t k = 0; k < regularOpDims.size(); k++)
    {
        regularOpStrideVector.push_back(numElements);
        numElements *= (C_size_t)regularOpDims[k];
    }
    FixedArray<C_unsigned_int, K> regularOpStrides(regularOpStrideVector);
    FixedMatrix<C_int, N, K> regularStrides(regularStrideVectors);
    FixedArray<C_unsigned_int, /*M=*/0> reducingOpDims;     // empty reduction dimensions
    FixedMatrix<C_int, N, /*M=*/0> reducingStrides;

    // launch the kernel
    CUDA_LONG NN = (CUDA_LONG)numElements;  // linear space identifying each individual output element
    cudaEvent_t done = nullptr;
    if (do_sync) CUDA_CALL(cudaEventCreate(&done));
    GridDim grid(NN);
    _launchTensorOp<ElemType, N, /*M=*/0, K> <<<grid.m_blocksPerGrid, grid.m_threadsPerBlock, 0, t_stream>>>(beta, pointers, alpha, op, regularOpStrides, regularStrides, grid.m_N, reducingOpDims, reducingStrides);
    if (do_sync) CUDA_CALL(cudaEventRecord(done));
    if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
    if (do_sync) CUDA_CALL(cudaEventDestroy(done));
}
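
// Illustration (not part of the source): the stride vector built above is the cumulative
// product of the op dims. E.g. regularOpDims = (4, 3, 2) yields
// regularOpStrideVector = (1, 4, 12) and numElements = 24 -- exactly the strides that
// TensorOpElement::Compute() divides by when mapping a thread id back to a tensor index.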

// -----------------------------------------------------------------------
// kernel and launch --with reduction
// -----------------------------------------------------------------------

template<class ElemType, C_size_t N, C_int M, C_int K>
__global__ void _launchTensorOpWithReduction(ElemType beta, FixedArray<ElemType*, N> pointers, ElemType alpha, ElementWiseOperator op,
                                             FixedArray<C_unsigned_int, K> regularOpStrides, FixedMatrix<C_int, N, K> regularStrides, CUDA_LONG numElements,
                                             FixedArray<C_unsigned_int, M> reducingOpDims, FixedMatrix<C_int, N, M> reducingStrides, CUDA_LONG reductionBegin, CUDA_LONG reductionChunkSize)
{
    CUDA_LONG id = gridDim.x * blockIdx.y + blockIdx.x;     // the regular (output) dimensions are mapped to the grid's X/Y block indices here, so the thread dimension remains free for the shared-memory parallel reduction
    if (id < numElements)   // note: we have __syncthreads() calls, but only entire blocks are in sync, so this is OK
        TensorOpElement<ElemType, N, M, K, true, K - 1>::Compute(id, beta, pointers, alpha, op, regularOpStrides, regularStrides, reducingOpDims, reducingStrides, reductionBegin, reductionChunkSize);
}

// All dimensions (N-ariness, number of input dimensions K, and number of reduction dimensions M) are bound to template parameters now.
template<class ElemType, C_size_t N, C_int M, C_int K>
static void LaunchTensorOpWithReduction(ElemType beta, array<ElemType*, N> pointerVector, ElemType alpha, ElementWiseOperator op,
                                        const SmallVector<size_t> & regularOpDims, const array<SmallVector<ptrdiff_t>, N> & regularStrideVectors,
                                        const SmallVector<size_t> & reducingOpDimVector, const array<SmallVector<ptrdiff_t>, N> & reducingStrideVectors)
{
    // copy all parameters to CUDA-compatible data structures
    FixedArray<ElemType*, N> pointers(pointerVector);
    SmallVector<C_size_t> regularOpStrideVector;    // kernel needs the strides for converting a thread index back to a multi-dimensional tensor index
    C_size_t numElements = 1;
    for (C_size_t k = 0; k < regularOpDims.size(); k++)
    {
        regularOpStrideVector.push_back(numElements);
        numElements *= (C_size_t)regularOpDims[k];
    }
    FixedArray<C_unsigned_int, K> regularOpStrides(regularOpStrideVector);
    FixedMatrix<C_int, N, K> regularStrides(regularStrideVectors);
    FixedArray<C_unsigned_int, M> reducingOpDims(reducingOpDimVector);
    FixedMatrix<C_int, N, M> reducingStrides(reducingStrideVectors);

    // launch the kernel
    CUDA_LONG NN = (CUDA_LONG)numElements;  // linear space identifying each individual output element
    cudaEvent_t done = nullptr;
    if (do_sync) CUDA_CALL(cudaEventCreate(&done));

    // do some optimization for reductions
    // Cases:
    //  - #output elements >= #GPU procs --> use one proc per element, do the reduction in an inner loop
    //  - reduction dimension fits into a single kernel --> launch it that way
    //  - reduction dimension requires multiple kernels --> use atomicAdd(), to avoid a temp memory allocation
    //     - PlusNode: reducing to a bias for small matrices
    //     - ScaleNode: big elementwise product reduced to a scalar (dot product)
    //     - E.g. 3072 GPU procs:
    //       If >= 3072 reduced output values must be computed, just loop inside.
    //       If fewer, and the reduction per value does not fit into a single proc,
    //       then we break it into chunks, say, 24.
    //       This way we will need 24 atomicAdd()s of 3072/24 = 128 values.
    //       If the reduction is along stride=1, then we'd have 24 atomicAdd()s of 32 coalesced writes.
    //       Does not sound scary at all.
    // Precondition: a matrix cannot at the same time participate in the reduction and in the regular operation.
    C_size_t reductionDim = 1;  // number of elements to reduce over
    for (C_size_t k = 0; k < reducingOpDimVector.size(); k++)
        reductionDim *= (C_size_t)reducingOpDimVector[k];
    let & props = GridDim::GetDeviceProps();
    GridDim grid(NN);
    if (reductionDim > 1 && grid.m_blocksPerGrid < props.multiProcessorCount)
    {
        // we are reducing and are underutilizing the multiprocs we have: get more parallelism by doing the reduction in parallel
        // Change of strategy: all NN output elements get their own block, and the reduction gets split over blocks as well.

        // By how much do we underutilize?
        // We increase #blocks by that factor, by breaking the reduction into that many chunks.
        let numReductionChunks = CeilDiv(props.multiProcessorCount, NN);

        // NN may be too large for a single grid dimension
        let blockXOverBy = CeilDiv(NN, props.maxGridSize[0]);
        let numBlocksX = CeilDiv(NN, blockXOverBy);
        let numBlocksY = CeilDiv(NN, numBlocksX);
        let numBlocksZ = numReductionChunks;
        // Grid dims are now:
        //  - X, Y: such that X*Y covers NN
        //  - Z: reduction chunks

        // the reduction goes into thread dim X
        let reductionChunkSize = CeilDiv(reductionDim, numReductionChunks);
        let numThreadsX = min(reductionChunkSize, GridDim::maxThreadsPerBlock);     // any overhang is handled by looping inside the kernel

        if (beta == 1 || numBlocksZ == 1)
        {
            _launchTensorOpWithReduction<ElemType, N, M, K> <<<dim3(numBlocksX, numBlocksY, numBlocksZ), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream>>>(/*beta=*/1, pointers, alpha, op, regularOpStrides, regularStrides, NN, reducingOpDims, reducingStrides, 0, reductionChunkSize);
        }
        else
        {
            // We need more than one chunk, so we will use atomicAdd().
            // First reset/pre-multiply the output with the first chunk; then do the remaining chunks using atomicAdd().
            _launchTensorOpWithReduction<ElemType, N, M, K> <<<dim3(numBlocksX, numBlocksY, 1), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream>>>(beta, pointers, alpha, op, regularOpStrides, regularStrides, NN, reducingOpDims, reducingStrides, 0, reductionChunkSize);
            _launchTensorOpWithReduction<ElemType, N, M, K> <<<dim3(numBlocksX, numBlocksY, numBlocksZ - 1), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream>>>(/*beta=*/1, pointers, alpha, op, regularOpStrides, regularStrides, NN, reducingOpDims, reducingStrides, reductionChunkSize, reductionChunkSize);
        }
    }
    else
    {
        // we have enough output elements to fill the device: use one thread per output element, with the reduction done inside each thread
        _launchTensorOp<ElemType, N, M, K> <<<grid.m_blocksPerGrid, grid.m_threadsPerBlock, 0, t_stream>>>(beta, pointers, alpha, op, regularOpStrides, regularStrides, grid.m_N, reducingOpDims, reducingStrides);
    }
    if (do_sync) CUDA_CALL(cudaEventRecord(done));
    if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
    if (do_sync) CUDA_CALL(cudaEventDestroy(done));
}
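
// Illustration (not part of the source): the chunking arithmetic above, with
// hypothetical numbers. Say NN = 10 output elements, 24 multiprocessors, and
// reductionDim = 3000 elements to reduce per output:
//
//     numReductionChunks = CeilDiv(24, 10)  = 3      // split each reduction 3-ways
//     reductionChunkSize = CeilDiv(3000, 3) = 1000   // elements per chunk
//     grid               = (10 x 1) blocks in X/Y, 3 blocks in Z
//     numThreadsX        = min(1000, GridDim::maxThreadsPerBlock)
//
// With beta != 1 and 3 chunks, the first launch (Z = 1) applies beta, and the
// second launch (Z = 2) accumulates the remaining two chunks via atomicAdd().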

// -----------------------------------------------------------------------
// kernel and launch --linear unary
// -----------------------------------------------------------------------

// for linear unary ops, we need to define a functor for every function for use as a template parameter (lambda syntax doesn't work in CUDA 7)
#define DefineUnaryTensorFunctor(oper) \
    struct Functor ## oper { template<class ElemType> static __device__ ElemType f(ElemType a) { return Op ## oper(a); } };
ForAllUnaryOps(DefineUnaryTensorFunctor);
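
// Illustration (not part of the source): for the Exp entry of ForAllUnaryOps,
// DefineUnaryTensorFunctor expands to
//     struct FunctorExp { template<class ElemType> static __device__ ElemType f(ElemType a) { return OpExp(a); } };
// which can then be passed as the FN template argument of _launchUnaryTensorOp below.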

// the top-level kernel for linear unary ops
// Note: If we have a beta, we have 2 memory accesses, so this optimization may no longer be needed, as we are memory-bound.
template<class ElemType, class FN>
__global__ void _launchUnaryTensorOp(ElemType beta, const ElemType * pa, ElemType * pb, ElemType alpha, CUDA_LONG numElements)
{
    CUDA_LONG id = GridDim::GetLinearThreadId();
    if (id >= numElements)
        return;
    ElemType a = pa[id];
    ElemType val = FN::f(a);
    val *= alpha;
    if (beta != 0)
        val += beta * pb[id];
    pb[id] = val;
}
// version without beta and alpha
template<class ElemType, class FN>
__global__ void _launchUnaryTensorOp(const ElemType * pa, ElemType * pb, CUDA_LONG numElements)
{
    CUDA_LONG id = GridDim::GetLinearThreadId();
    if (id >= numElements)
        return;
    ElemType a = pa[id];
    ElemType val = FN::f(a);
    pb[id] = val;
}

// special case of a linear unary operation
template<class ElemType>
void LaunchUnaryTensorOp(ElemType beta, const ElemType * pa, ElemType * pb, ElemType alpha, ElementWiseOperator op, size_t regularOpDim)
{
    CUDA_LONG NN = (CUDA_LONG)regularOpDim;

#define CaseLaunchUnaryTensorOp(oper) case ElementWiseOperator::op ## oper: \
    if (beta == 0 && alpha == 1) \
        return _launchUnaryTensorOp<ElemType, Functor ## oper> <<<grid.m_blocksPerGrid, grid.m_threadsPerBlock, 0, t_stream>>>(pa, pb, NN); \
    else \
        return _launchUnaryTensorOp<ElemType, Functor ## oper> <<<grid.m_blocksPerGrid, grid.m_threadsPerBlock, 0, t_stream>>>(beta, pa, pb, alpha, NN);

    cudaEvent_t done = nullptr;
    if (do_sync) CUDA_CALL(cudaEventCreate(&done));
    GridDim grid(NN);
    switch (op)
    {
    ForAllUnaryOps(CaseLaunchUnaryTensorOp);
    default: LogicError("LaunchUnaryTensorOp: Unknown op code %d.", (int)op);
    }
    if (do_sync) CUDA_CALL(cudaEventRecord(done));
    if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
    if (do_sync) CUDA_CALL(cudaEventDestroy(done));
}
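
// Illustration (not part of the source): applying an elementwise exp to n values
// through this dispatcher -- pa/pb are hypothetical device pointers, and opExp is
// assumed to be the enum value generated for the Exp entry of ForAllUnaryOps:
//
//     LaunchUnaryTensorOp<float>(/*beta=*/0, pa, pb, /*alpha=*/1, ElementWiseOperator::opExp, n);
//
// Since beta == 0 and alpha == 1, the switch selects the leaner kernel overload
// without the beta/alpha arithmetic.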

// -----------------------------------------------------------------------
// map runtime parameters to template parameters
// -----------------------------------------------------------------------

// tensor operation with k+1 dimensions (-1 means scalar)
template<class ElemType, C_size_t N, C_int K>
static void TensorOpWithRegularLoop(ElemType beta, const array<ElemType*, N> & pointers, ElemType alpha, ElementWiseOperator op,
                                    const SmallVector<size_t> & regularOpDims, const array<SmallVector<ptrdiff_t>, N> & regularStrides,
                                    const SmallVector<size_t> & reducingOpDims, const array<SmallVector<ptrdiff_t>, N> & reducingStrides)
{
    size_t dims = reducingOpDims.size();
    switch (dims)
    {
    case 2: return LaunchTensorOpWithReduction<ElemType, N, 2, K>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
    case 1: return LaunchTensorOpWithReduction<ElemType, N, 1, K>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
    case 0: return LaunchTensorOp<ElemType, N, K>(beta, pointers, alpha, op, regularOpDims, regularStrides);
    default: LogicError("TensorOp: %d non-flattened reduction dimensions are not supported.", (C_int)dims);
    }
}

// tensor operation, generalized in the number of arguments
// This function dispatches on the number of regular dimensions K. It also eliminates the offsets by adding them to the pointers.
template<class ElemType, C_size_t N>
void TensorOpN(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, ElementWiseOperator op,
               const array<size_t, N> & offsets,
               const SmallVector<size_t> & regularOpDims, const array<SmallVector<ptrdiff_t>, N> & regularStrides,
               const SmallVector<size_t> & reducingOpDims, const array<SmallVector<ptrdiff_t>, N> & reducingStrides)
{
    for (C_size_t i = 0; i < N; i++)    // N = a small constant, so this will be unrolled
        pointers[i] += offsets[i];
    size_t dims = regularOpDims.size();
    switch (dims)
    {
    case 4: return TensorOpWithRegularLoop<ElemType, N, 4>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
    case 3: return TensorOpWithRegularLoop<ElemType, N, 3>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
    case 2: return TensorOpWithRegularLoop<ElemType, N, 2>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
    case 1: return TensorOpWithRegularLoop<ElemType, N, 1>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
    case 0: return TensorOpWithRegularLoop<ElemType, N, 0>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
    default: LogicError("TensorOp: %d non-flattened input dimensions are not supported.", (C_int)dims);
    }
}
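
// Illustration (not part of the source): how a caller might reach this entry point.
// For an elementwise c = a .* b over two 4 x 3 column-major matrices (no broadcasting,
// no reduction), TensorOpN would be invoked roughly like this -- pa/pb/pc are
// hypothetical device pointers, everything else as defined in this file:
//
//     std::array<float*, 3> ptrs = { pa, pb, pc };            // inputs first, output last
//     std::array<size_t, 3> offsets = { 0, 0, 0 };            // no slice offsets
//     SmallVector<size_t> regularOpDims = { 4, 3 };           // flattened op shape
//     std::array<SmallVector<ptrdiff_t>, 3> regularStrides =  // column-major strides
//         { SmallVector<ptrdiff_t>{ 1, 4 }, SmallVector<ptrdiff_t>{ 1, 4 }, SmallVector<ptrdiff_t>{ 1, 4 } };
//     SmallVector<size_t> reducingOpDims;                     // empty: no reduction
//     std::array<SmallVector<ptrdiff_t>, 3> reducingStrides;
//     TensorOpN<float, 3>(/*beta=*/0, ptrs, /*alpha=*/1, ElementWiseOperator::opElementwiseProduct,
//                         offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);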

//------------------------------------------------------------------------
// explicit instantiations--these are being called from GPUMatrix.cu
//------------------------------------------------------------------------

template void TensorOpN<float, 2>(float beta, array<float*, 2> pointers, float alpha, ElementWiseOperator op,
                                  const array<size_t, 2> & offsets,
                                  const SmallVector<size_t> & regularOpDims, const array<SmallVector<ptrdiff_t>, 2> & regularStrides,
                                  const SmallVector<size_t> & reducingOpDims, const array<SmallVector<ptrdiff_t>, 2> & reducingStrides);
template void TensorOpN<float, 3>(float beta, array<float*, 3> pointers, float alpha, ElementWiseOperator op,
                                  const array<size_t, 3> & offsets,
                                  const SmallVector<size_t> & regularOpDims, const array<SmallVector<ptrdiff_t>, 3> & regularStrides,
                                  const SmallVector<size_t> & reducingOpDims, const array<SmallVector<ptrdiff_t>, 3> & reducingStrides);
template void TensorOpN<float, 4>(float beta, array<float*, 4> pointers, float alpha, ElementWiseOperator op,
                                  const array<size_t, 4> & offsets,
                                  const SmallVector<size_t> & regularOpDims, const array<SmallVector<ptrdiff_t>, 4> & regularStrides,
                                  const SmallVector<size_t> & reducingOpDims, const array<SmallVector<ptrdiff_t>, 4> & reducingStrides);
template void TensorOpN<double, 2>(double beta, array<double*, 2> pointers, double alpha, ElementWiseOperator op,
                                   const array<size_t, 2> & offsets,
                                   const SmallVector<size_t> & regularOpDims, const array<SmallVector<ptrdiff_t>, 2> & regularStrides,
                                   const SmallVector<size_t> & reducingOpDims, const array<SmallVector<ptrdiff_t>, 2> & reducingStrides);
template void TensorOpN<double, 3>(double beta, array<double*, 3> pointers, double alpha, ElementWiseOperator op,
                                   const array<size_t, 3> & offsets,
                                   const SmallVector<size_t> & regularOpDims, const array<SmallVector<ptrdiff_t>, 3> & regularStrides,
                                   const SmallVector<size_t> & reducingOpDims, const array<SmallVector<ptrdiff_t>, 3> & reducingStrides);
template void TensorOpN<double, 4>(double beta, array<double*, 4> pointers, double alpha, ElementWiseOperator op,
                                   const array<size_t, 4> & offsets,
                                   const SmallVector<size_t> & regularOpDims, const array<SmallVector<ptrdiff_t>, 4> & regularStrides,
                                   const SmallVector<size_t> & reducingOpDims, const array<SmallVector<ptrdiff_t>, 4> & reducingStrides);

template void LaunchUnaryTensorOp(float beta, const float * pa, float * pb, float alpha, ElementWiseOperator op, size_t regularOpDim);
template void LaunchUnaryTensorOp(double beta, const double * pa, double * pb, double alpha, ElementWiseOperator op, size_t regularOpDim);

}}}

#endif // CPUONLY

@ -0,0 +1,30 @@
//
// <copyright file="GPUTensor.h" company="Microsoft">
//     Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//

#pragma once
#include "CommonMatrix.h"
#include "TensorShape.h" // only for SmallVector; I was hoping to keep this out
#include "GPUMatrixCUDAKernels.cuh"
#include <array>

namespace Microsoft { namespace MSR { namespace CNTK {

// GPUMatrix::TensorOp() interfaces with the actual tensor code through these two functions, which are independent of the GPUMatrix class

#define C_size_t CUDA_LONG
#define C_int CUDA_LONG
#define C_unsigned_int CUDA_LONG

template<class ElemType, C_size_t N>
void TensorOpN(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, ElementWiseOperator op,
               const array<size_t, N> & offsets,
               const SmallVector<size_t> & regularOpDims, const array<SmallVector<ptrdiff_t>, N> & regularStrides,
               const SmallVector<size_t> & reducingOpDims, const array<SmallVector<ptrdiff_t>, N> & reducingStrides);

template<class ElemType>
void LaunchUnaryTensorOp(ElemType beta, const ElemType * pa, ElemType * pb, ElemType alpha, ElementWiseOperator op, size_t regularOpDim);

}}}

@ -156,7 +156,7 @@
    </ProjectReference>
  </ItemDefinitionGroup>
  <ItemGroup>
    <ClInclude Include="..\Common\Include\DataTensor.h" />
    <ClInclude Include="..\Common\Include\TensorShape.h" />
    <ClInclude Include="..\Common\Include\File.h" />
    <ClInclude Include="..\Common\Include\fileutil.h" />
    <ClInclude Include="..\Common\Include\DebugUtil.h" />

@ -1,9 +1,7 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup>
    <ClCompile Include="dllmain.cpp" />
    <ClCompile Include="Matrix.cpp" />
    <ClCompile Include="stdafx.cpp" />
    <ClCompile Include="..\Common\File.cpp">
      <Filter>Common</Filter>
    </ClCompile>

@ -25,22 +23,31 @@
    <ClCompile Include="MatrixQuantizerCPU.cpp">
      <Filter>CPU\1bitSGD</Filter>
    </ClCompile>
    <ClCompile Include="MatrixQuantizer.cpp" />
    <ClCompile Include="QuantizedMatrix.cpp" />
    <ClCompile Include="CUDAPageLockedMemAllocator.cpp">
      <Filter>GPU\1bitSGD</Filter>
    </ClCompile>
    <ClCompile Include="ConvolutionEngine.cpp" />
    <ClCompile Include="TensorView.cpp">
      <Filter>Tensors</Filter>
    </ClCompile>
    <ClCompile Include="dllmain.cpp">
      <Filter>Misc</Filter>
    </ClCompile>
    <ClCompile Include="ConvolutionEngine.cpp">
      <Filter>Convolution</Filter>
    </ClCompile>
    <ClCompile Include="stdafx.cpp">
      <Filter>Misc</Filter>
    </ClCompile>
    <ClCompile Include="QuantizedMatrix.cpp">
      <Filter>1bitSGD</Filter>
    </ClCompile>
    <ClCompile Include="MatrixQuantizer.cpp">
      <Filter>1bitSGD</Filter>
    </ClCompile>
  </ItemGroup>
  <ItemGroup>
    <ClInclude Include="CommonMatrix.h" />
    <ClInclude Include="Helpers.h" />
    <ClInclude Include="Matrix.h" />
    <ClInclude Include="stdafx.h" />
    <ClInclude Include="targetver.h" />
    <ClInclude Include="..\Common\Include\File.h">
      <Filter>Common\Include</Filter>
    </ClInclude>

@ -59,23 +66,40 @@
    <ClInclude Include="MatrixQuantizerCPU.h">
      <Filter>CPU\1bitSGD</Filter>
    </ClInclude>
    <ClInclude Include="MatrixQuantizer.h" />
    <ClInclude Include="QuantizedMatrix.h" />
    <ClInclude Include="MemAllocator.h" />
    <ClInclude Include="CUDAPageLockedMemAllocator.h">
      <Filter>GPU\1bitSGD</Filter>
    </ClInclude>
    <ClInclude Include="..\Common\Include\DebugUtil.h" />
    <ClInclude Include="ConvolutionEngine.h" />
    <ClInclude Include="TensorView.h">
      <Filter>Tensors</Filter>
    </ClInclude>
    <ClInclude Include="TensorOps.h">
      <Filter>Tensors</Filter>
    </ClInclude>
    <ClInclude Include="..\Common\Include\DataTensor.h">
    <ClInclude Include="..\Common\Include\TensorShape.h">
      <Filter>Common\Include</Filter>
    </ClInclude>
    <ClInclude Include="Helpers.h">
      <Filter>Misc</Filter>
    </ClInclude>
    <ClInclude Include="..\Common\Include\DebugUtil.h">
      <Filter>Common\Include</Filter>
    </ClInclude>
    <ClInclude Include="ConvolutionEngine.h">
      <Filter>Convolution</Filter>
    </ClInclude>
    <ClInclude Include="stdafx.h">
      <Filter>Misc</Filter>
    </ClInclude>
    <ClInclude Include="targetver.h">
      <Filter>Misc</Filter>
    </ClInclude>
    <ClInclude Include="QuantizedMatrix.h">
      <Filter>1bitSGD</Filter>
    </ClInclude>
    <ClInclude Include="MatrixQuantizer.h">
      <Filter>1bitSGD</Filter>
    </ClInclude>
  </ItemGroup>
  <ItemGroup>
    <None Include="GPUMatrix.h">

@ -113,5 +137,14 @@
    <Filter Include="Tensors">
      <UniqueIdentifier>{70fb07cf-603e-4444-bc10-f0add4920fd2}</UniqueIdentifier>
    </Filter>
    <Filter Include="Misc">
      <UniqueIdentifier>{62b92193-92d0-4e5b-8c3e-67ffd01a98c0}</UniqueIdentifier>
    </Filter>
    <Filter Include="Convolution">
      <UniqueIdentifier>{3a49e94d-14ee-4ca1-a56e-a1472206a076}</UniqueIdentifier>
    </Filter>
    <Filter Include="1bitSGD">
      <UniqueIdentifier>{546cacbd-253e-485b-8c8c-8b9ee0e2f631}</UniqueIdentifier>
    </Filter>
  </ItemGroup>
</Project>

@ -157,7 +157,9 @@ if exist "$(CuDnnDll)" (xcopy /Y "$(CuDnnDll)" $(OutputPath))
    <ClInclude Include="cudalatticeops.h" />
    <ClInclude Include="cudalib.h" />
    <ClInclude Include="CuDnnConvolutionEngine.h" />
    <ClInclude Include="GPUTensor.h" />
    <ClInclude Include="latticefunctionskernels.h" />
    <ClInclude Include="TensorOps.h" />
    <ClInclude Include="ValueQuantizer.h" />
    <None Include="GPUWatcher.h">
      <FileType>CppHeader</FileType>

@ -171,6 +173,10 @@ if exist "$(CuDnnDll)" (xcopy /Y "$(CuDnnDll)" $(OutputPath))
    <ClInclude Include="targetver.h" />
  </ItemGroup>
  <ItemGroup>
    <CudaCompile Include="GPUTensor.cu">
      <InterleaveSourceInPTX Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</InterleaveSourceInPTX>
      <Keep Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</Keep>
    </CudaCompile>
    <CudaCompile Include="cudalatticeops.cu">
      <FileType>CppCode</FileType>
    </CudaCompile>

@ -202,7 +208,7 @@ if exist "$(CuDnnDll)" (xcopy /Y "$(CuDnnDll)" $(OutputPath))
    <CudaCompile Include="GPUMatrix.cu">
      <FileType>CppCode</FileType>
      <Keep Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</Keep>
      <InterleaveSourceInPTX Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</InterleaveSourceInPTX>
      <InterleaveSourceInPTX Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</InterleaveSourceInPTX>
    </CudaCompile>
    <CudaCompile Include="GPUMatrixCUDAKernels.cuh">
      <ExcludedFromBuild>true</ExcludedFromBuild>

@ -22,25 +22,28 @@
    <CudaCompile Include="GPUMatrixCUDAKernels.cuh">
      <Filter>GPU</Filter>
    </CudaCompile>
    <CudaCompile Include="GPUTensor.cu">
      <Filter>GPU\Tensors</Filter>
    </CudaCompile>
  </ItemGroup>
  <ItemGroup>
    <ClCompile Include="stdafx.cpp" />
    <ClCompile Include="cudalattice.cpp">
      <Filter>GPU\SequenceTraining</Filter>
    </ClCompile>
    <ClCompile Include="cudalib.cpp">
      <Filter>GPU\SequenceTraining</Filter>
    </ClCompile>
    <ClCompile Include="..\Common\DebugUtil.cpp" />
    <ClCompile Include="..\Common\DebugUtil.cpp">
      <Filter>Misc</Filter>
    </ClCompile>
    <ClCompile Include="stdafx.cpp">
      <Filter>Misc</Filter>
    </ClCompile>
    <ClCompile Include="CuDnnConvolutionEngine.cpp">
      <Filter>GPU</Filter>
      <Filter>GPU\Convolution</Filter>
    </ClCompile>
  </ItemGroup>
  <ItemGroup>
    <ClInclude Include="CommonMatrix.h" />
    <ClInclude Include="Helpers.h" />
    <ClInclude Include="stdafx.h" />
    <ClInclude Include="targetver.h" />
    <ClInclude Include="..\Common\Include\File.h">
      <Filter>Common\Include</Filter>
    </ClInclude>

@ -80,8 +83,26 @@
    <ClInclude Include="latticefunctionskernels.h">
      <Filter>GPU\SequenceTraining</Filter>
    </ClInclude>
    <ClInclude Include="GPUTensor.h">
      <Filter>GPU\Tensors</Filter>
    </ClInclude>
    <ClInclude Include="Helpers.h">
      <Filter>Misc</Filter>
    </ClInclude>
    <ClInclude Include="stdafx.h">
      <Filter>Misc</Filter>
    </ClInclude>
    <ClInclude Include="targetver.h">
      <Filter>Misc</Filter>
    </ClInclude>
    <ClInclude Include="CommonMatrix.h">
      <Filter>from Math</Filter>
    </ClInclude>
    <ClInclude Include="CuDnnConvolutionEngine.h">
      <Filter>GPU</Filter>
      <Filter>GPU\Convolution</Filter>
    </ClInclude>
    <ClInclude Include="TensorOps.h">
      <Filter>from Math</Filter>
    </ClInclude>
  </ItemGroup>
  <ItemGroup>

@ -105,14 +126,23 @@
    <Filter Include="GPU">
      <UniqueIdentifier>{cc9a219d-d8ab-484a-b253-fd2a29ad7c7c}</UniqueIdentifier>
    </Filter>
    <Filter Include="Include">
      <UniqueIdentifier>{3c982109-64b1-469a-8d85-2abdf12d636a}</UniqueIdentifier>
    </Filter>
    <Filter Include="GPU\1bitSGD">
      <UniqueIdentifier>{3415233d-9ef7-41c6-abbb-cec1b4f8d14c}</UniqueIdentifier>
    </Filter>
    <Filter Include="GPU\SequenceTraining">
      <UniqueIdentifier>{6a3569b1-6c9e-47b3-870f-bb581349e75e}</UniqueIdentifier>
    </Filter>
    <Filter Include="Misc">
      <UniqueIdentifier>{3c982109-64b1-469a-8d85-2abdf12d636a}</UniqueIdentifier>
    </Filter>
    <Filter Include="GPU\Tensors">
      <UniqueIdentifier>{16214e65-2d24-4e4c-a0dd-c37e505bda32}</UniqueIdentifier>
    </Filter>
    <Filter Include="from Math">
      <UniqueIdentifier>{b1b59e2e-5c54-4e40-ad0a-1523ddeb63ba}</UniqueIdentifier>
    </Filter>
    <Filter Include="GPU\Convolution">
      <UniqueIdentifier>{3155488f-128f-494e-858d-459b4cc9fab7}</UniqueIdentifier>
    </Filter>
  </ItemGroup>
</Project>

@ -3152,6 +3152,21 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            );
    }

    template<class ElemType>
    bool Matrix<ElemType>::IsValid() const
    {
        if (m_currentDataLocation == CurrentDataLocation::GPU && GetMatrixType() == MatrixType::SPARSE)
        {
            return this->m_GPUSparseMatrix->IsValid();
        }
        else
        {
            NOT_IMPLEMENTED;
        }

        return false;
    }

    template<class ElemType>
    bool Matrix<ElemType>::IsEqualTo(const Matrix<ElemType>& a, const ElemType threshold /*= 1e-8*/) const
    {

@ -4321,7 +4336,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    /// <param name="c">Resulting matrix, user is responsible for allocating this</param>
    template<class ElemType>
    void Matrix<ElemType>::ConvolveAndWeightedAdd(ElemType alpha, const Matrix<ElemType>& a, const bool transposeA, const Matrix<ElemType>& b, const bool transposeB,
        ElemType beta, Matrix<ElemType>& c, int numChannels, size_t horizontalSubsample, bool padding, bool channelwise)
        ElemType beta, Matrix<ElemType>& c, size_t numChannels, size_t horizontalSubsample, bool padding, bool channelwise)
    {
        DecideAndMoveToRightDevice(a, b, c);

@ -13,7 +13,7 @@
#include "Basics.h"
#include "File.h"
#include "CommonMatrix.h"
#include "DataTensor.h" // only for SmallVector; I was hoping to keep this out
#include "TensorShape.h" // only for SmallVector; I was hoping to keep this out
#include <limits.h>
#include <memory> // for shared_ptr
#include <array>

@ -348,7 +348,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {

        Matrix<ElemType>& AssignPositiveAndShiftedNegSample(const Matrix<ElemType>& a, const size_t posNumber, const size_t negNumber, const size_t shiftNumber);
        Matrix<ElemType>& AddFoldedPositiveAndShiftedNegSample(const Matrix<ElemType>& a, const size_t posNumber, const size_t negNumber, const size_t shiftNumber);

        bool IsValid() const;
        bool IsEqualTo(const Matrix<ElemType>& a, const ElemType threshold = 1e-8) const;

        static void VectorSum(const Matrix<ElemType>& a, Matrix<ElemType>& c, const bool isColWise);

@ -437,7 +438,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        static void Multiply(const Matrix<ElemType>& a, const bool transposeA, const Matrix<ElemType>& b, const bool transposeB, Matrix<ElemType>& c);
        static void Multiply(const Matrix<ElemType>& a, const Matrix<ElemType>& b, Matrix<ElemType>& c);
        static void Multiply1x1AndWeightedAdd(ElemType alpha, const Matrix<ElemType>& a, const Matrix<ElemType>& b, ElemType beta, Matrix<ElemType>& c);
        static void ConvolveAndWeightedAdd(ElemType alpha, const Matrix<ElemType>& a, const bool transposeA, const Matrix<ElemType>& b, const bool transposeB, ElemType beta, Matrix<ElemType>& c, int numChannels, size_t horizontalSubsample, bool padding, bool channelwise);
        static void ConvolveAndWeightedAdd(ElemType alpha, const Matrix<ElemType>& a, const bool transposeA, const Matrix<ElemType>& b, const bool transposeB, ElemType beta, Matrix<ElemType>& c, size_t numChannels, size_t horizontalSubsample, bool padding, bool channelwise);

        static void ScaleAndAdd(ElemType alpha, const Matrix<ElemType>& a, Matrix<ElemType>& c);
        static void ScaleAndAdd(ElemType alpha, const Matrix<ElemType>& a, ElemType beta, Matrix<ElemType>& c);

@ -13,7 +13,7 @@
#include "GPUSparseMatrix.h"
#include "MatrixQuantizerGPU.h"
#include "CuDnnConvolutionEngine.h"
#include "DataTensor.h"
#include "TensorShape.h"

#pragma warning (disable: 4100) // unreferenced formal parameter, which is OK since all functions in here are dummies; disabling this allows copy-pasting prototypes here when we add new functions
#pragma warning (disable: 4702) // unreachable code, which we get from the NOT_IMPLEMENTED macro, which is OK

@ -368,10 +368,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    template<class ElemType> void GPUSparseMatrix<ElemType>::ConvertToSparseFormat(MatrixFormat newFormat) {}
    template<class ElemType> void GPUSparseMatrix<ElemType>::ConvertToSparseFormat(MatrixFormat newFormat, GPUSparseMatrix<ElemType>& outMatrix) const {}

    template<class ElemType> void GPUSparseMatrix<ElemType>::ConvolveAndWeightedAdd(ElemType alpha, const GPUMatrix<ElemType>& lhs, const bool transposeA, const GPUSparseMatrix<ElemType>& rhs, const bool transposeB, ElemType beta, GPUMatrix<ElemType>& c, int numChannels, size_t horizontalSubsample, bool padding, bool channelwise) { };
    template<class ElemType> void GPUSparseMatrix<ElemType>::ConvolveAndWeightedAdd(ElemType alpha, const GPUMatrix<ElemType>& lhs, const bool transposeA, const GPUSparseMatrix<ElemType>& rhs, const bool transposeB, ElemType beta, GPUMatrix<ElemType>& c, size_t numChannels, size_t horizontalSubsample, bool padding, bool channelwise) { };
    template<class ElemType> void GPUSparseMatrix<ElemType>::TensorShuffleScaleAndAdd(ElemType keepWeight, const GPUSparseMatrix<ElemType>& a, size_t D, size_t S, size_t M, size_t K, size_t T, ElemType scaleFactor, const GPUSparseMatrix<ElemType>& b, GPUSparseMatrix<ElemType>& c) { }
    template<class ElemType> void GPUSparseMatrix<ElemType>::Reshape(const size_t numRows, const size_t numCols) { }

    template<class ElemType> bool GPUSparseMatrix<ElemType>::IsValid() const { return true; }

    template<class ElemType> template <class OutType, class InType>
    void GPUSparseMatrix<ElemType>::CopyBuffer(OutType * outBuffer, const InType * inBuffer, const size_t size) {}

@ -25,18 +25,22 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    // -----------------------------------------------------------------------
    // unified overloads for float/double math functions
    //
    // Declare float and double versions of the functions f we need as f_(),
    // e.g. exp_ -> exp(double), expf(float).
    // Declare float and double versions of the functions x we need as x_().
    // This macro overloads x_() with float and double arguments and inlines the correct library function,
    // e.g. exp_ -> exp(double), expf(float). This simplifies templated kernel code.
    // -----------------------------------------------------------------------

#pragma push_macro("OverloadUnaryMathFns")
#define OverloadUnaryMathFns(func) \
    DECL float func ## _(float arg) { return func ## f(arg); } \
    DECL double func ## _(double arg) { return func(arg); }
#define OverloadUnaryMathFns(x) DECL float x ## _(float f) { return x ## f(f); } DECL double x ## _(double f) { return x(f); }

    OverloadUnaryMathFns(exp);
    OverloadUnaryMathFns(log);
    OverloadUnaryMathFns(tanh);
    OverloadUnaryMathFns(sqrt);
    OverloadUnaryMathFns(fabs);
    OverloadUnaryMathFns(cos);
    OverloadUnaryMathFns(sin);

    OverloadUnaryMathFns(fabs); OverloadUnaryMathFns(sqrt);
    OverloadUnaryMathFns(exp); OverloadUnaryMathFns(log);
    OverloadUnaryMathFns(tanh); OverloadUnaryMathFns(cos); OverloadUnaryMathFns(sin);
#pragma pop_macro("OverloadUnaryMathFns")

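    // Illustration (not part of the source): OverloadUnaryMathFns(exp) expands to
    //     DECL float exp_(float f) { return expf(f); }
    //     DECL double exp_(double f) { return exp(f); }
    // so templated kernel code can simply call exp_(x) for either element type.
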
    // -----------------------------------------------------------------------

@ -46,6 +50,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    template<class ElemType>
    DECL ElemType Sigmoid(ElemType z)
    {
#if 1   // BUGBUG: Numerically bad. But if I don't use this, results change.
        ElemType negElem = -z;
        ElemType e = exp_(negElem);

        return 1 / (e + 1);
#else
#if 1   // Efficient implementation that avoids two divergent CUDA code paths that both compute exp() [jdroppo]. This version compiles to PTX without branches.
        ElemType q = exp_(-fabs_(z));
        ElemType numer;

@ -62,6 +72,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        ElemType v = exp_(z);
        return v / (1 + v);
    }
#endif
#endif
    }

@ -85,7 +96,25 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        return sqrt_(z > 0 ? z : 0);
    }

    // TODO: call this LogAdd() for consistency
    template<class ElemType>
    DECL ElemType ClippedLog(ElemType z)
    {
        return z < EPS_IN_LOG ? LOG_OF_EPS_IN_LOG : log_(z);
    }

    template<class ElemType>
    DECL ElemType ClippedQuotient(ElemType a, ElemType b)
    {
        if (fabs(b) < EPS_IN_INVERSE)   // clip the denominator
        {
            if (b > 0)
                b = EPS_IN_INVERSE;
            else
                b = -EPS_IN_INVERSE;
        }
        return a / b;
    }

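    // Illustration (not part of the source): ClippedLog and ClippedQuotient trade a
    // little accuracy near the singularity for kernels that never produce inf/NaN:
    //     ClippedLog(0)         == LOG_OF_EPS_IN_LOG   (instead of -inf)
    //     ClippedQuotient(1, 0) == 1 / EPS_IN_INVERSE  (instead of inf)
    //     ClippedQuotient(1, -0.5f * EPS_IN_INVERSE) == 1 / -EPS_IN_INVERSE
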
    template<typename ElemType>
    DECL ElemType LogAdd(ElemType x, ElemType y)
    {

@ -105,37 +134,59 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        }
    }

    template<class ElemType> DECL ElemType Sqr(ElemType z) { return z * z; }

    // IndexElement reindexes a tensor along one dimension.
    // For the indexed dimension, the tensor op is prepared by setting 'a' to be broadcasting along the indexed dimension.
    // I.e. pa = &a points to the first element (as if index == 0).
    // This function then must adjust the address:
    //     pa <- pa + stride * index
    // The stride is passed in as the third parameter.
    //template<class ElemType> DECL ElemType IndexElement(const ElemType & a, ElemType b, int stride) { const ElemType * pa = &a; return pa[stride * (ptrdiff_t)b]; }

    // -----------------------------------------------------------------------
    // ElementWiseOperator implementations
    //
    // Define a static function for every ElementWiseOperator (CommonMatrix.h).
    // -----------------------------------------------------------------------

#pragma push_macro("DefNullaryOp")
#define DefNullaryOp(op, expr) template<class ElemType> DECL ElemType Op ## op() { return expr; }

    DefNullaryOp(ConstOne, 1);
#pragma pop_macro("DefNullaryOp")

#pragma push_macro("DefUnaryOp")
#define DefUnaryOp(op, expr) template<class ElemType> DECL ElemType Op ## op(ElemType a) { return expr; }

    DefUnaryOp(Copy, a);
    DefUnaryOp(Negate, -a); DefUnaryOp(Not, !a);
    DefUnaryOp(Abs, fabs_(a));
    DefUnaryOp(Sigmoid, Sigmoid(a)); DefUnaryOp(SigmoidDerivative, SigmoidDerivative(a)); DefUnaryOp(Tanh, tanh_(a)); DefUnaryOp(Sqrt, Sqrt(a)); DefUnaryOp(Exp, exp_(a)); DefUnaryOp(Log, log_(a)); DefUnaryOp(LinearRectifierDerivative, LinearRectifierDerivative(a)); DefUnaryOp(Cosine, cos_(a)); DefUnaryOp(NegativeSine, -sin_(a));
    DefUnaryOp(Sigmoid, Sigmoid(a)); DefUnaryOp(Tanh, tanh_(a)); DefUnaryOp(Sqrt, Sqrt(a)); DefUnaryOp(Exp, exp_(a)); DefUnaryOp(Log, ClippedLog(a)); DefUnaryOp(LinearRectifier, a > 0 ? a : 0); DefUnaryOp(Cosine, cos_(a));
#pragma pop_macro("DefUnaryOp")

    // parameterized unary ops
    //DefUnaryOp(SaturateBetaAlpha); DefUnaryOp(SumAlpha); DefUnaryOp(SubDifferenceToAlpha); DefUnaryOp(SubDifferenceFromAlpha);

#pragma push_macro("DefBinaryOp")
#define DefBinaryOp(op, expr) template<class ElemType> DECL ElemType Op ## op(ElemType a, ElemType b) { return expr; }
//#define DefBinaryOp(op, expr) template<class ElemType> DECL ElemType Op ## op(const ElemType & a, ElemType b, int i = 0) { UNUSED(i); return expr; }

    DefBinaryOp(Sum, a + b); DefBinaryOp(Difference, a - b); DefBinaryOp(ElementwiseProduct, a * b); DefBinaryOp(ElementwiseQuotient, a / b);
    DefBinaryOp(Sum, a + b); DefBinaryOp(Difference, a - b); DefBinaryOp(ElementwiseProduct, a * b); DefBinaryOp(ElementwiseQuotient, ClippedQuotient(a, b));
    DefBinaryOp(LogSum, LogAdd(a, b)); DefBinaryOp(Max, a > b ? a : b); DefBinaryOp(Min, a < b ? a : b);
    DefBinaryOp(EQ, a == b); DefBinaryOp(NE, a != b); DefBinaryOp(GT, a > b); DefBinaryOp(LT, a < b); DefBinaryOp(GE, a >= b); DefBinaryOp(LE, a <= b);
    DefBinaryOp(And, (float)((!!a) && (!!b))); DefBinaryOp(Or, (float)((!!a) || (!!b))); DefBinaryOp(Xor, (float)((!!a) ^ (!!b)));
    DefBinaryOp(MaskNegative, b >= 0 ? a : 0);
    DefBinaryOp(ElementwiseProductWithSigmoidDerivativeFromOutput, a * (b * (1 - b)));  // b = output
    DefBinaryOp(ElementwiseProductWithTanhDerivativeFromOutput, a * (1 - b * b));
    DefBinaryOp(ElementwiseProductWithLinearRectifierDerivativeFromOutput, b > 0 ? a : 0);
    DefBinaryOp(ElementwiseProductWithLogDerivativeFromOutput, a * exp_(-b));
    DefBinaryOp(ElementwiseProductWithCosDerivative, a * -sin_(b));                     // note: b = the input of cos()
    //DefBinaryOp(Index, IndexElement(a, b, i)); // note: this one uses the third argument

#pragma pop_macro("DefBinaryOp")

#pragma push_macro("DefTernaryOp")
#define DefTernaryOp(op, expr) template<class ElemType> DECL ElemType Op ## op(ElemType a, ElemType b, ElemType c) { return expr; }

    DefTernaryOp(Cond, a ? b : c);
    DefTernaryOp(Cond, a ? b : c); DefTernaryOp(Clip, a < b ? b : (a > c ? c : a));
#pragma pop_macro("DefTernaryOp")
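
    // Illustration (not part of the source): OpClip(a, b, c) above clamps a into the
    // interval [b, c], e.g. OpClip(5, 0, 1) == 1, OpClip(-2, 0, 1) == 0, and
    // OpClip(0.5, 0, 1) == 0.5.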

}}}

@ -223,6 +223,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            offsets[i] = shapes[i].GetOffset();
    }

    // enforce that in case of inverse broadcasting (reduction), the output must not also be an input
    template<class ElemType>
    static bool CheckDifferentObject(const TensorView<ElemType> & a, const TensorView<ElemType> & b)
    {
        if (&a == &b)
            LogicError("Do{U,Bi,Ter}naryOpOf: When inverse broadcasting, the output must not be an input.");
        return true;
    }

    template<class ElemType>
    void TensorView<ElemType>::DoUnaryOpOf(ElemType beta, const TensorView & a, ElemType alpha, ElementWiseOperator op)
    {

@ -235,6 +244,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        SmallVector<size_t> regularOpDims, reducingOpDims;
        PrepareTensorOperands<ElemType, 2>(array<TensorShape, 2> { a.GetShape(), GetShape() }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);

        // the output cannot be an input when reducing
        if (reducingOpDims.size() > 0)
            CheckDifferentObject(a, *this);

        // now perform the operation
        GetSOB().TensorOp(beta, a.GetSOB(), alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
    }

@ -250,6 +263,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        SmallVector<size_t> regularOpDims, reducingOpDims;
        PrepareTensorOperands<ElemType, 3>(array<TensorShape, 3> { a.GetShape(), b.GetShape(), GetShape() }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);

        // the output cannot be an input when reducing
        if (reducingOpDims.size() > 0)
            CheckDifferentObject(a, *this) && CheckDifferentObject(b, *this);

        GetSOB().TensorOp(beta, a.GetSOB(), b.GetSOB(), alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
    }

@ -264,6 +281,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        SmallVector<size_t> regularOpDims, reducingOpDims;
        PrepareTensorOperands<ElemType, 4>(array<TensorShape, 4> { a.GetShape(), b.GetShape(), c.GetShape(), GetShape() }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);

        // the output cannot be an input when reducing
        if (reducingOpDims.size() > 0)
            CheckDifferentObject(a, *this) && CheckDifferentObject(b, *this) && CheckDifferentObject(c, *this);

        GetSOB().TensorOp(beta, a.GetSOB(), b.GetSOB(), c.GetSOB(), alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
    }

@ -10,7 +10,7 @@

#include "Basics.h"
#include "Matrix.h"
#include "DataTensor.h"
#include "TensorShape.h"

#pragma warning (push)
#pragma warning (disable: 4251) // needs to have dll-interface to be used by clients of... caused by TensorView::m_shape, which is private. We use the same compiler everywhere.

@ -48,7 +48,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    // c.AssignDiffOf(c,a) means c -= a,
    // and c.AddElementwiseProductOf(a, b, 1) means c += a .* b.
    // All operators support elementwise in-place operations, i.e. a, b, and c
    // may all reference the same underlying SOB.
    // may all reference the same underlying SOB, with one exception:
    // The output cannot be in-place and inverse-broadcasting (reducing) at the same time.
    // E.g. with c=[10] and a=[10 x 20], c.AssignDiffOf(c,a) will fail.
    // In that case, you can use c.AddCopyOf(a,-1).
    // Aliasing is not detected, so don't pass distinct TensorView objects that
    // reference overlapping but not identical slices.
    // If beta == 0, c is not read out, i.e. it can be uninitialized or contain NaNs.
    // -------------------------------------------------------------------

@ -59,7 +64,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    void Add ## oper ## Of(const TensorView & a, ElemType alpha = 1.0f) { DoUnaryOpOf(1.0f, a, alpha, ElementWiseOperator::op ## oper); }

    ForAllUnaryOps(DeclareUnaryTensorOp);
    ForAllParameterizedUnaryOps(DeclareUnaryTensorOp);
#pragma pop_macro("DeclareUnaryTensorOp")

#pragma push_macro("DeclareBinaryTensorOp")

@ -82,12 +86,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {

    static void Test();

private:

    void DoUnaryOpOf(ElemType beta, const TensorView & a, ElemType alpha, ElementWiseOperator op);
    void DoBinaryOpOf(ElemType beta, const TensorView & a, const TensorView & b, ElemType alpha, ElementWiseOperator op);
    void DoTernaryOpOf(ElemType beta, const TensorView & a, const TensorView & b, const TensorView & c, ElemType alpha, ElementWiseOperator op);

private:

    // -------------------------------------------------------------------
    // accessors
    // -------------------------------------------------------------------

@ -2593,6 +2593,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        // the total number of epochs to run.
        m_maxEpochs = configSGD(L"maxEpochs");

        // Note: Momentum is best specified in an MB-size-agnostic fashion.
        // Because momentum per sample is a number very close to 1, it is handier to use a logarithmic specification.
        // We use 'momentumAsTimeConstant' to specify the time constant of the low-pass filter that momentum really is.
        // To convert a typical per-MB momentum value 'm' used with an MB size of 'N', use momentumAsTimeConstant = -N/ln(m).
        // For the common configuration of momentum 0.9 at an MB size of 256, that is momentumAsTimeConstant = 2429.8.
        floatargvector momentumPerMB = configSGD(L"momentumPerMB", ConfigRecordType::Array(floatargvector()));
        floatargvector momentumPerSample = configSGD(L"momentumPerSample", ConfigRecordType::Array(floatargvector()));
        floatargvector momentumAsTimeConstant = configSGD(L"momentumAsTimeConstant", ConfigRecordType::Array(floatargvector()));

@ -156,7 +156,7 @@
    <ClInclude Include="..\Common\Include\BestGpu.h" />
    <ClInclude Include="..\Common\Include\Config.h" />
    <ClInclude Include="..\Common\Include\DataReader.h" />
    <ClInclude Include="..\Common\Include\DataTensor.h" />
    <ClInclude Include="..\Common\Include\TensorShape.h" />
    <ClInclude Include="..\Common\Include\DataWriter.h" />
    <ClInclude Include="..\Common\Include\File.h" />
    <ClInclude Include="..\Common\Include\fileutil.h" />

@ -141,7 +141,7 @@
    <ClInclude Include="..\Common\Include\Sequences.h">
      <Filter>Common\Include</Filter>
    </ClInclude>
    <ClInclude Include="..\Common\Include\DataTensor.h">
    <ClInclude Include="..\Common\Include\TensorShape.h">
      <Filter>Common\Include</Filter>
    </ClInclude>
    <ClInclude Include="..\Common\Include\Config.h">

@ -195,4 +195,4 @@
      <UniqueIdentifier>{ae1eea3c-d77f-46ec-bf4f-1cd093a295e8}</UniqueIdentifier>
    </Filter>
  </ItemGroup>
</Project>
</Project>

@ -6,7 +6,7 @@ ndlMnistMacros = [
    ImageH = 28
    LabelDim = 10

    features = ImageInput(ImageW, ImageH, 1, tag="feature")
    features = ImageInput(ImageW, ImageH, 1, imageLayout="legacy", tag="feature")
    featScale = Const(0.00390625)
    featScaled = Scale(featScale, features)
    labels = Input(LabelDim, tag="label")

@ -28,7 +28,7 @@ DNN=[
    pool1H = 2
    pool1hStride = 2
    pool1vStride = 2
    pool1 = MaxPooling(conv1_act, pool1W, pool1H, pool1hStride, pool1vStride)
    pool1 = MaxPooling(conv1_act, pool1W, pool1H, pool1hStride, pool1vStride, imageLayout="legacy")

    # conv2
    kW2 = 5

@ -45,7 +45,7 @@ DNN=[
    pool2H = 2
    pool2hStride = 2
    pool2vStride = 2
    pool2 = AveragePooling(conv2_act, pool2W, pool2H, pool2hStride, pool2vStride)
    pool2 = AveragePooling(conv2_act, pool2W, pool2H, pool2hStride, pool2vStride, imageLayout="legacy")

    h1Dim = 128
    # DNNSigmoidLayer and DNNLayer are defined in Macros.ndl

@ -1,3 +1,4 @@
|
|||
# Sigmoid non-linearity
|
||||
DNNSigmoidLayer(inDim, outDim, x, parmScale) = [
|
||||
W = Parameter(outDim, inDim, init="uniform", initValueScale=parmScale)
|
||||
b = Parameter(outDim, 1, init="uniform", initValueScale=parmScale)
|
||||
|

@@ -6,6 +7,7 @@ DNNSigmoidLayer(inDim, outDim, x, parmScale) = [
y = Sigmoid(z)
]

# no non-linearity, as input for SoftMax
DNNLayer(inDim, outDim, x, parmScale) = [
W = Parameter(outDim, inDim, init="uniform", initValueScale=parmScale)
b = Parameter(outDim, 1, init="uniform", initValueScale=parmScale)

@@ -13,10 +15,11 @@ DNNLayer(inDim, outDim, x, parmScale) = [
z = Plus(t, b)
]

# ReLU non-linearity
ConvReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue) = [
convW = Parameter(outMap, inWCount, init="uniform", initValueScale=wScale)
conv = Convolution(convW, inp, kW, kH, outMap, hStride, vStride, zeroPadding=false)
convB = Parameter(outMap, 1, init="fixedValue", value=bValue)
convB = ImageParameter(1, 1, outMap, imageLayout="legacy", init="fixedValue", value=bValue)
convPlusB = Plus(conv, convB);
act = RectifiedLinear(convPlusB);
]

@@ -1,7 +1,10 @@
#precision = "double"
precision = "float"
command = train:test
deviceId = $DeviceId$

useCuDnn = true # can be overridden by the command line

ndlMacros = "$ConfigDir$/Macros.ndl"

parallelTrain = false

@@ -13,8 +16,94 @@ train = [
#deviceId = $DeviceId$
traceLevel = 1

NDLNetworkBuilder = [
networkDescription = "$ConfigDir$/Convolution.ndl"
#NDLNetworkBuilder = [
# networkDescription = "$ConfigDir$/Convolution.ndl"
#]

BrainScriptNetworkBuilder = [

useCuDnn = $useCuDnn$

// HACK to enforce the same evaluation order of LearnableParameters as for NDL, so as to get the same randomization.
// Nodes are evaluated in sorting order.
A1 = conv1_act; A2 = conv2_act; A3 = h1; A5 = ol

// macros
ConvReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue) = [ // ReLU non-linearity
convW = Parameter(outMap, inWCount, init="uniform", initValueScale=wScale, initOnCPUOnly=false)
conv = Convolution(convW, inp, kW, kH, outMap, hStride, vStride, zeroPadding=false, imageLayout=if useCuDnn then "cudnn" else "legacy")
convB = if useCuDnn
then ParameterTensor((1 : 1 : outMap : 1/*col dim*/), init="fixedValue", value=bValue)
else Parameter(outMap, 1, init="fixedValue", value=bValue)
convPlusB = Plus(conv, convB);
out = RectifiedLinear(convPlusB);
]

DNNSigmoidLayer(inDim, outDim, x, parmScale) = [ // Sigmoid non-linearity
W = Parameter(outDim, inDim, init="uniform", initValueScale=parmScale, initOnCPUOnly=false)
b = Parameter(outDim, 1, init="uniform", initValueScale=parmScale, initOnCPUOnly=false)
t = Times(W, x)
z = Plus(t, b)
out = Sigmoid(z)
]

DNNLayer(inDim, outDim, x, parmScale) = [ // no non-linearity, as input for SoftMax
W = Parameter(outDim, inDim, init="uniform", initValueScale=parmScale, initOnCPUOnly=false)
b = Parameter(outDim, 1, init="uniform", initValueScale=parmScale, initOnCPUOnly=false)
t = Times(W, x)
out = Plus(t, b)
]

imageW = 28
imageH = 28
labelDim = 10

features = ImageInput(imageW, imageH, 1, imageLayout=if useCuDnn then "cudnn" else "legacy", tag="feature")
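# featScale = 1/256 (0.00390625); presumably rescales 8-bit pixel values into [0, 1)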
featScale = Constant(0.00390625)
featScaled = Scale(featScale, features)
labels = Input(labelDim, tag="label")

# conv1
kW1 = 5
kH1 = 5
cMap1 = 16
hStride1 = 1
vStride1 = 1
# weight[cMap1, kW1 * kH1 * inputChannels]
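# here inWCount = kW1 * kH1 * 1 input channel = 25, the third argument below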
conv1_act = ConvReLULayer(featScaled, cMap1, 25, kW1, kH1, hStride1, vStride1, 10, 1).out

# pool1
pool1W = 2
pool1H = 2
pool1hStride = 2
pool1vStride = 2
pool1 = MaxPooling(conv1_act, pool1W, pool1H, pool1hStride, pool1vStride, imageLayout=if useCuDnn then "cudnn" else "legacy")

# conv2
kW2 = 5
kH2 = 5
cMap2 = 32
hStride2 = 1
vStride2 = 1
# weight[cMap2, kW2 * kH2 * cMap1]
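# here inWCount = kW2 * kH2 * cMap1 = 5 * 5 * 16 = 400, the third argument below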
# ConvReLULayer is defined in Macros.ndl
conv2_act = ConvReLULayer(pool1, cMap2, 400, kW2, kH2, hStride2, vStride2, 10, 1).out

# pool2
pool2W = 2
pool2H = 2
pool2hStride = 2
pool2vStride = 2
pool2 = AveragePooling(conv2_act, pool2W, pool2H, pool2hStride, pool2vStride, imageLayout=if useCuDnn then "cudnn" else "legacy")

h1Dim = 128
# DNNSigmoidLayer and DNNLayer are defined in Macros.ndl
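# pool2 output: 28 -> 24 -> 12 -> 8 -> 4 spatially (no padding), so 4 * 4 * cMap2 = 512 inputs to h1 below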
h1 = DNNSigmoidLayer(512, h1Dim, pool2, 1).out
ol = DNNLayer(h1Dim, labelDim, h1, 1).out

ce = CrossEntropyWithSoftmax(labels, ol, tag="criterion")
err = ErrorPrediction(labels, ol, tag="eval")
outputNodes = ol
]

SGD = [

@@ -66,8 +66,8 @@ speechTrain = [
C(c) = DiagTimes(WeightParam(cellDim, 1), Stabilize(c)) // cell-to-hidden

// LSTM cell
dh = PastValue(outputDim, 1, output); // hidden state(t-1)
dc = PastValue(cellDim, 1, ct); // cell(t-1)
dh = PastValue(outputDim, output); // hidden state(t-1)
dc = PastValue(cellDim, ct); // cell(t-1)

// note: the W(inputx) here are all different; they each come with their own set of weights. Same for H(dh), C(dc), and B().
it = Sigmoid(W(inputx) + B() + H(dh) + C(dc)) // input gate(t)

@@ -95,8 +95,8 @@ speechTrain = [
numLSTMs = 3 // number of hidden LSTM model layers

// features
features = Input(featDim, 1, tag='feature')
labels = Input(labelDim, 1, tag='label')
features = Input(featDim, tag='feature')
labels = Input(labelDim, tag='label')
feashift = RowSlice(featDim - baseFeatDim, baseFeatDim, features); # shift 5 frames right (x_{t+5} -> x_{t}) // TODO why 5? Where do I see this?

featNorm = MeanVarNorm(feashift)

@@ -74,8 +74,8 @@ speechTrain = new TrainAction [
C(c) = DiagTimes(WeightParam(cellDim, 1), Stabilize(c)) // cell-to-hidden

// LSTM cell
dh = PastValue(outputDim, 1, output); // hidden state(t-1)
dc = PastValue(cellDim, 1, ct); // cell(t-1)
dh = PastValue(outputDim, output); // hidden state(t-1)
dc = PastValue(cellDim, ct); // cell(t-1)

// note: the W(inputx) here are all different; they each come with their own set of weights. Same for H(dh), C(dc), and B().
it = Sigmoid(W(inputx) + B() + H(dh) + C(dc)) // input gate(t)

@@ -27,6 +27,8 @@ Using parallel sequences (difference to above: nbruttsineachrecurrentiter=4). No

COMMAND: currentDirectory=$(SolutionDir)Tests\EndToEndTests\Speech\Data configFile=$(SolutionDir)Tests\EndToEndTests\Speech\LSTM\cntk.config stderr=$(SolutionDir)Tests\EndToEndTests\Speech\RunDir\LSTM\FullUtterance\models\cntkSpeech.dnn.log RunDir=$(SolutionDir)Tests\EndToEndTests\Speech\RunDir\LSTM\FullUtterance NdlDir=$(SolutionDir)Tests\EndToEndTests\Speech\LSTM DataDir=. DeviceId=auto Truncated=false speechTrain=[reader=[nbruttsineachrecurrentiter=4]] speechTrain=[SGD=[epochSize=2560]] speechTrain=[SGD=[learningRatesPerMB=0.125]] speechTrain=[SGD=[maxEpochs=2]] speechTrain=[SGD=[numMBsToShowResult=1]] makeMode=false

Linux: bin/cntk currentDirectory=Tests/EndToEndTests/Speech/Data configFile=../LSTM/cntk.config stderr=../RunDir/LSTM/Truncated/models/cntkSpeech.dnn.log RunDir=../RunDir/LSTM/Truncated NdlDir=../LSTM DataDir=. DeviceId=auto Truncated=false 'speechTrain=[reader=[nbruttsineachrecurrentiter=4]]' 'speechTrain=[SGD=[epochSize=2560]]' 'speechTrain=[SGD=[learningRatesPerMB=0.125]]' 'speechTrain=[SGD=[maxEpochs=2]]' 'speechTrain=[SGD=[numMBsToShowResult=1]]' makeMode=false

Using full BrainScript configuration

COMMAND: --cd $(SolutionDir)Tests\EndToEndTests\Speech\Data -f $(SolutionDir)Tests\EndToEndTests\Speech\LSTM\lstm.bs -D stderr='$(SolutionDir)Tests\EndToEndTests\Speech\RunDir\LSTM\FullUtterance\models\cntkSpeech.dnn.log' -D RunDir='$(SolutionDir)Tests\EndToEndTests\Speech\RunDir\LSTM\FullUtterance' -D NdlDir='$(SolutionDir)Tests\EndToEndTests\Speech\LSTM' -D DataDir='.' -D DeviceId='Auto' -D Truncated=false -D speechTrain=[reader=[nbruttsineachrecurrentiter=1];SGD=[epochSize=2560;maxEpochs=2;numMBsToShowResult=1]] -D makeMode=false

@@ -46,7 +48,7 @@ COMMAND: currentDirectory=$(SolutionDir)ExampleSetups\Image\MNIST configFil

--- Image/QuickE2E:

COMMAND: configFile=$(SolutionDir)Tests\EndToEndTests\Image\QuickE2E\cntk.config RunDir=$(SolutionDir)Tests\EndToEndTests\Image\_run DataDir=$(SolutionDir)Tests\EndToEndTests\Image\Data ConfigDir=$(SolutionDir)Tests\EndToEndTests\Image\QuickE2E stderr=$(SolutionDir)Tests\EndToEndTests\RunDir\Image\QuickE2E\models\cntkImage.dnn.log DeviceId=0 makeMode=false
COMMAND: configFile=$(SolutionDir)Tests\EndToEndTests\Image\QuickE2E\cntk.config RunDir=$(SolutionDir)Tests\EndToEndTests\Image\_run DataDir=$(SolutionDir)Tests\EndToEndTests\Image\Data ConfigDir=$(SolutionDir)Tests\EndToEndTests\Image\QuickE2E stderr=$(SolutionDir)Tests\EndToEndTests\RunDir\Image\QuickE2E\models\cntkImage.dnn.log DeviceId=0 useCuDnn=false makeMode=false

Simple test
-----------

@@ -24,14 +24,18 @@ namespace Microsoft { namespace MSR { namespace CNTK { namespace Test

static bool IsCuDnnSupported()
{
fprintf(stderr, "ConvolutionEngineTests.cpp %d\n", __LINE__);
try
{
return ConvFact::Create(0, ConvFact::EngineType::CuDnn) != nullptr;
// TODO: Will this ever return nullptr?
return ConvFact::Create(0, ConvFact::EngineType::CuDnn, ImageLayoutKind::CHW) != nullptr;
}
catch (const std::runtime_error&)
{
fprintf(stderr, "ConvolutionEngineTests.cpp %d\n", __LINE__);
return false;
}
fprintf(stderr, "ConvolutionEngineTests.cpp %d\n", __LINE__);
}

BOOST_AUTO_TEST_SUITE(ConvolutionSuite)

@@ -55,7 +59,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { namespace Test

for (int deviceId : { 0 })
{
auto fact = ConvFact::Create(deviceId);
// BUGBUG: These will fail depending on whether we built with cuDNN or not. Without cuDNN we should use HWC
auto fact = ConvFact::Create(deviceId, ConvFact::EngineType::Auto, ImageLayoutKind::CHW);
auto tt = typeid(fact).name();
UNUSED(tt);
auto eng = fact->CreateConvEngine(deviceId, 0);

@@ -128,14 +133,22 @@ namespace Microsoft { namespace MSR { namespace CNTK { namespace Test

for (int deviceId : { -1, 0 })
{
auto fact = ConvFact::Create(deviceId);
fprintf(stderr, "ConvolutionEngineTests.cpp %d\n", __LINE__);
auto fact = ConvFact::Create(deviceId, ConvFact::EngineType::Auto, deviceId >= 0 ? ImageLayoutKind::CHW : ImageLayoutKind::HWC);
fprintf(stderr, "ConvolutionEngineTests.cpp %d\n", __LINE__);
auto eng = fact->CreateConvEngine(deviceId, 0);
fprintf(stderr, "ConvolutionEngineTests.cpp %d\n", __LINE__);
auto inT = fact->CreateTensor(inW, inH, cmapIn, n);
fprintf(stderr, "ConvolutionEngineTests.cpp %d\n", __LINE__);
auto filtT = fact->CreateFilter(kW, kH, cmapIn, cmapOut);
fprintf(stderr, "ConvolutionEngineTests.cpp %d\n", __LINE__);
auto outT = fact->CreateTensor(outW, outH, cmapOut, n);
fprintf(stderr, "ConvolutionEngineTests.cpp %d\n", __LINE__);
auto convT = fact->CreateConvDescriptor(*inT, *filtT, sW, sH, pad);
fprintf(stderr, "ConvolutionEngineTests.cpp %d\n", __LINE__);

// Input in NCHW format.
fprintf(stderr, "ConvolutionEngineTests.cpp %d\n", __LINE__);
SingleMatrix in(inW * inH * cmapIn, n, vec(inW * inH * cmapIn * n, 1.0f).data(), matrixFlagNormal, deviceId);
// Create cmapOut filters, each kW x kH x cmapIn (NCHW format).
SingleMatrix filt(cmapOut, kW * kH * cmapIn, vec(kW * kH * cmapIn * cmapOut, 1.0f).data(), matrixFlagNormal, deviceId);

@@ -143,7 +156,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { namespace Test
SingleMatrix out(outW * outH * cmapOut, n, deviceId);
SingleMatrix temp(deviceId);

fprintf(stderr, "ConvolutionEngineTests.cpp %d\n", __LINE__);
eng->Forward(*inT, in, *filtT, filt, *convT, *outT, out, temp);
fprintf(stderr, "ConvolutionEngineTests.cpp %d\n", __LINE__);

// Output is in NCHW format.
float expBuf[] = {

@@ -175,7 +190,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { namespace Test

for (int deviceId : { 0 })
{
auto fact = ConvFact::Create(deviceId);
auto fact = ConvFact::Create(deviceId, ConvFact::EngineType::Auto, ImageLayoutKind::CHW);
auto eng = fact->CreateConvEngine(deviceId, 0);
auto srcGradT = fact->CreateTensor(outW, outH, cmapOut, n);
auto filtT = fact->CreateFilter(kW, kH, cmapIn, cmapOut);

@@ -231,7 +246,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { namespace Test

for (int deviceId : { 0 })
{
auto fact = ConvFact::Create(deviceId);
auto fact = ConvFact::Create(deviceId, ConvFact::EngineType::Auto, ImageLayoutKind::CHW);
auto eng = fact->CreateConvEngine(deviceId, 0);
auto srcGradT = fact->CreateTensor(outW, outH, cmapOut, n);
auto filtT = fact->CreateFilter(kW, kH, cmapIn, cmapOut);

@@ -296,7 +311,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { namespace Test

for (int deviceId : { 0 })
{
auto fact = ConvFact::Create(deviceId);
auto fact = ConvFact::Create(deviceId, ConvFact::EngineType::Auto, ImageLayoutKind::CHW);
auto eng = fact->CreatePoolEngine(deviceId);
auto inT = fact->CreateTensor(inW, inH, cmap, n);
auto outT = fact->CreateTensor(outW, outH, cmap, n);

@@ -346,7 +361,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { namespace Test

for (int deviceId : { 0 })
{
auto fact = ConvFact::Create(deviceId);
auto fact = ConvFact::Create(deviceId, ConvFact::EngineType::Auto, ImageLayoutKind::CHW);
auto eng = fact->CreatePoolEngine(deviceId);
auto inT = fact->CreateTensor(inW, inH, cmap, n);
auto outT = fact->CreateTensor(outW, outH, cmap, n);

@@ -406,7 +421,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { namespace Test

for (int deviceId : { 0 })
{
auto fact = ConvFact::Create(deviceId);
auto fact = ConvFact::Create(deviceId, ConvFact::EngineType::Auto, ImageLayoutKind::CHW);
auto eng = fact->CreatePoolEngine(deviceId);
auto inT = fact->CreateTensor(inW, inH, cmap, n);
auto outT = fact->CreateTensor(outW, outH, cmap, n);

@@ -456,7 +471,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { namespace Test

for (int deviceId : { 0 })
{
auto fact = ConvFact::Create(deviceId);
auto fact = ConvFact::Create(deviceId, ConvFact::EngineType::Auto, ImageLayoutKind::CHW);
auto eng = fact->CreatePoolEngine(deviceId);
auto inT = fact->CreateTensor(inW, inH, cmap, n);
auto outT = fact->CreateTensor(outW, outH, cmap, n);

@@ -535,6 +535,19 @@ namespace Microsoft
BOOST_CHECK(m1.IsEqualTo(m2));
}

#if 0 // Temporarily disabling
BOOST_FIXTURE_TEST_CASE(GPUMatrixLargeInequality, RandomSeedFixture)
{
const int rows = 33553921;
const int cols = 1;

auto m0 = GPUMatrix<float>::Zeros(rows, cols, c_deviceIdZero);
auto m1 = GPUMatrix<float>::Ones(rows, cols, c_deviceIdZero);

BOOST_CHECK(!m1.IsEqualTo(m0, c_epsilonFloatE5));
}
#endif

BOOST_AUTO_TEST_SUITE_END()
}
}

@@ -493,34 +493,22 @@ BOOST_FIXTURE_TEST_CASE(GPUSSparseMatrix1DConvolutionRandomInit, RandomSeedFixture
}
}

#if 0 // Temporarily disabling
BOOST_FIXTURE_TEST_CASE(GPUSSparseMatrixLargeIsEqual, RandomSeedFixture)
{
const int rows = 33553921;
const int cols = 1;

Matrix<float> m0 = Matrix<float>::Zeros(rows, cols, c_deviceIdZero);
Matrix<float> m1 = Matrix<float>::Ones(rows, cols, c_deviceIdZero);

BOOST_CHECK(!m1.IsEqualTo(m0, c_epsilonFloatE5));
}

BOOST_FIXTURE_TEST_CASE(GPUSSparseMatrix1DConvolutionBackprop, RandomSeedFixture)
{
const int inChannels = 2; // 50;
const int inWidth = 4; // 10;
const int inChannels = 50;
const int inWidth = 10;
const int inHeight = 1;
const int batchSize = 3; // 20;
const int kernelWidth = 2; // 3;
const int batchSize = 20;
const int kernelWidth = 3;
const int kernelHeight = inHeight;
const int horizontalSubsample = 1;
const int verticalSubsample = 1;
const bool zeroPadding = false;
const int outChannels = 2; // 3;
const int outWidth = zeroPadding ? inWidth : (inWidth >= kernelWidth ? 1 + (inWidth - kernelWidth) / horizontalSubsample : 0);
const int outChannels = 3;
const int outWidth = zeroPadding ? (inWidth / horizontalSubsample) : (inWidth >= kernelWidth ? 1 + (inWidth - kernelWidth) / horizontalSubsample : 0);
const int outHeight = inHeight;
const float randomInitLowerBound = 1.0f;
const float randomInitUpperBound = 5.0f;
const float randomInitLowerBound = -1.0f;
const float randomInitUpperBound = 1.0f;
Matrix<float> outputGradientSubBatch = Matrix<float>::RandomUniform(outChannels, batchSize*outWidth, randomInitLowerBound, randomInitUpperBound, IncrementCounter(), c_deviceIdZero);
Matrix<float> inputSubBatch = Matrix<float>::RandomUniform(inChannels*inWidth, batchSize, randomInitLowerBound, randomInitUpperBound, IncrementCounter(), c_deviceIdZero);
Matrix<float> tempMatrix(1, 1, c_deviceIdZero);
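// Sanity check of the outWidth formula above with these constants (zeroPadding == false):
// outWidth = 1 + (inWidth - kernelWidth) / horizontalSubsample = 1 + (10 - 3) / 1 = 8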

@@ -550,30 +538,8 @@ BOOST_FIXTURE_TEST_CASE(GPUSSparseMatrix1DConvolutionBackprop, RandomSeedFixture
Matrix<float>::ConvolveAndWeightedAdd(1, outputGradientSubBatchReordered, true, inputSubBatchSparseReordered, false, 1, inputGradientValues2, batchSize, horizontalSubsample, zeroPadding, false);
inputGradientValues2.Reshape(outChannels, inChannels*kernelWidth);

const int dim = outChannels*inChannels*kernelWidth;
float* base = inputGradientValues1.CopyToArray();
float baseA[dim];
fprintf(stderr, "[BASE]");
for (int i = 0; i < dim; i++)
{
baseA[i] = base[i];
fprintf(stderr, "%f ", baseA[i]);
}
fprintf(stderr, "\n");

float* exp = inputGradientValues2.CopyToArray();
float expA[dim];
fprintf(stderr, "[EXP]");
for (int i = 0; i < dim; i++)
{
expA[i] = exp[i];
fprintf(stderr, "%f ", expA[i]);
}
fprintf(stderr, "\n");

BOOST_CHECK(inputGradientValues2.IsEqualTo(inputGradientValues1, c_epsilonFloatE5));
BOOST_CHECK(inputGradientValues2.IsEqualTo(inputGradientValues1, c_epsilonFloatE2));
}
#endif

BOOST_FIXTURE_TEST_CASE(GPUSSparseMatrixReshape, RandomSeedFixture)
{

@@ -595,10 +561,10 @@ BOOST_FIXTURE_TEST_CASE(GPUSSparseMatrixReshape, RandomSeedFixture)
BOOST_CHECK(denseMatrixC.IsEqualTo(denseMatrixB, c_epsilonFloatE5));
BOOST_CHECK(!denseMatrixC.IsEqualTo(denseMatrixA, c_epsilonFloatE5));
}
#if 0

BOOST_FIXTURE_TEST_CASE(GPUSSparseTensorShuffleScaleAndAdd, RandomSeedFixture)
{
size_t D = 10, S = 10, M = 10, K = 10, T = 10;
size_t D = 13, S = 11, M = 7, K = 15, T = 8;
GPUMatrix<float> denseMatrixA = GPUMatrix<float>::RandomUniform(D * S * M * K, T, c_deviceIdZero, -1, 1, IncrementCounter());
GPUMatrix<float> denseMatrixB(D*S*M*K, T, c_deviceIdZero);
GPUMatrix<float> denseMatrixC(D*S*M*K, T, c_deviceIdZero);

@@ -612,7 +578,7 @@ BOOST_FIXTURE_TEST_CASE(GPUSSparseTensorShuffleScaleAndAdd, RandomSeedFixture)

BOOST_CHECK(denseMatrixC.IsEqualTo(denseMatrixB, c_epsilonFloatE5));
}
#endif

BOOST_AUTO_TEST_SUITE_END()
} } } }