Merge branch 'dongyu/needsGradientChange' of https://github.com/Microsoft/CNTK into fseide/s2s

Conflicts:
	Source/CNTK/tests.cpp
	Source/ComputationNetworkLib/ComputationNetwork.h
	Source/ComputationNetworkLib/ComputationNetworkEditing.cpp
	Source/ComputationNetworkLib/InputAndParamNodes.h
	Source/ComputationNetworkLib/PreComputeNodes.h
Dong Yu 2016-02-24 13:55:41 -08:00
Parents 14e005a982 aa74337d38
Commit 32004edfa2
18 changed files with 64 additions and 88 deletions

View file

@@ -1652,7 +1652,7 @@ LearnableParameter(row, cols,
\begin_layout Plain Layout
Parameter(row, cols, {needGradient=true|false,
Parameter(row, cols, {needsGradient=true|false,
\end_layout
\begin_layout Plain Layout
@@ -1680,7 +1680,7 @@ cols - number of columns in the parameter, defaults to 1.
\end_layout
\begin_layout Itemize
needGradient- [named optional] determines whether the parameter should be
needsGradient- [named optional] determines whether the parameter should be
updated by the training algorithm.
Defaults to true.
\end_layout
@@ -4221,7 +4221,7 @@ features=InputValue [784,32]
\begin_layout Plain Layout
L1.BFF.B=LearnableParameter [256,1] NeedGradient=true
L1.BFF.B=LearnableParameter [256,1] NeedsGradient=true
\end_layout
\begin_layout Plain Layout
@@ -4266,7 +4266,7 @@ L1.BFF.FF.T=Times ( L1.BFF.W , normInput )
\begin_layout Plain Layout
L1.BFF.W=LearnableParameter [256,784] NeedGradient=true
L1.BFF.W=LearnableParameter [256,784] NeedsGradient=true
\end_layout
\begin_layout Plain Layout

View file

@@ -4023,7 +4023,7 @@ In many cases, not all the gradients need to be computed.
updated and thus it is unnecessary to compute the gradients with regard
to these parameters.
We can reduce the gradient computation by keeping a
\begin_inset Formula $needGradient$
\begin_inset Formula $needsGradient$
\end_inset
flag for each node.
@@ -4032,7 +4032,7 @@ In many cases, not all the gradients need to be computed.
\begin_inset CommandInset ref
LatexCommand ref
reference "alg:CN-NeedGradient"
reference "alg:CN-needsGradient"
\end_inset
@@ -4047,7 +4047,7 @@ reference "alg:CN-ForwardOrder-DAG"
and
\begin_inset CommandInset ref
LatexCommand ref
reference "alg:CN-NeedGradient"
reference "alg:CN-needsGradient"
\end_inset
@@ -4101,7 +4101,7 @@ status collapsed
\end_inset
UpdateNeedGradientFlag
UpdateNeedsGradientFlag
\begin_inset ERT
status collapsed
@@ -4427,7 +4427,7 @@ State
call
\noun on
UpdateNeedGradientFlag
UpdateNeedsGradientFlag
\noun default
(
\begin_inset Formula $c$
@@ -4511,7 +4511,7 @@ status open
\end_inset
\begin_inset Formula $node.AnyChildNeedGradient()$
\begin_inset Formula $node.AnyChildNeedsGradient()$
\end_inset
@@ -4566,7 +4566,7 @@ State
\noun default
\color inherit
\begin_inset Formula $node.needGradient\leftarrow true$
\begin_inset Formula $node.needsGradient\leftarrow true$
\end_inset
@@ -4626,7 +4626,7 @@ State
\noun default
\color inherit
\begin_inset Formula $node.needGradient\leftarrow false$
\begin_inset Formula $node.needsGradient\leftarrow false$
\end_inset
@@ -4730,14 +4730,14 @@ end{algorithmic}
\begin_layout Plain Layout
Update the
\begin_inset Formula $needGradient$
\begin_inset Formula $needsGradient$
\end_inset
flag recursively.
\begin_inset CommandInset label
LatexCommand label
name "alg:CN-NeedGradient"
name "alg:CN-needsGradient"
\end_inset
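The algorithm this label refers to propagates the flag bottom-up: a node needs a gradient if it is a learnable parameter that is actually being updated, or if any of its children needs one. A minimal C++ sketch of that pass, assuming a simplified Node type rather than the real ComputationNode interface (the names follow the pseudocode above, with conventional casing):

#include <vector>

// Simplified stand-in for ComputationNode; m_isUpdatedParameter corresponds to a
// LearnableParameter whose learningRateMultiplier is greater than 0.
struct Node
{
    std::vector<Node*> m_children;
    bool m_needsGradient = false;
    bool m_isUpdatedParameter = false;

    bool AnyChildNeedsGradient() const
    {
        for (const Node* child : m_children)
            if (child->m_needsGradient)
                return true;
        return false;
    }
};

// Visit nodes in evaluation (forward) order so every child is finalized before its
// parents; a node then needs a gradient iff it is itself updated or any child is.
void UpdateNeedsGradientFlag(const std::vector<Node*>& nodesInForwardOrder)
{
    for (Node* node : nodesInForwardOrder)
        node->m_needsGradient = node->m_isUpdatedParameter || node->AnyChildNeedsGradient();
}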

View file

@@ -17,8 +17,8 @@ DnnBNReLULayer(inDim, outDim, x, wScale, bValue, scValue, expAvg) = [
W = LearnableParameter(outDim, inDim, init = Gaussian, initValueScale = wScale)
b = LearnableParameter(outDim, 1, init = fixedValue, value = bValue)
sc = LearnableParameter(outDim, 1, init = fixedValue, value = scValue)
m = LearnableParameter(outDim, 1, init = fixedValue, value = 0, needGradient = false)
isd = LearnableParameter(outDim, 1, init = fixedValue, value = 0, needGradient = false)
m = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
isd = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
t = Times(W, x)
bn = BatchNormalization(t, sc, b, m, isd, eval = false, spatial = false, expAvgFactor = expAvg)
y = RectifiedLinear(bn)
@@ -35,8 +35,8 @@ ConvReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue) =
ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, expAvg) = [
b = LearnableParameter(outMap, 1, init=fixedValue, value=bValue)
sc = LearnableParameter(outMap, 1, init=fixedValue, value=scValue)
m = LearnableParameter(outMap, 1, init=fixedValue, value=0, needGradient=false)
isd = LearnableParameter(outMap, 1, init=fixedValue, value=0, needGradient=false)
m = LearnableParameter(outMap, 1, init=fixedValue, value=0, learningRateMultiplier=0)
isd = LearnableParameter(outMap, 1, init=fixedValue, value=0, learningRateMultiplier=0)
c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding=true, imageLayout=$imageLayout$)
y = BatchNormalization(c, sc, b, m, isd, eval=false, spatial=true, expAvgFactor=expAvg, imageLayout=$imageLayout$)

View file

@@ -37,14 +37,14 @@ DNN=[
rn1_3 = ResNetNode2(rn1_2, cMap1, 144, kW, kH, convWScale, convBValue, scValue, expAvg)
cMap2 = 32
#rn2_1_Wproj = Parameter(cMap2, cMap1, init = fromFile, initFromFilePath = "$Proj16to32Filename$", needGradient = false)
#rn2_1_Wproj = Parameter(cMap2, cMap1, init = fromFile, initFromFilePath = "$Proj16to32Filename$", learningRateMultiplier = 0)
#rn2_1 = ResNetNode2Inc(rn1_3, cMap2, 144, 288, kW, kH, convWScale, convBValue, scValue, expAvg, rn2_1_Wproj)
rn2_1 = ResNetNode2Inc2(rn1_3, cMap1, cMap2, 144, 288, kW, kH, convWScale, 3.5, convBValue, scValue, expAvg)
rn2_2 = ResNetNode2(rn2_1, cMap2, 288, kW, kH, convWScale, convBValue, scValue, expAvg)
rn2_3 = ResNetNode2(rn2_2, cMap2, 288, kW, kH, convWScale, convBValue, scValue, expAvg)
cMap3 = 64
#rn3_1_Wproj = Parameter(cMap3, cMap2, init = fromFile, initFromFilePath = "$Proj32to64Filename$", needGradient = false)
#rn3_1_Wproj = Parameter(cMap3, cMap2, init = fromFile, initFromFilePath = "$Proj32to64Filename$", learningRateMultiplier = 0)
#rn3_1 = ResNetNode2Inc(rn2_3, cMap3, 288, 576, kW, kH, convWScale, convBValue, scValue, expAvg, rn3_1_Wproj)
rn3_1 = ResNetNode2Inc2(rn2_3, cMap2, cMap3, 288, 576, kW, kH, convWScale, 3.5, convBValue, scValue, expAvg)
rn3_2 = ResNetNode2(rn3_1, cMap3, 576, kW, kH, convWScale, convBValue, scValue, expAvg)

View file

@@ -52,7 +52,7 @@ DNN=[
rn1_18= ResNetNode2(rn1_17, cMap1, 144, kW, kH, convWScale, convBValue, scValue, expAvg)
cMap2 = 32
#rn2_1_Wproj = Parameter(cMap2, cMap1, init = fromFile, initFromFilePath = "$Proj16to32Filename$", needGradient = false)
#rn2_1_Wproj = Parameter(cMap2, cMap1, init = fromFile, initFromFilePath = "$Proj16to32Filename$", learningRateMultiplier = 0)
#rn2_1 = ResNetNode2Inc(rn1_18, cMap2, 144, 288, kW, kH, convWScale, convBValue, scValue, expAvg, rn2_1_Wproj)
rn2_1 = ResNetNode2Inc2(rn1_18, cMap1, cMap2, 144, 288, kW, kH, convWScale, 3.5, convBValue, scValue, expAvg)
rn2_2 = ResNetNode2(rn2_1, cMap2, 288, kW, kH, convWScale, convBValue, scValue, expAvg)
@@ -74,7 +74,7 @@ DNN=[
rn2_18= ResNetNode2(rn2_17, cMap2, 288, kW, kH, convWScale, convBValue, scValue, expAvg)
cMap3 = 64
#rn3_1_Wproj = Parameter(cMap3, cMap2, init = fromFile, initFromFilePath = "$Proj32to64Filename$", needGradient = false)
#rn3_1_Wproj = Parameter(cMap3, cMap2, init = fromFile, initFromFilePath = "$Proj32to64Filename$", learningRateMultiplier = 0)
#rn3_1 = ResNetNode2Inc(rn2_18, cMap3, 288, 576, kW, kH, convWScale, convBValue, scValue, expAvg, rn3_1_Wproj)
rn3_1 = ResNetNode2Inc2(rn2_18, cMap2, cMap3, 288, 576, kW, kH, convWScale, 3.5, convBValue, scValue, expAvg)
rn3_2 = ResNetNode2(rn3_1, cMap3, 576, kW, kH, convWScale, convBValue, scValue, expAvg)

View file

@@ -11,8 +11,8 @@ ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, expAvg)
{
b = Parameter(outMap, 1, init = fixedValue, value = bValue)
sc = Parameter(outMap, 1, init = fixedValue, value = scValue)
m = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
isd = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
m = Parameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
isd = Parameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = "cudnn")
y = BatchNormalization(c, sc, b, m, isd, eval = false, spatial = true, expAvgFactor = expAvg, imageLayout = "cudnn")
@@ -83,8 +83,8 @@ DnnBNReLULayer(inDim, outDim, x, wScale, bValue, scValue, expAvg)
W = Parameter(outDim, inDim, init = Gaussian, initValueScale = wScale)
b = Parameter(outDim, 1, init = fixedValue, value = bValue)
sc = Parameter(outDim, 1, init = fixedValue, value = scValue)
m = Parameter(outDim, 1, init = fixedValue, value = 0, needGradient = false)
isd = Parameter(outDim, 1, init = fixedValue, value = 0, needGradient = false)
m = Parameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
isd = Parameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
t = Times(W, x)
bn = BatchNormalization(t, sc, b, m, isd, eval = false, spatial = false, expAvgFactor = expAvg)
y = RectifiedLinear(bn)

View file

@@ -2,8 +2,8 @@ ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, expAvg)
{
b = Parameter(outMap, 1, init = fixedValue, value = bValue)
sc = Parameter(outMap, 1, init = fixedValue, value = scValue)
m = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
isd = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
m = Parameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
isd = Parameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = "cudnn")
y = BatchNormalization(c, sc, b, m, isd, eval = false, spatial = true, expAvgFactor = expAvg, epsilon = 0.000000001, imageLayout = "cudnn")

View file

@@ -45,14 +45,14 @@ DNN=[
rn1_3 = ResNetNode2A(rn1_2, cMap1, 576, kW, kH, convWScale, convBValue, scValue, expAvg)
cMap2 = 128
rn2_1_Wproj = Parameter(cMap2, cMap1, init = fromFile, initFromFilePath = "$Proj64to128Filename$", needGradient = false)
rn2_1_Wproj = Parameter(cMap2, cMap1, init = fromFile, initFromFilePath = "$Proj64to128Filename$", learningRateMultiplier = 0)
rn2_1 = ResNetNode2AInc(rn1_3, cMap2, 576, 1152, kW, kH, convWScale, convBValue, scValue, expAvg, rn2_1_Wproj)
rn2_2 = ResNetNode2A(rn2_1, cMap2, 1152, kW, kH, convWScale, convBValue, scValue, expAvg)
rn2_3 = ResNetNode2A(rn2_2, cMap2, 1152, kW, kH, convWScale, convBValue, scValue, expAvg)
rn2_4 = ResNetNode2A(rn2_3, cMap2, 1152, kW, kH, convWScale, convBValue, scValue, expAvg)
cMap3 = 256
rn3_1_Wproj = Parameter(cMap3, cMap2, init = fromFile, initFromFilePath = "$Proj128to256Filename$", needGradient = false)
rn3_1_Wproj = Parameter(cMap3, cMap2, init = fromFile, initFromFilePath = "$Proj128to256Filename$", learningRateMultiplier = 0)
rn3_1 = ResNetNode2AInc(rn2_4, cMap3, 1152, 2304, kW, kH, convWScale, convBValue, scValue, expAvg, rn3_1_Wproj)
rn3_2 = ResNetNode2A(rn3_1, cMap3, 2304, kW, kH, convWScale, convBValue, scValue, expAvg)
rn3_3 = ResNetNode2A(rn3_2, cMap3, 2304, kW, kH, convWScale, convBValue, scValue, expAvg)
@@ -61,7 +61,7 @@ DNN=[
rn3_6 = ResNetNode2A(rn3_5, cMap3, 2304, kW, kH, convWScale, convBValue, scValue, expAvg)
cMap4 = 512
rn4_1_Wproj = Parameter(cMap4, cMap3, init = fromFile, initFromFilePath = "$Proj256to512Filename$", needGradient = false)
rn4_1_Wproj = Parameter(cMap4, cMap3, init = fromFile, initFromFilePath = "$Proj256to512Filename$", learningRateMultiplier = 0)
rn4_1 = ResNetNode2AInc(rn3_6, cMap4, 2304, 4608, kW, kH, convWScale, convBValue, scValue, expAvg, rn4_1_Wproj)
rn4_2 = ResNetNode2A(rn4_1, cMap4, 4608, kW, kH, convWScale, convBValue, scValue, expAvg)
rn4_3 = ResNetNode2A(rn4_2, cMap4, 4608, kW, kH, convWScale, convBValue, scValue, expAvg)

View file

@@ -14,8 +14,8 @@ DnnBNReLULayer(inDim, outDim, x, wScale, bValue)
W = Parameter(outDim, inDim, init = Gaussian, initValueScale = wScale)
b = Parameter(outDim, 1, init = fixedValue, value = bValue)
sc = Parameter(outDim, 1, init = Gaussian, initValueScale = 0.01)
m = Parameter(outDim, 1, init = fixedValue, value = 0, needGradient = false)
isd = Parameter(outDim, 1, init = fixedValue, value = 0, needGradient = false)
m = Parameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
isd = Parameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
t = Times(W, x)
bn = BatchNormalization(t, sc, b, m, isd, eval = false, spatial = false)
y = RectifiedLinear(bn)
@@ -46,8 +46,8 @@ ConvBNReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue,
W = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale)
b = Parameter(outMap, 1, init = fixedValue, value = bValue)
sc = Parameter(outMap, 1, init = Gaussian, initValueScale = scValue)
m = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
isd = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
m = Parameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
isd = Parameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = "cudnn")
bn = BatchNormalization(c, sc, b, m, isd, eval = false, spatial = true, imageLayout = "cudnn")

View file

@@ -259,7 +259,7 @@ void TestConfiguration(const ConfigParameters& configBase)
size_t cols = 0;
if (!IsParameter(paramsMap, configNode[2]))
cols = configNode[2];
bool needGradient = false;
bool needsGradient = false;
bool init = false;
ConfigArray initData;
@@ -268,8 +268,8 @@ void TestConfiguration(const ConfigParameters& configBase)
{
ConfigParameters configParam = configNode[i];
// TODO: update to learningRateMultiplier
if (configParam.Exists("needGradient")) // TODO: should this be a test for 'true' rather than Exists()?
needGradient = true;
if (configParam.Exists("learningRateMultiplier")) // TODO: should this be a test for 'true' rather than Exists()?
needsGradient = (float)configParam("learningRateMultiplier") > 0? true : false;
else if (configParam.Exists("init"))
{
init = true;
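The hunk above replaces the legacy boolean needGradient option with learningRateMultiplier while still deriving a needsGradient value from it. A minimal sketch of that mapping, using only the ConfigParameters calls that appear in the surrounding code (the helper name NeedsGradientFromConfig is hypothetical):

// Hypothetical helper: a parameter needs a gradient iff its learningRateMultiplier
// (defaulting to 1, i.e. learnable) is positive. Mirrors the updated hunk above.
static bool NeedsGradientFromConfig(const ConfigParameters& configParam)
{
    float learningRateMultiplier = 1.0f;
    if (configParam.Exists("learningRateMultiplier"))
        learningRateMultiplier = (float)configParam("learningRateMultiplier");
    return learningRateMultiplier > 0;
}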

View file

@@ -328,7 +328,6 @@ public:
void AddFeatureNode(ComputationNodeBasePtr featureNode);
void RemoveFeatureNode(ComputationNodeBasePtr featureNode);
void SetLearnableNodesBelowLearningRateMultiplier(const float learningRateMultiplier, const ComputationNodeBasePtr& rootNode = nullptr);
void SetLearnableNodesBelowNeedGradient(const bool needGradient, const ComputationNodeBasePtr& rootNode = nullptr); // for backward compatibility
void SetBatchNormlizationNodesBelowEvalMode(const bool evalMode, const ComputationNodeBasePtr& rootNode = nullptr);
// -----------------------------------------------------------------------

View file

@@ -323,32 +323,6 @@ void ComputationNetwork::SetLearnableNodesBelowLearningRateMultiplier(const floa
}
}
// sets m_learningRateMultiplier in all LearnableParameters feeding into the passed rootNode
// Called from MEL
// TODO: This function should be implemented using the above. No code dup please!
void ComputationNetwork::SetLearnableNodesBelowNeedGradient(const bool needGradient, const ComputationNodeBasePtr& rootNode)
{
// find nodes from all available nodes
if (rootNode == nullptr)
{
for (auto nodeIter = m_nameToNodeMap.begin(); nodeIter != m_nameToNodeMap.end(); nodeIter++)
{
ComputationNodeBasePtr node = nodeIter->second;
if (node->OperationName() == OperationNameOf(LearnableParameter))
node->SetLearningRateMultiplier((float)needGradient);
}
}
else
{
// for calculating a specific node
for (const auto& node : GetEvalOrder(rootNode))
{
if (node->OperationName() == OperationNameOf(LearnableParameter))
node->SetLearningRateMultiplier((float)needGradient);
}
}
}
void ComputationNetwork::SetBatchNormlizationNodesBelowEvalMode(const bool evalMode, const ComputationNodeBasePtr& rootNode /* = nullptr */)
{
vector<ComputationNodeBasePtr> nodes;
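With SetLearnableNodesBelowNeedGradient removed in this hunk, callers (for example MEL scripts driving model editing) use the retained SetLearnableNodesBelowLearningRateMultiplier instead; the old boolean maps onto a multiplier of 1 (learn) or 0 (freeze). A minimal, hypothetical usage sketch assuming a ComputationNetwork instance net and a root node rootNode:

// Before (removed API):  net.SetLearnableNodesBelowNeedGradient(false, rootNode);
// Equivalent call today: a multiplier of 0 freezes every LearnableParameter that
// feeds into rootNode; any positive value keeps it trainable.
net.SetLearnableNodesBelowLearningRateMultiplier(0.0f, rootNode);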

View file

@@ -288,7 +288,7 @@ ComputationNetwork::PARTraversalFlowControlNode::PARTraversalFlowControlNode(con
{
for (auto nodeIter = m_nestedNodes.rbegin(); nodeIter != m_nestedNodes.rend(); ++nodeIter)
{
if ((*nodeIter)->NeedGradient())
if ((*nodeIter)->NeedsGradient())
(*nodeIter)->ReleaseMatricesAfterBackprop(matrixPool);
}
}
@@ -828,7 +828,7 @@ void ComputationNetwork::AllocateAllMatrices(const std::vector<ComputationNodeBa
// PAR mode: we can allocate and immediately deallocate one by one
n->AllocateGradientMatricesForInputs(m_matrixPool);
// Root node's information will be used and should not be shared with others, also it's small (1x1)
if ((n != trainRootNode) && n->NeedGradient())
if ((n != trainRootNode) && n->NeedsGradient())
n->ReleaseMatricesAfterBackprop(m_matrixPool);
}
}

View file

@@ -585,7 +585,7 @@ public:
fprintf(stderr, "Node --> %ls = %ls\n", NodeName().c_str(), OperationName().c_str()), fflush(stderr);
}
bool NeedGradient() const { return m_needsGradient; }
bool NeedsGradient() const { return m_needsGradient; }
void SetLearningRateMultiplier(float f)
{
@@ -1382,7 +1382,7 @@ public:
{
for (int i = 0; i < m_inputs.size(); i++)
{
if (m_inputs[i]->NeedGradient())
if (m_inputs[i]->NeedsGradient())
m_inputs[i]->RequestMatricesBeforeBackprop(matrixPool);
}
}

View file

@@ -65,7 +65,11 @@ public:
// TODO: Change dimensions to take a generic tensor instead. That will be a (minor) breaking change that will require fix-ups when converting from NDL to BrainScript.
AttachInputs(configp, this->GetExpectedNumInputs());
// parameters[rows, [cols=1]] plus other optional parameters (learningRateMultiplier=[1|0|float], init=[uniform|gaussian|fixedvalue], initValueScale=[1|float], value=[0|float])
SetLearningRateMultiplier(configp->Get(L"learningRateMultiplier"));
if (configp->Exists(L"learningRateMultiplier"))
SetLearningRateMultiplier(configp->Get(L"learningRateMultiplier"));
else if (configp->Exists(L"needsGradient") || configp->Exists(L"needGradient") || configp->Exists(L"computeGradient"))
InvalidArgument("needsGradient|needGradient|computeGradient are not supported in BrainScript. Use learningRateMultiplier instead.");
wstring initString = configp->Get(L"init");
if (initString == L"fixedValue")
Value().SetValue((ElemType) configp->Get(L"value"));
@@ -263,7 +267,7 @@ public:
char str[4096];
sprintf(str, "[%lu,%lu] ", GetAsMatrixNumRows(), GetAsMatrixNumCols());
fstream << string(str);
sprintf(str, "learningRateMultiplier=%f NeedGradient=%s", m_learningRateMultiplier, m_learningRateMultiplier>0 ? "true" : "false"); // TODO: update NDL to accept a better matching name as well
sprintf(str, "learningRateMultiplier=%f NeedsGradient=%s", m_learningRateMultiplier, m_learningRateMultiplier>0 ? "true" : "false"); // TODO: update NDL to accept a better matching name as well
fstream << string(str);
}

View file

@@ -366,7 +366,7 @@ public:
virtual void AllocateGradientMatricesForInputs(MatrixPool& matrixPool) override
{
// this is a special handling case. We need to allocate sparse matrix directly instead of from pool.
if (Input(0)->NeedGradient() && Input(1)->Value().GetMatrixType() == SPARSE)
if (Input(0)->NeedsGradient() && Input(1)->Value().GetMatrixType() == SPARSE)
{
Input(0)->CreateGradientMatrixIfNull();
Input(0)->Gradient().SwitchToMatrixType(SPARSE, MatrixFormat::matrixFormatSparseBlockCol, false);

View file

@@ -436,7 +436,7 @@ public:
virtual void /*ComputationNode::*/ BackpropTo(const size_t /*inputIndex*/, const FrameRange&) override
{
InvalidArgument("PerDimMeanVarNormalizationNode should only be called in the evaluation stage.");
InvalidArgument("PerDimMeanVarNormalizationNode should only be called in the evaluation stage. Is any of its descendents a learnable parameter that requires gradient?");
}
virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override
@@ -506,10 +506,6 @@ public:
InvalidArgument("PerDimMeanVarNormalizationNode: All inputs should have same sample layout.");
}
// TODO: Is this correct? Why not just skip propagating a gradient into these? We should not poke around in our children.
Input(1)->SetLearningRateMultiplier(0); // prevent learning
Input(2)->SetLearningRateMultiplier(0);
SetDims(Input(0));
}
};
@@ -540,7 +536,7 @@ public:
virtual void /*ComputationNode::*/ BackpropTo(const size_t /*inputIndex*/, const FrameRange&) override
{
InvalidArgument("PerDimMeanVarDeNormalizationNode should only be called in the evaluation stage.");
InvalidArgument("PerDimMeanVarDeNormalizationNode should only be called in the evaluation stage. Is any of its descendents a learnable parameter that requires gradient?");
}
// (feature-mean).*InvStdDev
@@ -618,10 +614,6 @@ public:
InvalidArgument("PerDimMeanVarDeNormalizationNode: All inputs should have same sample layout.");
}
// TODO: Is this correct? Why not just skip propagating a gradient into these? We should not poke around in our children.
Input(1)->SetLearningRateMultiplier(0); // prevent learning
Input(2)->SetLearningRateMultiplier(0);
SetDims(Input(0));
}
};

View file

@@ -519,10 +519,7 @@ public:
private:
void FindBestForwardAlgo(const CuDnnTensor4D& inT, const CuDnnFilter& filtT, const CuDnnConvolutionDescriptor& convDesc, const CuDnnTensor4D& outT)
{
// Need to re-run auto-tuner in case batch size has been changed.
// We assume no other dimensions of tensors can change so we don't check it.
// REVIEW alexeyk: is this a safe assumption? Can convolution configuration change in runtime?
if (m_fwdAlgo.Algo.status == CUDNN_STATUS_SUCCESS && inT.n() == m_fwdAlgo.CurMBSize && outT.n() == m_fwdAlgo.CurMBSize)
if (!m_fwdAlgo.NeedAutotuning(inT, outT))
return;
const int MaxAlgoCount = 10;
int calgo = 0;
@@ -543,7 +540,7 @@ private:
void FindBestBackwardDataAlgo(const CuDnnFilter& filtT, const CuDnnTensor4D& srcGradT, const CuDnnConvolutionDescriptor& convDesc, const CuDnnTensor4D& gradT)
{
if (m_backDataAlgo.Algo.status == CUDNN_STATUS_SUCCESS && srcGradT.n() == m_backDataAlgo.CurMBSize && gradT.n() == m_backDataAlgo.CurMBSize)
if (!m_backDataAlgo.NeedAutotuning(srcGradT, gradT))
return;
const int MaxAlgoCount = 10;
int calgo = 0;
@@ -564,7 +561,7 @@ private:
void FindBestBackwardFilterAlgo(const CuDnnTensor4D& inT, const CuDnnTensor4D& srcGradT, const CuDnnConvolutionDescriptor& convDesc, const CuDnnFilter& filtT)
{
if (m_backFiltAlgo.Algo.status == CUDNN_STATUS_SUCCESS && inT.n() == m_backFiltAlgo.CurMBSize && srcGradT.n() == m_backFiltAlgo.CurMBSize)
if (!m_backFiltAlgo.NeedAutotuning(inT, srcGradT))
return;
const int MaxAlgoCount = 10;
int calgo = 0;
@@ -595,6 +592,16 @@ private:
// Current mini-batch size, needed for re-computing statistics in auto-tuner.
size_t CurMBSize;
T Algo;
bool NeedAutotuning(const CuDnnTensor4D& t1, const CuDnnTensor4D& t2)
{
// Need to re-run auto-tuner in case minibatch size is increased.
// If minibatch size is decreased we assume that previously selected algorithm requires less or the same amount of workspace.
// This is done to avoid re-running auto-tuner every time in case minibatch size changes frequently (e.g. when distributed reading is enabled).
// REVIEW alexeyk: potentially, this might cause some perf issues if better (faster) algo can be selected for a smaller minibatch.
// We assume no other dimensions of tensors can change so we don't check it.
return (Algo.status != CUDNN_STATUS_SUCCESS || t1.n() > CurMBSize || t2.n() > CurMBSize);
}
};
using C = Consts<ElemType>;
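The new NeedAutotuning helper encodes a grow-only policy: the auto-tuner reruns only when the minibatch size exceeds the size the current algorithm was tuned for, since a smaller minibatch is assumed to fit in the workspace already selected. A simplified, self-contained analogue of that check (CachedAlgo is a stand-in, not the actual CuDnn wrapper types):

#include <cstddef>

struct CachedAlgo
{
    bool   valid = false;   // plays the role of Algo.status == CUDNN_STATUS_SUCCESS
    size_t tunedMBSize = 0; // plays the role of CurMBSize

    bool NeedAutotuning(size_t newMBSize) const
    {
        // Re-tune when nothing valid is cached or the minibatch has grown.
        return !valid || newMBSize > tunedMBSize;
    }
};

Tuned once at a minibatch of 64, a later minibatch of 32 reuses the cached algorithm, while a minibatch of 128 triggers a new search (after which the cached size would presumably be updated by the surrounding tuning code, which is not shown in this hunk).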