Merge branch 'dongyu/needsGradientChange' of https://github.com/Microsoft/CNTK into fseide/s2s
Conflicts:
	Source/CNTK/tests.cpp
	Source/ComputationNetworkLib/ComputationNetwork.h
	Source/ComputationNetworkLib/ComputationNetworkEditing.cpp
	Source/ComputationNetworkLib/InputAndParamNodes.h
	Source/ComputationNetworkLib/PreComputeNodes.h
This commit is contained in:
32004edfa2
@@ -1652,7 +1652,7 @@ LearnableParameter(row, cols,
 \begin_layout Plain Layout

-Parameter(row, cols, {needGradient=true|false,
+Parameter(row, cols, {needsGradient=true|false,
 \end_layout

 \begin_layout Plain Layout
@@ -1680,7 +1680,7 @@ cols - number of columns in the parameter, defaults to 1.
 \end_layout

 \begin_layout Itemize
-needGradient- [named optional] determines whether the parameter should be
+needsGradient- [named optional] determines whether the parameter should be
 updated by the training algorithm.
 Defaults is true.
 \end_layout
@@ -4221,7 +4221,7 @@ features=InputValue [784,32]
 \begin_layout Plain Layout

-L1.BFF.B=LearnableParameter [256,1] NeedGradient=true
+L1.BFF.B=LearnableParameter [256,1] NeedsGradient=true
 \end_layout

 \begin_layout Plain Layout
@@ -4266,7 +4266,7 @@ L1.BFF.FF.T=Times ( L1.BFF.W , normInput )
 \begin_layout Plain Layout

-L1.BFF.W=LearnableParameter [256,784] NeedGradient=true
+L1.BFF.W=LearnableParameter [256,784] NeedsGradient=true
 \end_layout

 \begin_layout Plain Layout
@@ -4023,7 +4023,7 @@ In many cases, not all the gradients need to be computed.
 updated and thus it is unnecessary to compute the gradients with regard
 to these parameters.
 We can reduce the gradient computation by keeping a
-\begin_inset Formula $needGradient$
+\begin_inset Formula $needsGradient$
 \end_inset

 flag for each node.
@@ -4032,7 +4032,7 @@ In many cases, not all the gradients need to be computed.
 \begin_inset CommandInset ref
 LatexCommand ref
-reference "alg:CN-NeedGradient"
+reference "alg:CN-needsGradient"

 \end_inset

@@ -4047,7 +4047,7 @@ reference "alg:CN-ForwardOrder-DAG"
 and
 \begin_inset CommandInset ref
 LatexCommand ref
-reference "alg:CN-NeedGradient"
+reference "alg:CN-needsGradient"

 \end_inset

@@ -4101,7 +4101,7 @@ status collapsed
 \end_inset

-UpdateNeedGradientFlag
+UpdateneedsGradientFlag
 \begin_inset ERT
 status collapsed
@@ -4427,7 +4427,7 @@ State
 call
 \noun on
-UpdateNeedGradientFlag
+UpdateneedsGradientFlag
 \noun default
 (
 \begin_inset Formula $c$
@@ -4511,7 +4511,7 @@ status open
 \end_inset

-\begin_inset Formula $node.AnyChildNeedGradient()$
+\begin_inset Formula $node.AnyChildneedsGradient()$
 \end_inset

@@ -4566,7 +4566,7 @@ State
 \noun default
 \color inherit
-\begin_inset Formula $node.needGradient\leftarrow true$
+\begin_inset Formula $node.needsGradient\leftarrow true$
 \end_inset

@@ -4626,7 +4626,7 @@ State
 \noun default
 \color inherit
-\begin_inset Formula $node.needGradient\leftarrow false$
+\begin_inset Formula $node.needsGradient\leftarrow false$
 \end_inset

@@ -4730,14 +4730,14 @@ end{algorithmic}
 \begin_layout Plain Layout
 Update the
-\begin_inset Formula $needGradient$
+\begin_inset Formula $needsGradient$
 \end_inset

 flag recursively.
 \begin_inset CommandInset label
 LatexCommand label
-name "alg:CN-NeedGradient"
+name "alg:CN-needsGradient"

 \end_inset
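For readers following the documentation hunks above, here is a minimal sketch of the flag-propagation rule that the renamed UpdateNeedsGradientFlag algorithm describes: a node needs a gradient if it is itself a trainable parameter (learningRateMultiplier > 0 under the new naming) or if any of its inputs needs one, computed in forward (children-before-parents) order. The `Node` struct and field names below are illustrative placeholders, not CNTK's actual classes.

```cpp
#include <memory>
#include <vector>

// Illustrative stand-in for a computation node; not the real CNTK class.
struct Node
{
    std::vector<std::shared_ptr<Node>> inputs;   // children, already visited in forward order
    bool isLearnableParameter = false;
    float learningRateMultiplier = 1.0f;         // 0 means "do not update" (replaces needGradient=false)
    bool needsGradient = false;
};

// Propagate the needsGradient flag over nodes listed in forward (evaluation) order,
// i.e. every node appears after all of its inputs.
void UpdateNeedsGradientFlag(const std::vector<std::shared_ptr<Node>>& forwardOrder)
{
    for (const auto& node : forwardOrder)
    {
        bool anyChildNeedsGradient = false;
        for (const auto& child : node->inputs)
            anyChildNeedsGradient = anyChildNeedsGradient || child->needsGradient;

        // A node needs a gradient if it is itself trainable or feeds off something trainable.
        node->needsGradient = (node->isLearnableParameter && node->learningRateMultiplier > 0)
                              || anyChildNeedsGradient;
    }
}
```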
@@ -17,8 +17,8 @@ DnnBNReLULayer(inDim, outDim, x, wScale, bValue, scValue, expAvg) = [
     W = LearnableParameter(outDim, inDim, init = Gaussian, initValueScale = wScale)
     b = LearnableParameter(outDim, 1, init = fixedValue, value = bValue)
     sc = LearnableParameter(outDim, 1, init = fixedValue, value = scValue)
-    m = LearnableParameter(outDim, 1, init = fixedValue, value = 0, needGradient = false)
-    isd = LearnableParameter(outDim, 1, init = fixedValue, value = 0, needGradient = false)
+    m = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
+    isd = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
     t = Times(W, x)
     bn = BatchNormalization(t, sc, b, m, isd, eval = false, spatial = false, expAvgFactor = expAvg)
     y = RectifiedLinear(bn)
@@ -35,8 +35,8 @@ ConvReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue) =
 ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, expAvg) = [
     b = LearnableParameter(outMap, 1, init=fixedValue, value=bValue)
     sc = LearnableParameter(outMap, 1, init=fixedValue, value=scValue)
-    m = LearnableParameter(outMap, 1, init=fixedValue, value=0, needGradient=false)
-    isd = LearnableParameter(outMap, 1, init=fixedValue, value=0, needGradient=false)
+    m = LearnableParameter(outMap, 1, init=fixedValue, value=0, learningRateMultiplier=0)
+    isd = LearnableParameter(outMap, 1, init=fixedValue, value=0, learningRateMultiplier=0)

     c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding=true, imageLayout=$imageLayout$)
     y = BatchNormalization(c, sc, b, m, isd, eval=false, spatial=true, expAvgFactor=expAvg, imageLayout=$imageLayout$)
@@ -37,14 +37,14 @@ DNN=[
     rn1_3 = ResNetNode2(rn1_2, cMap1, 144, kW, kH, convWScale, convBValue, scValue, expAvg)

     cMap2 = 32
-    #rn2_1_Wproj = Parameter(cMap2, cMap1, init = fromFile, initFromFilePath = "$Proj16to32Filename$", needGradient = false)
+    #rn2_1_Wproj = Parameter(cMap2, cMap1, init = fromFile, initFromFilePath = "$Proj16to32Filename$", learningRateMultiplier = 0)
     #rn2_1 = ResNetNode2Inc(rn1_3, cMap2, 144, 288, kW, kH, convWScale, convBValue, scValue, expAvg, rn2_1_Wproj)
     rn2_1 = ResNetNode2Inc2(rn1_3, cMap1, cMap2, 144, 288, kW, kH, convWScale, 3.5, convBValue, scValue, expAvg)
     rn2_2 = ResNetNode2(rn2_1, cMap2, 288, kW, kH, convWScale, convBValue, scValue, expAvg)
     rn2_3 = ResNetNode2(rn2_2, cMap2, 288, kW, kH, convWScale, convBValue, scValue, expAvg)

     cMap3 = 64
-    #rn3_1_Wproj = Parameter(cMap3, cMap2, init = fromFile, initFromFilePath = "$Proj32to64Filename$", needGradient = false)
+    #rn3_1_Wproj = Parameter(cMap3, cMap2, init = fromFile, initFromFilePath = "$Proj32to64Filename$", learningRateMultiplier = 0)
     #rn3_1 = ResNetNode2Inc(rn2_3, cMap3, 288, 576, kW, kH, convWScale, convBValue, scValue, expAvg, rn3_1_Wproj)
     rn3_1 = ResNetNode2Inc2(rn2_3, cMap2, cMap3, 288, 576, kW, kH, convWScale, 3.5, convBValue, scValue, expAvg)
     rn3_2 = ResNetNode2(rn3_1, cMap3, 576, kW, kH, convWScale, convBValue, scValue, expAvg)
@@ -52,7 +52,7 @@ DNN=[
     rn1_18= ResNetNode2(rn1_17, cMap1, 144, kW, kH, convWScale, convBValue, scValue, expAvg)

     cMap2 = 32
-    #rn2_1_Wproj = Parameter(cMap2, cMap1, init = fromFile, initFromFilePath = "$Proj16to32Filename$", needGradient = false)
+    #rn2_1_Wproj = Parameter(cMap2, cMap1, init = fromFile, initFromFilePath = "$Proj16to32Filename$", learningRateMultiplier = 0)
     #rn2_1 = ResNetNode2Inc(rn1_18, cMap2, 144, 288, kW, kH, convWScale, convBValue, scValue, expAvg, rn2_1_Wproj)
     rn2_1 = ResNetNode2Inc2(rn1_18, cMap1, cMap2, 144, 288, kW, kH, convWScale, 3.5, convBValue, scValue, expAvg)
     rn2_2 = ResNetNode2(rn2_1, cMap2, 288, kW, kH, convWScale, convBValue, scValue, expAvg)
@@ -74,7 +74,7 @@ DNN=[
     rn2_18= ResNetNode2(rn2_17, cMap2, 288, kW, kH, convWScale, convBValue, scValue, expAvg)

     cMap3 = 64
-    #rn3_1_Wproj = Parameter(cMap3, cMap2, init = fromFile, initFromFilePath = "$Proj32to64Filename$", needGradient = false)
+    #rn3_1_Wproj = Parameter(cMap3, cMap2, init = fromFile, initFromFilePath = "$Proj32to64Filename$", learningRateMultiplier = 0)
     #rn3_1 = ResNetNode2Inc(rn2_18, cMap3, 288, 576, kW, kH, convWScale, convBValue, scValue, expAvg, rn3_1_Wproj)
     rn3_1 = ResNetNode2Inc2(rn2_18, cMap2, cMap3, 288, 576, kW, kH, convWScale, 3.5, convBValue, scValue, expAvg)
     rn3_2 = ResNetNode2(rn3_1, cMap3, 576, kW, kH, convWScale, convBValue, scValue, expAvg)
@@ -11,8 +11,8 @@ ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, expAvg)
 {
     b = Parameter(outMap, 1, init = fixedValue, value = bValue)
     sc = Parameter(outMap, 1, init = fixedValue, value = scValue)
-    m = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
-    isd = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
+    m = Parameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
+    isd = Parameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)

     c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = "cudnn")
     y = BatchNormalization(c, sc, b, m, isd, eval = false, spatial = true, expAvgFactor = expAvg, imageLayout = "cudnn")
@@ -83,8 +83,8 @@ DnnBNReLULayer(inDim, outDim, x, wScale, bValue, scValue, expAvg)
     W = Parameter(outDim, inDim, init = Gaussian, initValueScale = wScale)
     b = Parameter(outDim, 1, init = fixedValue, value = bValue)
     sc = Parameter(outDim, 1, init = fixedValue, value = scValue)
-    m = Parameter(outDim, 1, init = fixedValue, value = 0, needGradient = false)
-    isd = Parameter(outDim, 1, init = fixedValue, value = 0, needGradient = false)
+    m = Parameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
+    isd = Parameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
     t = Times(W, x)
     bn = BatchNormalization(t, sc, b, m, isd, eval = false, spatial = false, expAvgFactor = expAvg)
     y = RectifiedLinear(bn)
@@ -2,8 +2,8 @@ ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, expAvg)
 {
     b = Parameter(outMap, 1, init = fixedValue, value = bValue)
     sc = Parameter(outMap, 1, init = fixedValue, value = scValue)
-    m = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
-    isd = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
+    m = Parameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
+    isd = Parameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)

     c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = "cudnn")
     y = BatchNormalization(c, sc, b, m, isd, eval = false, spatial = true, expAvgFactor = expAvg, epsilon = 0.000000001, imageLayout = "cudnn")
@@ -45,14 +45,14 @@ DNN=[
     rn1_3 = ResNetNode2A(rn1_2, cMap1, 576, kW, kH, convWScale, convBValue, scValue, expAvg)

     cMap2 = 128
-    rn2_1_Wproj = Parameter(cMap2, cMap1, init = fromFile, initFromFilePath = "$Proj64to128Filename$", needGradient = false)
+    rn2_1_Wproj = Parameter(cMap2, cMap1, init = fromFile, initFromFilePath = "$Proj64to128Filename$", learningRateMultiplier = 0)
     rn2_1 = ResNetNode2AInc(rn1_3, cMap2, 576, 1152, kW, kH, convWScale, convBValue, scValue, expAvg, rn2_1_Wproj)
     rn2_2 = ResNetNode2A(rn2_1, cMap2, 1152, kW, kH, convWScale, convBValue, scValue, expAvg)
     rn2_3 = ResNetNode2A(rn2_2, cMap2, 1152, kW, kH, convWScale, convBValue, scValue, expAvg)
     rn2_4 = ResNetNode2A(rn2_3, cMap2, 1152, kW, kH, convWScale, convBValue, scValue, expAvg)

     cMap3 = 256
-    rn3_1_Wproj = Parameter(cMap3, cMap2, init = fromFile, initFromFilePath = "$Proj128to256Filename$", needGradient = false)
+    rn3_1_Wproj = Parameter(cMap3, cMap2, init = fromFile, initFromFilePath = "$Proj128to256Filename$", learningRateMultiplier = 0)
     rn3_1 = ResNetNode2AInc(rn2_4, cMap3, 1152, 2304, kW, kH, convWScale, convBValue, scValue, expAvg, rn3_1_Wproj)
     rn3_2 = ResNetNode2A(rn3_1, cMap3, 2304, kW, kH, convWScale, convBValue, scValue, expAvg)
     rn3_3 = ResNetNode2A(rn3_2, cMap3, 2304, kW, kH, convWScale, convBValue, scValue, expAvg)
@@ -61,7 +61,7 @@ DNN=[
     rn3_6 = ResNetNode2A(rn3_5, cMap3, 2304, kW, kH, convWScale, convBValue, scValue, expAvg)

     cMap4 = 512
-    rn4_1_Wproj = Parameter(cMap4, cMap3, init = fromFile, initFromFilePath = "$Proj256to512Filename$", needGradient = false)
+    rn4_1_Wproj = Parameter(cMap4, cMap3, init = fromFile, initFromFilePath = "$Proj256to512Filename$", learningRateMultiplier = 0)
     rn4_1 = ResNetNode2AInc(rn3_6, cMap4, 2304, 4608, kW, kH, convWScale, convBValue, scValue, expAvg, rn4_1_Wproj)
     rn4_2 = ResNetNode2A(rn4_1, cMap4, 4608, kW, kH, convWScale, convBValue, scValue, expAvg)
     rn4_3 = ResNetNode2A(rn4_2, cMap4, 4608, kW, kH, convWScale, convBValue, scValue, expAvg)
@@ -14,8 +14,8 @@ DnnBNReLULayer(inDim, outDim, x, wScale, bValue)
     W = Parameter(outDim, inDim, init = Gaussian, initValueScale = wScale)
     b = Parameter(outDim, 1, init = fixedValue, value = bValue)
     sc = Parameter(outDim, 1, init = Gaussian, initValueScale = 0.01)
-    m = Parameter(outDim, 1, init = fixedValue, value = 0, needGradient = false)
-    isd = Parameter(outDim, 1, init = fixedValue, value = 0, needGradient = false)
+    m = Parameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
+    isd = Parameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
     t = Times(W, x)
     bn = BatchNormalization(t, sc, b, m, isd, eval = false, spatial = false)
     y = RectifiedLinear(bn)
@@ -46,8 +46,8 @@ ConvBNReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue,
     W = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale)
     b = Parameter(outMap, 1, init = fixedValue, value = bValue)
     sc = Parameter(outMap, 1, init = Gaussian, initValueScale = scValue)
-    m = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
-    isd = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
+    m = Parameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
+    isd = Parameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)

     c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = "cudnn")
     bn = BatchNormalization(c, sc, b, m, isd, eval = false, spatial = true, imageLayout = "cudnn")
@@ -259,7 +259,7 @@ void TestConfiguration(const ConfigParameters& configBase)
             size_t cols = 0;
             if (!IsParameter(paramsMap, configNode[2]))
                 cols = configNode[2];
-            bool needGradient = false;
+            bool learningRateMultiplier = 0;
             bool init = false;
             ConfigArray initData;
@@ -268,8 +268,8 @@ void TestConfiguration(const ConfigParameters& configBase)
            {
                ConfigParameters configParam = configNode[i];
-               if (configParam.Exists("needGradient")) // TODO: should this be a test for 'true' rather than Exists()?
-                   needGradient = true;
+               // TODO: update to learningRateMultiplier
+               if (configParam.Exists("learningRateMultiplier")) // TODO: should this be a test for 'true' rather than Exists()?
+                   needsGradient = (float)configParam("learningRateMultiplier") > 0? true : false;
                else if (configParam.Exists("init"))
                {
                    init = true;
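Note that the added lines in this hunk declare `bool learningRateMultiplier = 0;` but later assign to a variable named `needsGradient`. The following is only a hedged sketch of what the parsing appears to intend (read the multiplier as a float, derive the gradient flag from it); the config-lookup helper and names are illustrative stand-ins, not the real ConfigParameters API.

```cpp
#include <map>
#include <string>

// Illustrative stand-in for the config lookup used above; not the real ConfigParameters class.
static bool TryGetFloat(const std::map<std::string, std::string>& params,
                        const std::string& key, float& value)
{
    auto it = params.find(key);
    if (it == params.end())
        return false;
    value = std::stof(it->second);
    return true;
}

// Derive the needs-gradient flag from an optional learningRateMultiplier entry,
// defaulting to "trainable" when the key is absent.
static bool NeedsGradientFromConfig(const std::map<std::string, std::string>& params)
{
    float learningRateMultiplier = 1.0f;
    if (TryGetFloat(params, "learningRateMultiplier", learningRateMultiplier))
        return learningRateMultiplier > 0;
    return true;
}
```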
@@ -328,7 +328,6 @@ public:
     void AddFeatureNode(ComputationNodeBasePtr featureNode);
     void RemoveFeatureNode(ComputationNodeBasePtr featureNode);
     void SetLearnableNodesBelowLearningRateMultiplier(const float learningRateMultiplier, const ComputationNodeBasePtr& rootNode = nullptr);
-    void SetLearnableNodesBelowNeedGradient(const bool needGradient, const ComputationNodeBasePtr& rootNode = nullptr); // for backward compatibility
     void SetBatchNormlizationNodesBelowEvalMode(const bool evalMode, const ComputationNodeBasePtr& rootNode = nullptr);

     // -----------------------------------------------------------------------
@@ -323,32 +323,6 @@ void ComputationNetwork::SetLearnableNodesBelowLearningRateMultiplier(const floa
     }
 }

-// sets m_learningRateMultiplier in all LearnableParameters feeding into the passed rootNode
-// Called from MEL
-// TODO: This function should be implemented using teh above. No code dup please!
-void ComputationNetwork::SetLearnableNodesBelowNeedGradient(const bool needGradient, const ComputationNodeBasePtr& rootNode)
-{
-    // find nodes from all available nodes
-    if (rootNode == nullptr)
-    {
-        for (auto nodeIter = m_nameToNodeMap.begin(); nodeIter != m_nameToNodeMap.end(); nodeIter++)
-        {
-            ComputationNodeBasePtr node = nodeIter->second;
-            if (node->OperationName() == OperationNameOf(LearnableParameter))
-                node->SetLearningRateMultiplier((float)needGradient);
-        }
-    }
-    else
-    {
-        // for calculating a specific node
-        for (const auto& node : GetEvalOrder(rootNode))
-        {
-            if (node->OperationName() == OperationNameOf(LearnableParameter))
-                node->SetLearningRateMultiplier((float)needGradient);
-        }
-    }
-}
-
 void ComputationNetwork::SetBatchNormlizationNodesBelowEvalMode(const bool evalMode, const ComputationNodeBasePtr& rootNode /* = nullptr */)
 {
     vector<ComputationNodeBasePtr> nodes;
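The removed legacy entry point only mapped the boolean onto a 0/1 multiplier. If backward compatibility were still wanted, the TODO in the deleted comment suggests expressing it through the new method instead of duplicating the traversal; a minimal sketch of that shim, assuming the ComputationNetwork API shown in this diff, could look like the following (illustrative only, not part of this commit).

```cpp
// Hedged sketch: a backward-compatibility shim expressed through the new API,
// as the deleted TODO suggests. Assumes the methods declared earlier in this diff.
void ComputationNetwork::SetLearnableNodesBelowNeedGradient(const bool needGradient,
                                                            const ComputationNodeBasePtr& rootNode)
{
    // Map the legacy boolean onto the new learning-rate multiplier: true -> 1, false -> 0.
    SetLearnableNodesBelowLearningRateMultiplier(needGradient ? 1.0f : 0.0f, rootNode);
}
```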
@@ -288,7 +288,7 @@ ComputationNetwork::PARTraversalFlowControlNode::PARTraversalFlowControlNode(con
 {
     for (auto nodeIter = m_nestedNodes.rbegin(); nodeIter != m_nestedNodes.rend(); ++nodeIter)
     {
-        if ((*nodeIter)->NeedGradient())
+        if ((*nodeIter)->NeedsGradient())
             (*nodeIter)->ReleaseMatricesAfterBackprop(matrixPool);
     }
 }
@@ -828,7 +828,7 @@ void ComputationNetwork::AllocateAllMatrices(const std::vector<ComputationNodeBa
             // PAR mode: we can allocate and immediately deallocate one by one
             n->AllocateGradientMatricesForInputs(m_matrixPool);
             // Root node's information will be used and should not be shared with others, also it's small (1x1)
-            if ((n != trainRootNode) && n->NeedGradient())
+            if ((n != trainRootNode) && n->NeedsGradient())
                 n->ReleaseMatricesAfterBackprop(m_matrixPool);
         }
     }
@@ -585,7 +585,7 @@ public:
         fprintf(stderr, "Node --> %ls = %ls\n", NodeName().c_str(), OperationName().c_str()), fflush(stderr);
     }

-    bool NeedGradient() const { return m_needsGradient; }
+    bool NeedsGradient() const { return m_needsGradient; }

     void SetLearningRateMultiplier(float f)
     {
@@ -1382,7 +1382,7 @@ public:
     {
         for (int i = 0; i < m_inputs.size(); i++)
         {
-            if (m_inputs[i]->NeedGradient())
+            if (m_inputs[i]->NeedsGradient())
                 m_inputs[i]->RequestMatricesBeforeBackprop(matrixPool);
         }
     }
@@ -65,7 +65,11 @@ public:
         // TODO: Change dimensions to take a generic tensor instead. That will be a (minor) breaking change that will require fix-ups when converting from NDL to BrainScript.
         AttachInputs(configp, this->GetExpectedNumInputs());
         // parameters[rows, [cols=1]] plus other optional parameters (learningRateMultiplier=[1|0|float], init=[uniform|gaussian|fixedvalue], initValueScale=[1|float], value=[0|float])
-        SetLearningRateMultiplier(configp->Get(L"learningRateMultiplier"));
+        if (configp->Exists(L"learningRateMultiplier"))
+            SetLearningRateMultiplier(configp->Get(L"learningRateMultiplier"));
+        else if (configp->Exists(L"needsGradient") || configp->Exists(L"needGradient") || configp->Exists(L"computeGradient"))
+            InvalidArgument("needsGradient|needGradient|computeGradient are not supported in BrainScript. Use learningRateMultiplier instead.");

         wstring initString = configp->Get(L"init");
         if (initString == L"fixedValue")
             Value().SetValue((ElemType) configp->Get(L"value"));
@@ -263,7 +267,7 @@ public:
         char str[4096];
         sprintf(str, "[%lu,%lu] ", GetAsMatrixNumRows(), GetAsMatrixNumCols());
         fstream << string(str);
-        sprintf(str, "learningRateMultiplier=%f NeedGradient=%s", m_learningRateMultiplier, m_learningRateMultiplier>0 ? "true" : "false"); // TODO: update NDL to accept a better matching name as well
+        sprintf(str, "learningRateMultiplier=%f NeedsGradient=%s", m_learningRateMultiplier, m_learningRateMultiplier>0 ? "true" : "false"); // TODO: update NDL to accept a better matching name as well
         fstream << string(str);
     }
@@ -366,7 +366,7 @@ public:
     virtual void AllocateGradientMatricesForInputs(MatrixPool& matrixPool) override
     {
         // this is a special handling case. We need to allocate sparse matrix directly instead of from pool.
-        if (Input(0)->NeedGradient() && Input(1)->Value().GetMatrixType() == SPARSE)
+        if (Input(0)->NeedsGradient() && Input(1)->Value().GetMatrixType() == SPARSE)
         {
             Input(0)->CreateGradientMatrixIfNull();
             Input(0)->Gradient().SwitchToMatrixType(SPARSE, MatrixFormat::matrixFormatSparseBlockCol, false);
@@ -436,7 +436,7 @@ public:

     virtual void /*ComputationNode::*/ BackpropTo(const size_t /*inputIndex*/, const FrameRange&) override
     {
-        InvalidArgument("PerDimMeanVarNormalizationNode should only be called in the evaluation stage.");
+        InvalidArgument("PerDimMeanVarNormalizationNode should only be called in the evaluation stage. Is any of its descendents a learnable parameter that requires gradient?");
     }

     virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override
@@ -506,10 +506,6 @@ public:
             InvalidArgument("PerDimMeanVarNormalizationNode: All inputs should have same sample layout.");
         }

-        // TODO: Is this correct? Why not just skip propagating a gradient into these? We should not poke around in our children.
-        Input(1)->SetLearningRateMultiplier(0); // prevent learning
-        Input(2)->SetLearningRateMultiplier(0);
-
         SetDims(Input(0));
     }
 };
@@ -540,7 +536,7 @@ public:

     virtual void /*ComputationNode::*/ BackpropTo(const size_t /*inputIndex*/, const FrameRange&) override
     {
-        InvalidArgument("PerDimMeanVarDeNormalizationNode should only be called in the evaluation stage.");
+        InvalidArgument("PerDimMeanVarDeNormalizationNode should only be called in the evaluation stage. Is any of its descendents a learnable parameter that requires gradient?");
     }

     // (feature-mean).*InvStdDev
@@ -618,10 +614,6 @@ public:
             InvalidArgument("PerDimMeanVarDeNormalizationNode: All inputs should have same sample layout.");
         }

-        // TODO: Is this correct? Why not just skip propagating a gradient into these? We should not poke around in our children.
-        Input(1)->SetLearningRateMultiplier(0); // prevent learning
-        Input(2)->SetLearningRateMultiplier(0);
-
         SetDims(Input(0));
     }
 };
@@ -519,10 +519,7 @@ public:
 private:
     void FindBestForwardAlgo(const CuDnnTensor4D& inT, const CuDnnFilter& filtT, const CuDnnConvolutionDescriptor& convDesc, const CuDnnTensor4D& outT)
     {
-        // Need to re-run auto-tuner in case batch size has been changed.
-        // We assume no other dimensions of tensors can change so we don't check it.
-        // REVIEW alexeyk: is this a safe assumption? Can convolution configuration change in runtime?
-        if (m_fwdAlgo.Algo.status == CUDNN_STATUS_SUCCESS && inT.n() == m_fwdAlgo.CurMBSize && outT.n() == m_fwdAlgo.CurMBSize)
+        if (!m_fwdAlgo.NeedAutotuning(inT, outT))
             return;
         const int MaxAlgoCount = 10;
         int calgo = 0;
@@ -543,7 +540,7 @@ private:

     void FindBestBackwardDataAlgo(const CuDnnFilter& filtT, const CuDnnTensor4D& srcGradT, const CuDnnConvolutionDescriptor& convDesc, const CuDnnTensor4D& gradT)
     {
-        if (m_backDataAlgo.Algo.status == CUDNN_STATUS_SUCCESS && srcGradT.n() == m_backDataAlgo.CurMBSize && gradT.n() == m_backDataAlgo.CurMBSize)
+        if (!m_backDataAlgo.NeedAutotuning(srcGradT, gradT))
             return;
         const int MaxAlgoCount = 10;
         int calgo = 0;
@@ -564,7 +561,7 @@ private:

     void FindBestBackwardFilterAlgo(const CuDnnTensor4D& inT, const CuDnnTensor4D& srcGradT, const CuDnnConvolutionDescriptor& convDesc, const CuDnnFilter& filtT)
     {
-        if (m_backFiltAlgo.Algo.status == CUDNN_STATUS_SUCCESS && inT.n() == m_backFiltAlgo.CurMBSize && srcGradT.n() == m_backFiltAlgo.CurMBSize)
+        if (!m_backFiltAlgo.NeedAutotuning(inT, srcGradT))
             return;
         const int MaxAlgoCount = 10;
         int calgo = 0;
@@ -595,6 +595,16 @@ private:
         // Current mini-batch size, needed for re-computing statistics in auto-tuner.
         size_t CurMBSize;
         T Algo;
+
+        bool NeedAutotuning(const CuDnnTensor4D& t1, const CuDnnTensor4D& t2)
+        {
+            // Need to re-run auto-tuner in case minibatch size is increased.
+            // If minibatch size is decreased we assume that previously selected algorithm requires less or the same amount of workspace.
+            // This is done to avoid re-running auto-tuner every time in case minibatch size changes frequently (e.g. when distributed reading is enabled).
+            // REVIEW alexeyk: potentially, this might cause some perf issues if better (faster) algo can be selected for a smaller mininbatch.
+            // We assume no other dimensions of tensors can change so we don't check it.
+            return (Algo.status != CUDNN_STATUS_SUCCESS || t1.n() > CurMBSize || t2.n() > CurMBSize);
+        }
     };

     using C = Consts<ElemType>;
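As a standalone illustration of the re-tuning policy the new NeedAutotuning helper encodes (re-run the algorithm search only when nothing has been tuned yet or the minibatch grows beyond the size it was tuned for), here is a self-contained sketch. It uses placeholder types, not the real cuDNN or CNTK classes, and only demonstrates the decision logic.

```cpp
#include <cstdio>

// Placeholder types standing in for the cuDNN wrappers used above; illustrative only.
struct FakeAlgo { bool found = false; };              // stands in for Algo.status == CUDNN_STATUS_SUCCESS
struct FakeTensor { size_t n; size_t N() const { return n; } }; // minibatch dimension, like CuDnnTensor4D::n()

struct AlgoInfo
{
    size_t curMBSize = 0; // minibatch size the current algorithm was tuned for
    FakeAlgo algo;

    // Re-tune only if no algorithm has been selected yet, or if the minibatch grew.
    // A smaller minibatch is assumed to need no more workspace than the tuned one.
    bool NeedAutotuning(const FakeTensor& t1, const FakeTensor& t2) const
    {
        return !algo.found || t1.N() > curMBSize || t2.N() > curMBSize;
    }
};

int main()
{
    AlgoInfo info;
    FakeTensor in{64}, out{64};

    if (info.NeedAutotuning(in, out)) // first call: nothing tuned yet, so search runs
    {
        // ... run the (expensive) algorithm search here ...
        info.algo.found = true;
        info.curMBSize = in.N();
    }

    FakeTensor smallIn{32}, smallOut{32};
    std::printf("retune for smaller batch? %d\n", (int)info.NeedAutotuning(smallIn, smallOut)); // 0
    FakeTensor bigIn{128}, bigOut{128};
    std::printf("retune for larger batch?  %d\n", (int)info.NeedAutotuning(bigIn, bigOut));     // 1
    return 0;
}
```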