Address CR comments
Parent: 6aea378f90
Commit: e1a9cabbde
@@ -26,9 +26,9 @@ DnnBNReLULayer(inDim, outDim, x, wScale, bValue, scValue, bnTimeConst) = [
     b = LearnableParameter(outDim, 1, init = fixedValue, value = bValue)
     sc = LearnableParameter(outDim, 1, init = fixedValue, value = scValue)
     m = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
-    var = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
+    v = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
     t = Times(W, x)
-    bn = BatchNormalization(t, sc, b, m, var, eval = false, spatial = false, normalizationTimeConstant = bnTimeConst)
+    bn = BatchNormalization(t, sc, b, m, v, eval = false, spatial = false, normalizationTimeConstant = bnTimeConst)
     y = RectifiedLinear(bn)
 ]

@@ -72,10 +72,10 @@ ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeConst) = [
     b = LearnableParameter(outMap, 1, init=fixedValue, value=bValue)
     sc = LearnableParameter(outMap, 1, init=fixedValue, value=scValue)
     m = LearnableParameter(outMap, 1, init=fixedValue, value=0, learningRateMultiplier=0)
-    var = LearnableParameter(outMap, 1, init=fixedValue, value=0, learningRateMultiplier=0)
+    v = LearnableParameter(outMap, 1, init=fixedValue, value=0, learningRateMultiplier=0)

     c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding=true, imageLayout=$imageLayout$)
-    y = BatchNormalization(c, sc, b, m, var, eval=false, spatial=true, normalizationTimeConstant=bnTimeConst, imageLayout=$imageLayout$)
+    y = BatchNormalization(c, sc, b, m, v, eval=false, spatial=true, normalizationTimeConstant=bnTimeConst, imageLayout=$imageLayout$)
 ]

 ConvBNLayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, bnTimeConst) = [

@@ -22,13 +22,13 @@ DNNImageSigmoidLayer (inW, inH, inC, outDim, x, parmScale) = [
 # ReLU layer with batch normalization
 # TODO: rename to DNN-
 DnnBNReLULayer (inDim, outDim, x, wScale, bValue, scValue, bnTimeConst) = [
-    W = Parameter (outDim, inDim, init = "gaussian", initValueScale = wScale, initOnCPUOnly=true)
-    b = Parameter (outDim, 1, init = "fixedValue", value = bValue)
-    sc = Parameter (outDim, 1, init = "fixedValue", value = scValue)
-    m = Parameter (outDim, 1, init = "fixedValue", value = 0, learningRateMultiplier = 0)
-    var = Parameter (outDim, 1, init = "fixedValue", value = 0, learningRateMultiplier = 0)
+    W = Parameter (outDim, inDim, init = "gaussian", initValueScale = wScale, initOnCPUOnly=true)
+    b = Parameter (outDim, 1, init = "fixedValue", value = bValue)
+    sc = Parameter (outDim, 1, init = "fixedValue", value = scValue)
+    m = Parameter (outDim, 1, init = "fixedValue", value = 0, learningRateMultiplier = 0)
+    v = Parameter (outDim, 1, init = "fixedValue", value = 0, learningRateMultiplier = 0)
     t = Times(W, x) # TODO: W * x
-    bn = BatchNormalization(t, sc, b, m, var, eval = false, spatial = false, normalizationTimeConstant = bnTimeConst)
+    bn = BatchNormalization(t, sc, b, m, v, eval = false, spatial = false, normalizationTimeConstant = bnTimeConst)
     y = RectifiedLinear(bn)
 ].y

@@ -58,13 +58,13 @@ ConvNDReLULayer (inp, kW, kH, inMap, inWCount, outMap, hStride, vStride, wScale,
 ].out

 ConvBNLayerW (W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeConst) = [ # TODO: delete if not needed
-    b = Parameter(outMap, 1, init="fixedValue", value=bValue)
-    sc = Parameter(outMap, 1, init="fixedValue", value=scValue)
-    m = Parameter(outMap, 1, init="fixedValue", value=0, learningRateMultiplier=0)
-    var = Parameter(outMap, 1, init="fixedValue", value=0, learningRateMultiplier=0)
+    b = Parameter(outMap, 1, init="fixedValue", value=bValue)
+    sc = Parameter(outMap, 1, init="fixedValue", value=scValue)
+    m = Parameter(outMap, 1, init="fixedValue", value=0, learningRateMultiplier=0)
+    v = Parameter(outMap, 1, init="fixedValue", value=0, learningRateMultiplier=0)

     c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding=true /* , imageLayout=$imageLayout$*/)
-    y = BatchNormalization(c, sc, b, m, var, eval=false, spatial=true, normalizationTimeConstant=bnTimeConst /* , imageLayout=$imageLayout$*/)
+    y = BatchNormalization(c, sc, b, m, v, eval=false, spatial=true, normalizationTimeConstant=bnTimeConst /* , imageLayout=$imageLayout$*/)
 ].y

 ConvBNLayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, bnTimeConst) = [

@@ -21,10 +21,10 @@ ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeConst)
     b = LearnableParameter(outMap, 1, init = fixedValue, value = bValue)
     sc = LearnableParameter(outMap, 1, init = fixedValue, value = scValue)
     m = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
-    var = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
+    v = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)

     c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = $imageLayout$)
-    y = BatchNormalization(c, sc, b, m, var, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$)
+    y = BatchNormalization(c, sc, b, m, v, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$)
 ]

 ConvBNLayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, bnTimeConst)

@@ -44,10 +44,10 @@ ProjLayer(W, inp, outMap, hStride, vStride, bValue, scValue, bnTimeConst)
     b = LearnableParameter(outMap, 1, init = fixedValue, value = bValue)
     sc = LearnableParameter(outMap, 1, init = fixedValue, value = scValue)
     m = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
-    var = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
+    v = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)

     c = Convolution(W, inp, 1, 1, outMap, hStride, vStride, zeroPadding = false, imageLayout = $imageLayout$)
-    y = BatchNormalization(c, sc, b, m, var, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$)
+    y = BatchNormalization(c, sc, b, m, v, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$)
 ]

 ResNetNode2(inp, outMap, inWCount, kW, kH, wScale, bValue, scValue, bnTimeConst)

@@ -113,9 +113,9 @@ DnnBNReLULayer(inDim, outDim, x, wScale, bValue, scValue, bnTimeConst)
     b = LearnableParameter(outDim, 1, init = fixedValue, value = bValue)
     sc = LearnableParameter(outDim, 1, init = fixedValue, value = scValue)
     m = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
-    var = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
+    v = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
     t = Times(W, x)
-    bn = BatchNormalization(t, sc, b, m, var, spatial = false, normalizationTimeConstant = bnTimeConst)
+    bn = BatchNormalization(t, sc, b, m, v, spatial = false, normalizationTimeConstant = bnTimeConst)
     y = RectifiedLinear(bn)
 ]

@@ -125,9 +125,9 @@ DnnImageBNReLULayer(inW, inH, inC, outDim, x, wScale, bValue, scValue, bnTimeConst)
     b = LearnableParameter(outDim, 1, init = fixedValue, value = bValue)
     sc = LearnableParameter(outDim, 1, init = fixedValue, value = scValue)
     m = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
-    var = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
+    v = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
     t = Times(W, x)
-    bn = BatchNormalization(t, sc, b, m, var, spatial = false, normalizationTimeConstant = bnTimeConst)
+    bn = BatchNormalization(t, sc, b, m, v, spatial = false, normalizationTimeConstant = bnTimeConst)
     y = RectifiedLinear(bn)
 ]

@@ -8,9 +8,9 @@ BN(inp, mapCount, bValue, scValue, bnTimeConst)
     b = Parameter(mapCount, 1, init = fixedValue, value = bValue)
     sc = Parameter(mapCount, 1, init = fixedValue, value = scValue)
     m = Parameter(mapCount, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
-    var = Parameter(mapCount, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
+    v = Parameter(mapCount, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)

-    y = BatchNormalization(inp, sc, b, m, var, spatial = true, normalizationTimeConstant = bnTimeConst, epsilon = 0.000000001, imageLayout = "cudnn")
+    y = BatchNormalization(inp, sc, b, m, v, spatial = true, normalizationTimeConstant = bnTimeConst, epsilon = 0.000000001, imageLayout = "cudnn")
 ]

 ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeConst)

@@ -15,9 +15,9 @@ DnnBNReLULayer(inDim, outDim, x, wScale, bValue)
     b = Parameter(outDim, 1, init = fixedValue, value = bValue)
     sc = Parameter(outDim, 1, init = Gaussian, initValueScale = 0.01)
     m = Parameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
-    var = Parameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
+    v = Parameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
     t = Times(W, x)
-    bn = BatchNormalization(t, sc, b, m, var, spatial = false)
+    bn = BatchNormalization(t, sc, b, m, v, spatial = false)
     y = RectifiedLinear(bn)
 ]

@@ -47,9 +47,9 @@ ConvBNReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue,
     b = Parameter(outMap, 1, init = fixedValue, value = bValue)
     sc = Parameter(outMap, 1, init = Gaussian, initValueScale = scValue)
     m = Parameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
-    var = Parameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
+    v = Parameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)

     c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = "cudnn")
-    bn = BatchNormalization(c, sc, b, m, var, spatial = true, imageLayout = "cudnn")
+    bn = BatchNormalization(c, sc, b, m, v, spatial = true, imageLayout = "cudnn")
     y = RectifiedLinear(bn);
 ]

@@ -1547,7 +1547,7 @@ template class DropoutNode<double>;
 //
 // m = mean(input)
 // var = variance(input)
-// input_norm = (input - mean) / sqrt(var)
+// input_norm = (input - mean) / sqrt(epsilon + var)
 // output = gamma * input_norm + beta
 //
 // where gamma and beta are trainable parameters(represented as LearnableParameter).
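For reference, the formula spelled out in the comment block above maps onto the following plain-C++ sketch of per-activation batch normalization over one minibatch. This is only an illustration with made-up names (BatchNormForwardSketch); it is not the CNTK implementation, which lives in the BatchNormEngine classes changed later in this commit.

#include <cmath>
#include <vector>

// m = mean(input), var = variance(input)
// input_norm = (input - m) / sqrt(epsilon + var)
// output = gamma * input_norm + beta
std::vector<double> BatchNormForwardSketch(const std::vector<double>& input,
                                           double gamma, double beta, double epsilon)
{
    double m = 0, var = 0;
    for (double x : input) m += x;
    m /= input.size();
    for (double x : input) var += (x - m) * (x - m);
    var /= input.size(); // minibatch variance

    std::vector<double> output(input.size());
    for (size_t i = 0; i < input.size(); ++i)
        output[i] = gamma * (input[i] - m) / std::sqrt(epsilon + var) + beta;
    return output;
}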
@@ -1570,7 +1570,7 @@ template class DropoutNode<double>;
 // * blendTimeConstant is the time constant which allows to specify how much of running mean / var should be "blended" into mean / var of the current minibatch.
 //   Value 0 (default) means no blending will happen and only the current minibatch statistics will be used.
 //   Value 1#INF (infinity) means only running mean / var will be used(this is used, for example, in evaluation phase).
-// * epsilon is a conditioner constant used in computing inverted standard deviation
+// * epsilon is a conditioner constant used in computing inverse standard deviation
 // * useCntkEngine is a Boolean flag that specifies which batch normalization implementation to use: CNTK or cuDNN-based.
 // * imageLayout is the image layout. Only cudnn is supported at present.
 // -----------------------------------------------------------------------

@@ -1584,14 +1584,14 @@ public:
     BatchNormalizationNode(DEVICEID_TYPE deviceId, const wstring& name) :
         Base(deviceId, name), m_spatial(false), m_normTimeConst(0), m_blendTimeConst(0), m_epsilon(0), m_useCntkEngine(true),
         m_samplesSeen(0), m_imageLayoutKind(ImageLayoutKind::CHW),
-        m_convertRunningVariance(false)
+        m_convertRunningVariancePending(false)
     {
     }
     BatchNormalizationNode(DEVICEID_TYPE deviceId, const wstring& name, bool spatial, double normalizationTimeConstant, double blendTimeConstant,
                            double epsilon, bool useCntkEngine, ImageLayoutKind imageLayoutKind) :
         Base(deviceId, name), m_spatial(spatial), m_normTimeConst(normalizationTimeConstant), m_blendTimeConst(blendTimeConstant),
         m_epsilon(epsilon), m_useCntkEngine(useCntkEngine), m_imageLayoutKind(imageLayoutKind), m_samplesSeen(0),
-        m_convertRunningVariance(false)
+        m_convertRunningVariancePending(false)
     {
     }
     BatchNormalizationNode(const ScriptableObjects::IConfigRecordPtr configp) :
@@ -1627,7 +1627,9 @@ public:
             fstream >> m_normTimeConst;
             fstream >> m_blendTimeConst;
             fstream >> m_imageLayoutKind;
+#ifdef _DEBUG
             fprintf(stderr, "INFO: %ls: initialized samplesSeen from mbCount when loading pre-CuDNNv5 model\n", NodeName().c_str());
+#endif
             fstream >> mbCount;
             m_samplesSeen = mbCount;
             fstream >> m_epsilon;

@@ -1666,7 +1668,9 @@ public:
         {
             size_t mbCount;
             fstream >> m_imageLayoutKind;
+#ifdef _DEBUG
             fprintf(stderr, "INFO: %ls: initialized samplesSeen from mbCount when loading pre-CuDNNv5 model\n", NodeName().c_str());
+#endif
             fstream >> mbCount;
             m_samplesSeen = mbCount;
         }

@@ -1681,8 +1685,9 @@ public:
         {
             // Prior to CNTK_MODEL_VERSION_12, running inverse standard
             // deviation was stored in Input 4. Now variance is used.
-            // We (approximately) convert it during validation later.
-            m_convertRunningVariance = true;
+            // We (approximately) convert it during validation later
+            // (and then clear the flag).
+            m_convertRunningVariancePending = true;
         }
     }
@@ -1713,15 +1718,13 @@ private: // time-constant conversions
         // in inference mode, only use long-term mean and do not update running estimates
         if (!Environment().IsTraining())
         {
-            assert(0 < m_samplesSeen); // something must have been trained
+            if (m_samplesSeen == 0)
+                RuntimeError("%ls: inference mode is used, but nothing has been trained.", NodeName().c_str());
             return 0; // (m_normTimeConst == infinity) no new contribution from current minibatch
         }

         // Initialization case: only use current minibatch.
-        if (m_samplesSeen == 0)
-        {
-            return 1.0;
-        }
+        if (m_samplesSeen == 0) return 1.0;

         double numSamples = (double)GetMBLayout()->GetActualNumSamples();

@@ -1747,15 +1750,13 @@ private: // time-constant conversions
         // in inference mode, only use long-term mean and do not update running estimates
         if (!Environment().IsTraining())
         {
-            assert(0 < m_samplesSeen); // something must have been trained
-            return 1.0; // (m_blendTimeConst == infinity) estimate is taken 100% from the long-term running estimate
+            if (m_samplesSeen == 0)
+                RuntimeError("%ls: inference mode is used, but nothing has been trained.", NodeName().c_str());
+            return 1.0; // (m_blendTimeConst == infinity) estimate is taken 100% from the long-term running estimate
         }

         // Initialization case: only use current minibatch.
-        if (m_samplesSeen == 0)
-        {
-            return 0;
-        }
+        if (m_samplesSeen == 0) return 0;

         // convert to blend factor (= weight for running stats)
         // The code below special-cases two boundary cases, but those are just the limit cases of the main formula.
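ComputeExpAvgFactor() and ComputeBlendFactor() above only derive the two weights; how they are consumed is described in the comments at the Forward() call further down: expAvgFactor is the weight of the new minibatch statistics in the running estimate, and blendFactor is the interpolation weight of the running statistics. The following is a hedged, self-contained C++ sketch of that bookkeeping, not CNTK code, and simplified in that the real kernels blend the inverse standard deviation rather than the raw variance.

struct StatsSketch { double mean; double variance; };

// expAvgFactor: weight of the current-minibatch statistics in the running estimate
//               (inference: 0, very first minibatch: 1).
// blendFactor:  weight of the running statistics in the values actually used for
//               normalization (inference: 1, very first minibatch: 0).
void UpdateAndBlendSketch(StatsSketch& running, const StatsSketch& minibatch,
                          double expAvgFactor, double blendFactor, StatsSketch& used)
{
    running.mean     = (1 - expAvgFactor) * running.mean     + expAvgFactor * minibatch.mean;
    running.variance = (1 - expAvgFactor) * running.variance + expAvgFactor * minibatch.variance;

    used.mean     = blendFactor * running.mean     + (1 - blendFactor) * minibatch.mean;
    used.variance = blendFactor * running.variance + (1 - blendFactor) * minibatch.variance;
}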
@@ -1771,7 +1772,8 @@ public:

     virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override
     {
-        assert(!m_convertRunningVariance);
+        if (m_convertRunningVariancePending)
+            LogicError("%ls: Failed to convert running variance until forward prop", NodeName().c_str());
         FrameRange fr(Input(0)->GetMBLayout());

         Matrix<ElemType> sliceInputValue = Input(0)->ValueFor(fr);

@@ -1792,12 +1794,16 @@ public:
         double expAvgFactor = ComputeExpAvgFactor(); // weight for the new MB statistics in the running estimate. The previous value of the running statistics is kept with weight (1-this)
         double blendFactor = ComputeBlendFactor();   // interpolation weight for the running statistics (the current MB statistics are weighted with 1-this)

-        m_bnEng->Forward(/*in=*/ sliceInputValue, scale, bias, // (in)
-                         expAvgFactor, blendFactor,
-                         runMean, runVariance,                 // (in/out) running estimates, updated from the current MB mean/variance
-                         /*out=*/ sliceOutputValue,            // (out) batch-normalized output value
+        // In inference-only mode, m_savedMean and m_saveInvStdDev will not be
+        // produced and BackpropToNonLooping() may not be called. In
+        // non-inference (training) mode, saved statistics must be produced.
+        bool inferenceOnly = !Environment().IsTraining();
+        m_bnEng->Forward(/*in=*/ sliceInputValue, scale, bias, // (in)
+                         inferenceOnly, expAvgFactor, blendFactor,
+                         runMean, runVariance,                 // (in/out) running estimates, updated from the current MB mean/variance
+                         /*out=*/ sliceOutputValue,            // (out) batch-normalized output value
                          m_epsilon,
-                         *m_saveMean, *m_saveInvStdDev);        // (out) actual interpolated mean/stddev values. Note: unused/empty for blendFactor==1 for CNTK engine
+                         *m_savedMean, *m_savedInvStdDev);      // (out) actual interpolated mean/stddev values. Note: unused/empty for blendFactor==1 for CNTK engine
     }

     // Note: This function assumes that inputIndex=0 is called before the others.
@@ -1805,6 +1811,16 @@ public:
     // BUGBUG: If the input has no learnables (e.g. using BN instead of corpus mean/var norm), this will not be called for inputIndex=0 at all.
     virtual void BackpropToNonLooping(size_t inputIndex) override
     {
+        // Must be in training mode.
+        if (!Environment().IsTraining())
+            LogicError("%ls: BackpropToNonLooping() cannot be called in inference mode", NodeName().c_str());
+        // In non-inference mode, the batch normalization engine must provide
+        // saved statistics, m_savedMean and m_savedInvStdDev
+        if (m_savedMean->IsEmpty())
+            LogicError("%ls: m_savedMean cannot be empty", NodeName().c_str());
+        if (m_savedInvStdDev->IsEmpty())
+            LogicError("%ls: m_savedInvStdDev cannot be empty", NodeName().c_str());
+
         FrameRange fr(Input(0)->GetMBLayout());

         if (inputIndex == 0) // derivative with respect to the input.

@@ -1813,16 +1829,8 @@ public:
         auto sliceInputValue = Input(0)->ValueFor(fr);
         const Matrix<ElemType>& scale = Input(1)->Value();
         const Matrix<ElemType>& bias = Input(2)->Value();
-        const Matrix<ElemType>& runMean = Input(3)->Value();
-        const Matrix<ElemType>& runVariance = Input(4)->Value();

         auto sliceInputGrad = Input(0)->GradientFor(fr);
-        // The mean used in Forward() are either saveMean or runMean.
-        // This is decided by the engine, which communicates back the decision by returning
-        // an empty saveMean in case runMean should be used. Likewise for variance / inverted standard deviation.
-        let& actualMean = !m_saveMean->IsEmpty() ? *m_saveMean : runMean; // empty if only the running mean is used
-        if (m_saveInvStdDev->IsEmpty()) RuntimeError("TODO m_saveInvStdDev <-> runVariance not the same:");
-        let& actualInvStdDev = !m_saveInvStdDev->IsEmpty() ? *m_saveInvStdDev : runVariance;
         m_dScale->Resize(scale); // gradients for scale and bias get stored here
         m_dBias->Resize(bias);

@@ -1833,7 +1841,7 @@ public:
                           sliceInputGrad,                  // (out) gradient for data input goes here
                           scale,                           // (in) out of scale and bias, only scale is needed in gradient propagation
                           blendFactor,                     // (in) smoothing weight for running stats (1=use only running stats)
-                          actualMean, actualInvStdDev,     // (in) actual mean/stddev values used in ForwardProp()
+                          *m_savedMean, *m_savedInvStdDev, // (in) saved mean/invstddev values used in ForwardProp()
                           *m_dScale, *m_dBias);            // (out) gradients for scale and bias
     }
     else if (inputIndex == 1) // derivative with respect to the scale

@@ -1855,6 +1863,7 @@ public:

     virtual void EndBackprop() override
     {
+        // TODO don't update in locked mode
         auto numSamples = GetMBLayout()->GetActualNumSamples();
         m_samplesSeen += numSamples;
         Base::EndBackprop();
@@ -1892,19 +1901,22 @@ public:

         if (isFinalValidationPass)
         {
-            if (m_convertRunningVariance)
+            if (m_convertRunningVariancePending)
             {
-                // Input 4 is still inverse standard deviation. We convert it to variance, approximately,
-                // and output a warning.
-                fprintf(stderr, "WARNING: %ls: approximately converting inverse standard deviation into variance when loading pre-CuDNNv5 model\n",
+                // Prior to CNTK CuDNN v5 support (and the CNTK engine of the same time), mean and inverse standard deviation
+                // statistics were computed and stored. With CuDNN v5 (and the corresponding CNTK engine update), this was changed
+                // to mean and variance.
+                // To load an old model for further training or inference, Input(4) (which is inverse standard deviation) needs to
+                // be converted to variance, via v = 1/(isd^2) + epsilon, where 'v' is variance and 'isd' is inverse standard deviation.
+                // Since this is an approximation, we output a warning.
+                fprintf(stderr, "WARNING: %ls: loading pre-CuDNNv5 model and approximately converting variance statistics format\n",
                        NodeName().c_str());
                Matrix<ElemType>& runInvStdDev = Input(4)->Value();
                runInvStdDev.AssignElementPowerOf(runInvStdDev, 2);
                runInvStdDev.ElementInverse();
                runInvStdDev += (float) m_epsilon;
                fprintf(stderr, "--- %ls converted runVariance after loading\n", NodeName().c_str());
                runInvStdDev.Print();
-                m_convertRunningVariance = false;
+                m_convertRunningVariancePending = false;
             }

             // check inputs
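The conversion above is purely element-wise: the stored running inverse standard deviation is squared, inverted, and epsilon is added, i.e. v = 1/(isd^2) + epsilon as the comment says. A scalar sketch of the same arithmetic (hypothetical helper, shown only to make the Matrix calls above concrete):

// Mirrors AssignElementPowerOf(.., 2), ElementInverse(), and += epsilon above.
// Approximate, because the stored inverse standard deviation was itself computed
// with epsilon already folded in.
double ConvertInvStdDevToVariance(double isd, double epsilon)
{
    double v = isd * isd; // isd^2
    v = 1.0 / v;          // 1 / isd^2
    return v + epsilon;   // + epsilon
}
// Example: with variance 4 and a tiny epsilon, isd is about 0.5, and the
// conversion returns approximately 4 again.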
@@ -1949,8 +1961,8 @@ public:
     void RequestMatricesBeforeForwardProp(MatrixPool& matrixPool) override
     {
         Base::RequestMatricesBeforeForwardProp(matrixPool);
-        RequestMatrixFromPool(m_saveMean, matrixPool);
-        RequestMatrixFromPool(m_saveInvStdDev, matrixPool);
+        RequestMatrixFromPool(m_savedMean, matrixPool);
+        RequestMatrixFromPool(m_savedInvStdDev, matrixPool);
     }

     void RequestMatricesBeforeBackprop(MatrixPool& matrixPool) override

@@ -1963,8 +1975,8 @@ public:
     void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool) override
     {
         Base::ReleaseMatricesAfterBackprop(matrixPool);
-        ReleaseMatrixToPool(m_saveMean, matrixPool);
-        ReleaseMatrixToPool(m_saveInvStdDev, matrixPool);
+        ReleaseMatrixToPool(m_savedMean, matrixPool);
+        ReleaseMatrixToPool(m_savedInvStdDev, matrixPool);
         ReleaseMatrixToPool(m_dScale, matrixPool);
         ReleaseMatrixToPool(m_dBias, matrixPool);
     }

@@ -2035,7 +2047,7 @@ private:
     // REVIEW alexeyk: if this works, document it properly in Wiki.
     double m_blendTimeConst;

-    // Epsilon used to compute inverse standard deviation (m_saveInvStdDev).
+    // Epsilon used to compute inverse standard deviation (m_savedInvStdDev).
     double m_epsilon;
     // Whether to use CNTK or cuDNN BN implementation.
     bool m_useCntkEngine;

@@ -2048,8 +2060,8 @@ private:
     size_t m_samplesSeen;

     // Interpolated actual mean/inverse stddev values. Pre-computed on forward pass, also used in gradient computation.
-    shared_ptr<Matrix<ElemType>> m_saveMean;
-    shared_ptr<Matrix<ElemType>> m_saveInvStdDev;
+    shared_ptr<Matrix<ElemType>> m_savedMean;
+    shared_ptr<Matrix<ElemType>> m_savedInvStdDev;
     // Temp buffer for scale and bias derivatives. Only used in BackpropTo(), carrying info from first call to subsequent calls.
     // Not used for blendFactor=1 in CNTK engine.
     shared_ptr<Matrix<ElemType>> m_dScale;

@@ -2057,7 +2069,7 @@ private:

     std::unique_ptr<BatchNormEngine<ElemType>> m_bnEng;

-    bool m_convertRunningVariance;
+    bool m_convertRunningVariancePending;
 };

 template class BatchNormalizationNode<float>;

@@ -10,14 +10,16 @@
 namespace Microsoft { namespace MSR { namespace CNTK {

 template <class ElemType>
-void BatchNormEngine<ElemType>::Forward(const Mat& in, const Mat& scale, const Mat& bias, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance,
-                                        Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev)
+void BatchNormEngine<ElemType>::Forward(const Mat& in, const Mat& scale, const Mat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance,
+                                        Mat& out, double epsilon, Mat& savedMean, Mat& savedInvStdDev)
 {
     assert(in.GetNumRows() == m_inOutT.GetNumElements());
     assert(out.GetNumRows() == m_inOutT.GetNumElements());
     assert(in.GetNumCols() == out.GetNumCols());
     assert(std::isfinite(expAvgFactor) && (0 <= expAvgFactor && expAvgFactor <= 1));
     assert(std::isfinite(blendFactor) && (0 <= blendFactor && blendFactor <= 1));
+    // In inference mode, must only use running statistics
+    assert(!inferenceOnly || ((expAvgFactor == 0.0) && (blendFactor == 1.0)));
     assert(std::isfinite(epsilon) && epsilon > 0);
     if (!m_spatial)
     {
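The new inferenceOnly flag makes the caller's mode explicit: the node computes it as !Environment().IsTraining() (see the ForwardPropNonLooping() hunk above), and the added assert ties it to expAvgFactor == 0 and blendFactor == 1. The following is a hedged caller-side sketch of that contract; the names and template parameters are simplified placeholders, not actual CNTK code.

// Sketch only: EngineT stands for BatchNormEngine<ElemType>, MatT for its matrix type.
template <class EngineT, class MatT>
void ForwardWithModeSketch(EngineT& engine, bool isTraining,
                           const MatT& in, const MatT& scale, const MatT& bias,
                           MatT& runMean, MatT& runVariance, MatT& out,
                           double expAvgFactor, double blendFactor, double epsilon,
                           MatT& savedMean, MatT& savedInvStdDev)
{
    bool inferenceOnly = !isTraining;
    if (inferenceOnly)
    {
        expAvgFactor = 0; // do not update the running estimates
        blendFactor = 1;  // normalize with the running estimates only
    }
    engine.Forward(in, scale, bias, inferenceOnly, expAvgFactor, blendFactor,
                   runMean, runVariance, out, epsilon, savedMean, savedInvStdDev);
    // In training mode the engine must return savedMean/savedInvStdDev non-empty;
    // that is what the asserts above and BackpropToNonLooping() rely on.
}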
@@ -39,28 +41,35 @@ void BatchNormEngine<ElemType>::Forward(const Mat& in, const Mat& scale, const M
         assert(runVariance.GetNumCols() == 1);

     EnsureCompatible();
-    ForwardCore(in, scale, bias, expAvgFactor, blendFactor, runMean, runVariance, out, epsilon, saveMean, saveInvStdDev);
+    ForwardCore(in, scale, bias, inferenceOnly, expAvgFactor, blendFactor, runMean, runVariance, out, epsilon, savedMean, savedInvStdDev);

-    if (!m_spatial)
+    if (!inferenceOnly)
     {
-        assert(saveMean.GetNumElements() == 0 || m_inOutT.GetNumElements() == saveMean.GetNumRows());
-        assert(saveInvStdDev.GetNumElements() == 0 || m_inOutT.GetNumElements() == saveInvStdDev.GetNumRows());
+        assert(!savedMean.IsEmpty());
+        assert(!savedInvStdDev.IsEmpty());
+        if (!m_spatial)
+        {
+            assert(m_inOutT.GetNumElements() == savedMean.GetNumRows());
+            assert(m_inOutT.GetNumElements() == savedInvStdDev.GetNumRows());
+        }
+        else
+        {
+            assert((m_inOutT.GetNumElements() % savedMean.GetNumRows()) == 0);
+            assert((m_inOutT.GetNumElements() % savedInvStdDev.GetNumRows()) == 0);
+        }
+        assert(savedMean.GetNumCols() == 1);
+        assert(savedInvStdDev.GetNumCols() == 1);
     }
-    else
-    {
-        assert(saveMean.GetNumElements() == 0 || (m_inOutT.GetNumElements() % saveMean.GetNumRows()) == 0);
-        assert(saveInvStdDev.GetNumElements() == 0 || (m_inOutT.GetNumElements() % saveInvStdDev.GetNumRows()) == 0);
-    }
-    assert(saveMean.GetNumElements() == 0 || saveMean.GetNumCols() == 1);
-    assert(saveInvStdDev.GetNumElements() == 0 || saveInvStdDev.GetNumCols() == 1);
 }

 template <class ElemType>
 void BatchNormEngine<ElemType>::Backward(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor,
-                                         const Mat& saveMean, const Mat& saveInvStdDev, Mat& scaleGrad, Mat& biasGrad)
+                                         const Mat& savedMean, const Mat& savedInvStdDev, Mat& scaleGrad, Mat& biasGrad)
 {
+    assert(!savedMean.IsEmpty());
+    assert(!savedInvStdDev.IsEmpty());
     EnsureCompatible();
-    BackwardCore(in, srcGrad, grad, scale, blendFactor, saveMean, saveInvStdDev, scaleGrad, biasGrad);
+    BackwardCore(in, srcGrad, grad, scale, blendFactor, savedMean, savedInvStdDev, scaleGrad, biasGrad);
 }

 template <class ElemType>

@@ -89,24 +98,23 @@ protected:
         InvalidArgument("CNTK batch normalization supports only cudnn(CHW) layout.");
     }

-    void ForwardCore(const Mat& in, const Mat& scale, const Mat& bias, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance,
-                     Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev) override
+    void ForwardCore(const Mat& in, const Mat& scale, const Mat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance,
+                     Mat& out, double epsilon, Mat& savedMean, Mat& savedInvStdDev) override
     {
-        in.BatchNormalizationForward(scale, bias, expAvgFactor, blendFactor, runMean, runVariance, out, epsilon, saveMean, saveInvStdDev);
+        in.BatchNormalizationForward(scale, bias, inferenceOnly, expAvgFactor, blendFactor, runMean, runVariance, out, epsilon, savedMean, savedInvStdDev);
     }

-    void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, const Mat& saveMean, const Mat& saveInvStdDev,
+    void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, const Mat& savedMean, const Mat& savedInvStdDev,
                       Mat& scaleGrad, Mat& biasGrad) override
     {
-        srcGrad.BatchNormalizationBackward(in, grad, scale, blendFactor, saveMean, saveInvStdDev, scaleGrad, biasGrad);
+        srcGrad.BatchNormalizationBackward(in, grad, scale, blendFactor, savedMean, savedInvStdDev, scaleGrad, biasGrad);
     }
 };

 template class CntkBatchNormEngine<float>;
 template class CntkBatchNormEngine<double>;

-template <typename T>
-bool HasFlag(T src, T testFlag)
+template <typename T> bool HasFlag(T src, T testFlag)
 {
     return ((int)src & (int)testFlag) != 0;
 }
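The reformatted HasFlag helper above is a plain bit test over an enum-backed flag set. A hypothetical usage (the enum below is invented for the example and is not a CNTK type):

enum class ExampleFlags : int { None = 0, ZeroMean = 1 << 0, UnitScale = 1 << 1 };

bool UsesZeroMean(ExampleFlags flags)
{
    return HasFlag(flags, ExampleFlags::ZeroMean); // true when the ZeroMean bit is set
}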
@@ -34,7 +34,7 @@ public:
 public:
     virtual ~BatchNormEngine() = default;

-    void Forward(const Mat& in, const Mat& scale, const Mat& bias, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance,
+    void Forward(const Mat& in, const Mat& scale, const Mat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance,
                  Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev);

     void Backward(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, const Mat& saveMean, const Mat& saveInvStdDev,

@@ -56,7 +56,7 @@ protected:
     virtual void EnsureCompatible() = 0;

     // saveMean/saveInvStdDev return the actual mean/stddev used for normalization, except for blendFactor=1, these are unused and untouched
-    virtual void ForwardCore(const Mat& in, const Mat& scale, const Mat& bias, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance,
+    virtual void ForwardCore(const Mat& in, const Mat& scale, const Mat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance,
                              Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev) = 0;

     virtual void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, const Mat& saveMean, const Mat& saveInvStdDev,

@@ -4372,20 +4372,19 @@ void CPUMatrix<ElemType>::AveragePoolingBackward(const CPUMatrix<int>& mpRowCol,
 }

 template <class ElemType>
-void CPUMatrix<ElemType>::BatchNormalizationForward(const CPUMatrix<ElemType>& scale, const CPUMatrix<ElemType>& bias, double expAvgFactor, double blendFactor,
+void CPUMatrix<ElemType>::BatchNormalizationForward(const CPUMatrix<ElemType>& scale, const CPUMatrix<ElemType>& bias, bool inferenceOnly, double expAvgFactor, double blendFactor,
                                                     CPUMatrix<ElemType>& runMean, CPUMatrix<ElemType>& runVariance, CPUMatrix<ElemType>& out, double epsilon,
                                                     CPUMatrix<ElemType>& saveMean, CPUMatrix<ElemType>& saveInvStdDev) const
 {
     assert((GetNumRows() % scale.GetNumRows()) == 0);

-    if (expAvgFactor != 0 || blendFactor != 1)
+    if (!inferenceOnly || expAvgFactor != 0 || blendFactor != 1)
         RuntimeError("Batch normalization training on CPU is not yet implemented.");

     saveMean.Resize(0, 0); // only doing inference: these two are not produced
     saveInvStdDev.Resize(0, 0);

     bool spatial = GetNumRows() != scale.GetNumRows();
-    size_t batchSize = GetNumCols();
     if (spatial)
     {
         size_t spatialSize = GetNumRows() / scale.GetNumRows();

@@ -4395,7 +4394,7 @@ void CPUMatrix<ElemType>::BatchNormalizationForward(const CPUMatrix<ElemType>& s
         for (long irow = 0; irow < out.GetNumRows(); irow++)
         {
             size_t imap = irow / spatialSize;
-            ElemType stdDev = sqrt(runVariance(imap, 0) * (batchSize - 1) / batchSize + epsilon);
+            ElemType stdDev = sqrt(runVariance(imap, 0) + epsilon);
             out(irow, icol) = scale(imap, 0) * ((*this)(irow, icol) - runMean(imap, 0)) / stdDev + bias(imap, 0);
         }
     }

@@ -4407,7 +4406,7 @@ void CPUMatrix<ElemType>::BatchNormalizationForward(const CPUMatrix<ElemType>& s
     {
         for (long irow = 0; irow < out.GetNumRows(); irow++)
         {
-            ElemType stdDev = sqrt(runVariance(irow, 0) * (batchSize - 1) / batchSize + epsilon);
+            ElemType stdDev = sqrt(runVariance(irow, 0) + epsilon);
             out(irow, icol) = scale(irow, 0) * ((*this)(irow, icol) - runMean(irow, 0)) / stdDev + bias(irow, 0);
         }
     }
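With the (batchSize - 1) / batchSize rescaling removed in the two hunks above, CPU inference normalizes each element directly with the stored running statistics. As a scalar summary of the loop bodies above (illustrative helper only):

#include <cmath>

float NormalizeOneSketch(float x, float runMean, float runVariance,
                         float scale, float bias, float epsilon)
{
    float stdDev = std::sqrt(runVariance + epsilon); // runVariance used as-is now
    return scale * (x - runMean) / stdDev + bias;
}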
@@ -27,8 +27,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {

 double logadd(double x, double y);

-//To comply with BLAS libraries matrices are stored in ColMajor. However, by default C/C++/C# use RowMajor
-//conversion is need when passing data between CPUMatrix and C++ matrices
+// To comply with BLAS libraries matrices are stored in ColMajor. However, by default C/C++/C# use RowMajor
+// conversion is need when passing data between CPUMatrix and C++ matrices
 template <class ElemType>
 class MATH_API CPUMatrix : public BaseMatrix<ElemType>
 {

@@ -375,7 +375,7 @@ public:
     void AveragePoolingBackward(const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIndices, const CPUMatrix<int>& indices,
                                 CPUMatrix<ElemType>& grad) const;

-    void BatchNormalizationForward(const CPUMatrix<ElemType>& scale, const CPUMatrix<ElemType>& bias, double expAvgFactor, double blendFactor, CPUMatrix<ElemType>& runMean, CPUMatrix<ElemType>& runStdDev,
+    void BatchNormalizationForward(const CPUMatrix<ElemType>& scale, const CPUMatrix<ElemType>& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, CPUMatrix<ElemType>& runMean, CPUMatrix<ElemType>& runVariance,
                                    CPUMatrix<ElemType>& out, double epsilon, CPUMatrix<ElemType>& saveMean, CPUMatrix<ElemType>& saveInvStdDev) const;
     void BatchNormalizationBackward(const CPUMatrix<ElemType>& in, CPUMatrix<ElemType>& grad, const CPUMatrix<ElemType>& scale, double blendFactor, const CPUMatrix<ElemType>& saveMean, const CPUMatrix<ElemType>& saveInvStdDev,
                                     CPUMatrix<ElemType>& scaleGrad, CPUMatrix<ElemType>& biasGrad) const;

@@ -203,7 +203,7 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize,
     assert(::isfinite(epsilon) && epsilon > 0);
     assert(::isfinite(expAvgFactor) && 0 <= expAvgFactor && expAvgFactor <= 1);
     assert(::isfinite(blendFactor) && 0 <= blendFactor && blendFactor <= 1);
-    assert(expAvgFactor != 0 || blendFactor != 1); // otherwise no need call (no update)
+    // TODO optimize for (expAvgFactor == 0) && (blendFactor == 1)

     int irowSrcBase = (blockIdx.x * BlockDimX + threadIdx.x) * U;
     if (irowSrcBase >= vectorSize)

@@ -346,7 +346,7 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize,
         im2[k] = Operations::RSqrt(static_cast<ElemType>(m2[k] / batchSize + epsilon));
         if (blendFactor != 0)
         {
-            ElemType runInvStdDev = Operations::RSqrt(static_cast<ElemType>(run[k] * (batchSize - 1) / batchSize + epsilon));
+            ElemType runInvStdDev = Operations::RSqrt(static_cast<ElemType>(run[k] + epsilon));
             im2[k] = blendFactor * runInvStdDev + (1.0 - blendFactor) * im2[k];
         }
     }

@@ -376,7 +376,7 @@ __global__ void kComputeSpatialBatchMeanAndInvStdDev(int vectorSize, int spatial
     assert((vectorSize % spatialSize) == 0);
     assert(::isfinite(expAvgFactor) && 0 <= expAvgFactor && expAvgFactor <= 1);
     assert(::isfinite(blendFactor) && 0 <= blendFactor && blendFactor <= 1);
-    assert(expAvgFactor != 0 || blendFactor != 1); // otherwise no need call (no update)
+    // TODO optimize for (expAvgFactor == 0) && (blendFactor == 1)
     assert(::isfinite(epsilon) && epsilon > 0);

     int irowSrcBase = blockIdx.x * spatialSize + threadIdx.x * U;

@@ -506,7 +506,7 @@ __global__ void kComputeSpatialBatchMeanAndInvStdDev(int vectorSize, int spatial
         xInvStdDev[blockIdx.x] = Operations::RSqrt(static_cast<ElemType>(m2[0] / (batchSize * spatialSize) + epsilon));
         if (blendFactor != 0)
         {
-            ElemType runInvStdDev = Operations::RSqrt(static_cast<ElemType>(runVariance[blockIdx.x] * (batchSize - 1) / batchSize + epsilon));
+            ElemType runInvStdDev = Operations::RSqrt(static_cast<ElemType>(runVariance[blockIdx.x] + epsilon));
             xInvStdDev[blockIdx.x] = blendFactor * runInvStdDev + (1.0 - blendFactor) * xInvStdDev[blockIdx.x];
         }
     }
@@ -729,7 +729,7 @@ struct NormalizeBatchTraining

 template <int BlockDimX, int BlockDimY, int U, typename ElemType>
 __global__ void kComputeScaleAndBiasGradients(int vectorSize, int batchSize, const ElemType* x, const ElemType* dy, ElemType* dScale, ElemType* dBias,
-                                              const ElemType* saveMean, const ElemType* saveInvStdDev)
+                                              const ElemType* savedMean, const ElemType* savedInvStdDev)
 {
     static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
     static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32).");

@@ -754,8 +754,8 @@ __global__ void kComputeScaleAndBiasGradients(int vectorSize, int batchSize, con
     // Read mean and inv std dev.
     if (threadIdx.y == 0)
     {
-        LoadValues<U>(saveMean + irowSrcBase, mean);
-        LoadValues<U>(saveInvStdDev + irowSrcBase, invStdDev);
+        LoadValues<U>(savedMean + irowSrcBase, mean);
+        LoadValues<U>(savedInvStdDev + irowSrcBase, invStdDev);
         StoreValues<U>(mean, &meanS[threadIdx.x * U]);
         StoreValues<U>(invStdDev, &invStdDevS[threadIdx.x * U]);
     }

@@ -833,7 +833,7 @@ __global__ void kComputeScaleAndBiasGradients(int vectorSize, int batchSize, con

 template <int BlockDimX, int BlockDimY, int U, typename ElemType>
 __global__ void kComputeSpatialScaleAndBiasGradients(int vectorSize, int spatialSize, int batchSize, const ElemType* x, const ElemType* dy,
-                                                     ElemType* dScale, ElemType* dBias, const ElemType* saveMean, const ElemType* saveInvStdDev)
+                                                     ElemType* dScale, ElemType* dBias, const ElemType* savedMean, const ElemType* savedInvStdDev)
 {
     static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
     static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32).");

@@ -859,8 +859,8 @@ __global__ void kComputeSpatialScaleAndBiasGradients(int vectorSize, int spatial
     // Read mean and inv std dev.
     if (tid == 0)
     {
-        meanS = saveMean[blockIdx.x];
-        invStdDevS = saveInvStdDev[blockIdx.x];
+        meanS = savedMean[blockIdx.x];
+        invStdDevS = savedInvStdDev[blockIdx.x];
     }
     __syncthreads();
     if (tid != 0)
@@ -922,7 +922,7 @@ struct ComputeScaleAndBiasGradients
 {
     template <typename ElemType>
     static void Call(size_t vectorSize, size_t batchSize, const ElemType* x, const ElemType* dy,
-                     ElemType* dScale, ElemType* dBias, const ElemType* saveMean, const ElemType* saveInvStdDev, cudaStream_t stream)
+                     ElemType* dScale, ElemType* dBias, const ElemType* savedMean, const ElemType* savedInvStdDev, cudaStream_t stream)
     {
         assert((vectorSize % U) == 0);
         const int BlockDimX = 32 / U;

@@ -931,7 +931,7 @@ struct ComputeScaleAndBiasGradients
         // Create a grid that has uses striding in y-dimension to cover whole minibatch.
         auto gdim = dim3(static_cast<unsigned int>(RoundUpToMultiple(vectorSize, BlockDimX * U)));
         kComputeScaleAndBiasGradients<BlockDimX, BlockDimY, U><<<gdim, bdim, 0, stream>>>(
-            static_cast<int>(vectorSize), static_cast<int>(batchSize), x, dy, dScale, dBias, saveMean, saveInvStdDev);
+            static_cast<int>(vectorSize), static_cast<int>(batchSize), x, dy, dScale, dBias, savedMean, savedInvStdDev);
     }
 };

@@ -940,7 +940,7 @@ struct ComputeSpatialScaleAndBiasGradients
 {
     template <typename ElemType>
     static void Call(size_t vectorSize, size_t spatialSize, size_t batchSize, const ElemType* x, const ElemType* dy,
-                     ElemType* dScale, ElemType* dBias, const ElemType* saveMean, const ElemType* saveInvStdDev, cudaStream_t stream)
+                     ElemType* dScale, ElemType* dBias, const ElemType* savedMean, const ElemType* savedInvStdDev, cudaStream_t stream)
     {
         assert((spatialSize % U) == 0);
         assert((vectorSize % spatialSize) == 0);

@@ -951,7 +951,7 @@ struct ComputeSpatialScaleAndBiasGradients
         // Create a grid that has uses striding in y-dimension to cover whole minibatch.
         auto gdim = dim3(static_cast<unsigned int>(vectorSize / spatialSize));
         kComputeSpatialScaleAndBiasGradients<BlockDimX, BlockDimY, U><<<gdim, bdim, 0, stream>>>(
-            static_cast<int>(vectorSize), static_cast<int>(spatialSize), static_cast<int>(batchSize), x, dy, dScale, dBias, saveMean, saveInvStdDev);
+            static_cast<int>(vectorSize), static_cast<int>(spatialSize), static_cast<int>(batchSize), x, dy, dScale, dBias, savedMean, savedInvStdDev);
     }
 };
@@ -959,7 +959,7 @@ struct ComputeSpatialScaleAndBiasGradients
 template <int BlockDimX, int BlockDimY, bool Spatial, int U, typename ElemType>
 __global__ void kBackpropagateBatchNormGradients(int vectorSize, int spatialSize, int batchSize, const ElemType* x, const ElemType* dy, ElemType* dx,
                                                  const ElemType* bnScale, ElemType mbStatsWeight, const ElemType* dScale, const ElemType* dBias,
-                                                 const ElemType* saveMean, const ElemType* saveInvStdDev)
+                                                 const ElemType* savedMean, const ElemType* savedInvStdDev)
 {
     static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
     static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32).");

@@ -991,8 +991,8 @@ __global__ void kBackpropagateBatchNormGradients(int vectorSize, int spatialSize
             scale[k] = bnScale[imap];
             ds[k] = dScale[imap];
             db[k] = dBias[imap];
-            mean[k] = saveMean[imap];
-            invStdDev[k] = saveInvStdDev[imap];
+            mean[k] = savedMean[imap];
+            invStdDev[k] = savedInvStdDev[imap];
         }
     }
     else

@@ -1000,8 +1000,8 @@ __global__ void kBackpropagateBatchNormGradients(int vectorSize, int spatialSize
         LoadValues<U>(bnScale + irowBase, scale);
         LoadValues<U>(dScale + irowBase, ds);
         LoadValues<U>(dBias + irowBase, db);
-        LoadValues<U>(saveMean + irowBase, mean);
-        LoadValues<U>(saveInvStdDev + irowBase, invStdDev);
+        LoadValues<U>(savedMean + irowBase, mean);
+        LoadValues<U>(savedInvStdDev + irowBase, invStdDev);
     }

     int icol = blockIdx.y * BlockDimY + threadIdx.y;

@@ -1053,7 +1053,7 @@ struct BackpropagateBatchNormGradients
     template <typename ElemType>
     static void Call(size_t vectorSize, size_t spatialSize, size_t batchSize, bool spatial, const ElemType* x, const ElemType* dy, ElemType* dx,
                      const ElemType* bnScale, ElemType mbStatsWeight, const ElemType* dScale,
-                     const ElemType* dBias, const ElemType* saveMean, const ElemType* saveInvStdDev, cudaStream_t stream)
+                     const ElemType* dBias, const ElemType* savedMean, const ElemType* savedInvStdDev, cudaStream_t stream)
     {
         assert((vectorSize % U) == 0);
         const int BlockDimX = 32 / U;

@@ -1064,12 +1064,12 @@ struct BackpropagateBatchNormGradients
         if (spatial)
         {
             kBackpropagateBatchNormGradients<BlockDimX, BlockDimY, true/*spatial*/, U><<<gdim, bdim, 0, stream>>>(
-                static_cast<int>(vectorSize), static_cast<int>(spatialSize), static_cast<int>(batchSize), x, dy, dx, bnScale, mbStatsWeight, dScale, dBias, saveMean, saveInvStdDev);
+                static_cast<int>(vectorSize), static_cast<int>(spatialSize), static_cast<int>(batchSize), x, dy, dx, bnScale, mbStatsWeight, dScale, dBias, savedMean, savedInvStdDev);
         }
         else
         {
             kBackpropagateBatchNormGradients<BlockDimX, BlockDimY, false/*not spatial*/, U><<<gdim, bdim, 0, stream>>>(
-                static_cast<int>(vectorSize), static_cast<int>(spatialSize), static_cast<int>(batchSize), x, dy, dx, bnScale, mbStatsWeight, dScale, dBias, saveMean, saveInvStdDev);
+                static_cast<int>(vectorSize), static_cast<int>(spatialSize), static_cast<int>(batchSize), x, dy, dx, bnScale, mbStatsWeight, dScale, dBias, savedMean, savedInvStdDev);
         }
     }
 };

@@ -42,9 +42,11 @@ protected:
         InvalidArgument("cuDNN batch normalization supports tensors of max 4 dimensions.");
     }

-    void ForwardCore(const Mat& in, const Mat& scale, const Mat& bias, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runStdDev,
-                     Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev) override
+    void ForwardCore(const Mat& in, const Mat& scale, const Mat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance,
+                     Mat& out, double epsilon, Mat& savedMean, Mat& savedInvStdDev) override
     {
+        UNUSED(inferenceOnly); // TODO
+
         // REVIEW alexeyk: there might be a way to do this in cuDNN.
         if (blendFactor != 0 && (blendFactor != 1 || expAvgFactor > 0))
             InvalidArgument("cuDNN batch normalization engine currently supports blendTimeConstant of 0 or 1 only.");

@@ -53,25 +55,25 @@ protected:
         cudnnBatchNormMode_t mode = m_spatial ? CUDNN_BATCHNORM_SPATIAL : CUDNN_BATCHNORM_PER_ACTIVATION;
         // cuDNN will fail with BAD_PARAM if epsilon < CUDNN_BN_MIN_EPSILON.
         epsilon = max(epsilon, CUDNN_BN_MIN_EPSILON);
-        // expAvgFactor == 0 && blendFactor == 1 means we are in inference mode.
-        if (expAvgFactor == 0 && blendFactor == 1)
+        if (inferenceOnly)
         {
-            saveMean.Resize(0, 0); // (these are not produced in this case)
-            saveInvStdDev.Resize(0, 0);
+            assert(expAvgFactor == 0 && blendFactor == 1);
+            savedMean.Resize(0, 0); // (these are not produced in this case)
+            savedInvStdDev.Resize(0, 0);
             CUDNN_CALL(cudnnBatchNormalizationForwardInference(*m_cudnn, mode, &C::One, &C::Zero, m_inOutCuDnnT, ptr(in), m_inOutCuDnnT, ptr(out),
-                                                               m_scaleBiasCuDnnT, ptr(scale), ptr(bias), ptr(runMean), ptr(runStdDev), epsilon));
+                                                               m_scaleBiasCuDnnT, ptr(scale), ptr(bias), ptr(runMean), ptr(runVariance), epsilon));
         }
         else
         {
-            saveMean.Resize(runMean);
-            saveInvStdDev.Resize(runMean);
+            savedMean.Resize(runMean);
+            savedInvStdDev.Resize(runMean);
             CUDNN_CALL(cudnnBatchNormalizationForwardTraining(*m_cudnn, mode, &C::One, &C::Zero, m_inOutCuDnnT, ptr(in),
-                                                              m_inOutCuDnnT, ptr(out), m_scaleBiasCuDnnT, ptr(scale), ptr(bias), expAvgFactor, ptr(runMean), ptr(runStdDev),
-                                                              epsilon, ptr(saveMean), ptr(saveInvStdDev)));
+                                                              m_inOutCuDnnT, ptr(out), m_scaleBiasCuDnnT, ptr(scale), ptr(bias), expAvgFactor, ptr(runMean), ptr(runVariance),
+                                                              epsilon, ptr(savedMean), ptr(savedInvStdDev)));
         }
     }

-    void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, const Mat& saveMean, const Mat& saveInvStdDev,
+    void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, const Mat& savedMean, const Mat& savedInvStdDev,
                       Mat& scaleGrad, Mat& biasGrad) override
     {
         UNUSED(blendFactor); // BUGBUG: It should be used.

@@ -79,7 +81,7 @@ protected:
         cudnnBatchNormMode_t mode = m_spatial ? CUDNN_BATCHNORM_SPATIAL : CUDNN_BATCHNORM_PER_ACTIVATION;
         // REVIEW alexeyk: change betaParamDiff to 1 and update CNTK BN engine.
         CUDNN_CALL(cudnnBatchNormalizationBackward(*m_cudnn, mode, &C::One, &C::One, &C::One, &C::Zero, m_inOutCuDnnT, ptr(in), m_inOutCuDnnT, ptr(srcGrad), m_inOutCuDnnT, ptr(grad),
-                                                   m_scaleBiasCuDnnT, ptr(scale), ptr(scaleGrad), ptr(biasGrad), CUDNN_BN_MIN_EPSILON, ptr(saveMean), ptr(saveInvStdDev)));
+                                                   m_scaleBiasCuDnnT, ptr(scale), ptr(scaleGrad), ptr(biasGrad), CUDNN_BN_MIN_EPSILON, ptr(savedMean), ptr(savedInvStdDev)));
     }

 private:

@@ -9,7 +9,7 @@
 #include "TensorShape.h"
 #include <cudnn.h>
 #if CUDNN_MAJOR < 5
-#error CNTK needs CuDNN version 5.0 or higher, cf. https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Windows#cudnn or https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Linux#cudnn.
+#error CNTK requires the NVIDIA cuDNN library 5.0 or higher to build, cf. https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Windows#cudnn or https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Linux#cudnn for installation instructions.
 #endif
 #include <memory>

@@ -67,7 +67,7 @@ cudaStream_t MATH_API GetStream()
     template <class ElemType> \
     GPUMatrix<ElemType>& GPUMatrix<ElemType>::Inplace##f() \
     { \
-        performElementWiseFunction(ElementWiseOperator::op##f, Data()); \
+        performElementWiseFunction(ElementWiseOperator::op##f, Data()); \
         return *this; \
     }
 #define DEF_ELEMWISE_ASSIGN_FUNC(f) \

@@ -77,8 +77,8 @@ cudaStream_t MATH_API GetStream()
         if (a.IsEmpty()) \
             LogicError("Assign##f##Of: Matrix a is empty."); \
         if (this != &a) \
-            RequireSize(a.GetNumRows(), a.GetNumCols()); \
-        performElementWiseFunction(ElementWiseOperator::op##f, a.Data()); \
+            RequireSize(a.GetNumRows(), a.GetNumCols()); \
+        performElementWiseFunction(ElementWiseOperator::op##f, a.Data()); \
         return *this; \
     }

@@ -3160,12 +3160,13 @@ void GPUMatrix<ElemType>::AveragePoolingBackward(const GPUMatrix<int>& mpRowCol,
                          Data(), (int)GetNumRows(), grad.Data(), (int)grad.GetNumRows());
 }

-// returns saveMean/saveInvStdDev which are the actual values used to perform the normalization, except for blendFactor 1, in which case they are unused and set to empty
+// returns savedMean/savedInvStdDev which are the actual values used to perform the normalization, except for blendFactor 1, in which case they are unused and set to empty
 template <class ElemType>
-void GPUMatrix<ElemType>::BatchNormalizationForward(const GPUMatrix<ElemType>& scale, const GPUMatrix<ElemType>& bias, double expAvgFactor, double blendFactor,
+void GPUMatrix<ElemType>::BatchNormalizationForward(const GPUMatrix<ElemType>& scale, const GPUMatrix<ElemType>& bias, bool inferenceOnly, double expAvgFactor, double blendFactor,
                                                     GPUMatrix<ElemType>& runMean, GPUMatrix<ElemType>& runVariance, GPUMatrix<ElemType>& out, double epsilon,
-                                                    GPUMatrix<ElemType>& saveMean, GPUMatrix<ElemType>& saveInvStdDev) const
+                                                    GPUMatrix<ElemType>& savedMean, GPUMatrix<ElemType>& savedInvStdDev) const
 {
+    UNUSED(inferenceOnly); // TODO
     assert((GetNumRows() % scale.GetNumRows()) == 0);

     bool spatial = GetNumRows() != scale.GetNumRows();
@ -3178,52 +3179,54 @@ void GPUMatrix<ElemType>::BatchNormalizationForward(const GPUMatrix<ElemType>& s
|
|||
assert(0 < batchSize && batchSize <= std::numeric_limits<int>::max());
|
||||
|
||||
SyncGuard syncGuard;
|
||||
if (expAvgFactor > 0 || blendFactor < 1)
|
||||
//if (expAvgFactor > 0 || blendFactor < 1)
|
||||
if (inferenceOnly)
|
||||
{
|
||||
// Compute data mean and inverse standard deviation (into saveMean and
|
||||
// saveInvStdDev), and update running mean and variance.
|
||||
// Pick running statistics for normalizing. No update reuqired, and
|
||||
// saved statistics do not need to be produced.
|
||||
assert(expAvgFactor == 0 && blendFactor == 1);
|
||||
normalizeRunningStats = true;
|
||||
savedMean.RequireSize(0, 0);
|
||||
savedInvStdDev.RequireSize(0, 0);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Compute data mean and inverse standard deviation (into savedMean and
|
||||
// savedInvStdDev), and update running mean and variance.
|
||||
// TODO expAvgFactor == 0 && blendFactor == 1 can be optimized (no need for update).
|
||||
normalizeRunningStats = false;
|
||||
saveMean.RequireSize(runMean);
|
||||
saveInvStdDev.RequireSize(runMean);
|
||||
savedMean.RequireSize(runMean);
|
||||
savedInvStdDev.RequireSize(runMean);
|
||||
if (spatial)
|
||||
{
|
||||
Call<ComputeSpatialBatchMeanAndInvStdDev, ElemType>(spatialSize, vectorSize, spatialSize, batchSize, Data(),
|
||||
expAvgFactor, blendFactor,
|
||||
runMean.Data(), runVariance.Data(), epsilon,
|
||||
saveMean.Data(), saveInvStdDev.Data(), GetStream());
|
||||
savedMean.Data(), savedInvStdDev.Data(), GetStream());
|
||||
}
|
||||
else
|
||||
{
|
||||
Call<ComputeBatchMeanAndInvStdDev, ElemType>(vectorSize, vectorSize, batchSize, Data(),
|
||||
expAvgFactor, blendFactor,
|
||||
runMean.Data(), runVariance.Data(), epsilon,
|
||||
saveMean.Data(), saveInvStdDev.Data(), GetStream());
|
||||
savedMean.Data(), savedInvStdDev.Data(), GetStream());
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// With expAvgFactor == 0 and blendFactor == 1 the running statistics
|
||||
// do not need to be updated. CNTK engine in this case returns saveMean
|
||||
// and saveInvStdDev empty, but cuDNN engine does not.
|
||||
normalizeRunningStats = true;
|
||||
saveMean.RequireSize(0, 0);
|
||||
saveInvStdDev.RequireSize(0, 0);
|
||||
}
|
||||
|
||||
Call<NormalizeBatchTraining, ElemType>(spatial ? spatialSize : vectorSize, vectorSize, spatialSize, batchSize, spatial,
|
||||
normalizeRunningStats, epsilon,
|
||||
Data(), out.Data(),
|
||||
scale.Data(), bias.Data(),
|
||||
runMean.Data(), runVariance.Data(),
|
||||
saveMean.Data(), saveInvStdDev.Data(),
|
||||
savedMean.Data(), savedInvStdDev.Data(),
|
||||
GetStream());
|
||||
}

// saveMean/saveInvStdDev are the interpolated mean/inverse standard deviation as used in ForwardProp().
// savedMean/savedInvStdDev are the interpolated mean/inverse standard deviation as used in ForwardProp().
// For blendFactor=1, they are not used and can be uninitialized or empty.
template <class ElemType>
void GPUMatrix<ElemType>::BatchNormalizationBackward(const GPUMatrix<ElemType>& in, GPUMatrix<ElemType>& grad, const GPUMatrix<ElemType>& scale, double blendFactor,
                                                     const GPUMatrix<ElemType>& saveMean, const GPUMatrix<ElemType>& saveInvStdDev,
                                                     const GPUMatrix<ElemType>& savedMean, const GPUMatrix<ElemType>& savedInvStdDev,
                                                     GPUMatrix<ElemType>& scaleGrad, GPUMatrix<ElemType>& biasGrad) const
{
    assert((GetNumRows() % scale.GetNumRows()) == 0);
|
@ -3240,16 +3243,16 @@ void GPUMatrix<ElemType>::BatchNormalizationBackward(const GPUMatrix<ElemType>&
|||
    if (spatial)
    {
        Call<ComputeSpatialScaleAndBiasGradients, ElemType>(spatialSize, vectorSize, spatialSize, batchSize, in.Data(), Data(), scaleGrad.Data(), biasGrad.Data(),
            saveMean.Data(), saveInvStdDev.Data(), GetStream());
            savedMean.Data(), savedInvStdDev.Data(), GetStream());
    }
    else
    {
        Call<ComputeScaleAndBiasGradients, ElemType>(vectorSize, vectorSize, batchSize, in.Data(), Data(), scaleGrad.Data(), biasGrad.Data(),
            saveMean.Data(), saveInvStdDev.Data(), GetStream());
            savedMean.Data(), savedInvStdDev.Data(), GetStream());
    }
    ElemType mbStatsWeight = (ElemType)(1 - blendFactor); // weight for contribution from actual MB stats (0 if none, e.g. locked BN node)
    Call<BackpropagateBatchNormGradients, ElemType>(spatial ? spatialSize : vectorSize, vectorSize, spatialSize, batchSize, spatial,
        in.Data(), Data(), grad.Data(), scale.Data(), mbStatsWeight, scaleGrad.Data(), biasGrad.Data(), saveMean.Data(), saveInvStdDev.Data(), GetStream());
        in.Data(), Data(), grad.Data(), scale.Data(), mbStatsWeight, scaleGrad.Data(), biasGrad.Data(), savedMean.Data(), savedInvStdDev.Data(), GetStream());
}
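
The mbStatsWeight computation above is the only place blendFactor enters the backward path shown here. A tiny sketch of the weighting it implies (illustrative only; BlendStat is a made-up helper, not part of the repository):

    // blendFactor = 1 (e.g. a locked BN node) gives mbStatsWeight = 0, so only running statistics count.
    inline double BlendStat(double runningStat, double mbStat, double blendFactor)
    {
        double mbStatsWeight = 1.0 - blendFactor; // same weighting as above
        return mbStatsWeight * mbStat + (1.0 - mbStatsWeight) * runningStat;
    }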

#pragma region Static BLAS Functions
|
|
|
@ -467,8 +467,8 @@ public:
|||
    void AveragePoolingForward(const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIndices, const GPUMatrix<int>& indices, GPUMatrix<ElemType>& output) const;
    void AveragePoolingBackward(const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIndices, const GPUMatrix<int>& indices, GPUMatrix<ElemType>& grad) const;

    void BatchNormalizationForward(const GPUMatrix<ElemType>& scale, const GPUMatrix<ElemType>& bias, double expAvgFactor, double blendFactor,
                                   GPUMatrix<ElemType>& runMean, GPUMatrix<ElemType>& runStdDev, GPUMatrix<ElemType>& out, double epsilon,
    void BatchNormalizationForward(const GPUMatrix<ElemType>& scale, const GPUMatrix<ElemType>& bias, bool inferenceOnly, double expAvgFactor, double blendFactor,
                                   GPUMatrix<ElemType>& runMean, GPUMatrix<ElemType>& runVariance, GPUMatrix<ElemType>& out, double epsilon,
                                   GPUMatrix<ElemType>& saveMean, GPUMatrix<ElemType>& saveInvStdDev) const;
    void BatchNormalizationBackward(const GPUMatrix<ElemType>& in, GPUMatrix<ElemType>& grad, const GPUMatrix<ElemType>& scale, double blendFactor,
                                    const GPUMatrix<ElemType>& saveMean, const GPUMatrix<ElemType>& saveInvStdDev,
|
|
|
@ -197,6 +197,6 @@ if exist "$(CuDnnDll)" xcopy /D /Y "$(CuDnnDll)" "$(OutputPath)"
|||
  </ImportGroup>
  <Target Name="CheckDependencies">
    <Error Condition="!Exists('$(CUB_PATH)')" Text="CNTK requires the NVIDIA CUB library to build. Please see https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Windows#cuda for installation instructions." />
    <Error Condition="!Exists('$(CUDNN_PATH)')" Text="CNTK requires the NVIDIA cuDNN library to build. Please see https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Windows#cuda for installation instructions." />
    <Error Condition="!Exists('$(CUDNN_PATH)')" Text="CNTK requires the NVIDIA cuDNN library 5.0 or higher to build. Please see https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Windows#cuda for installation instructions." />
  </Target>
</Project>
|
|
|
@ -4274,8 +4274,8 @@ void Matrix<ElemType>::AveragePoolingBackward(const Matrix<int>& mpRowCol, const
|||
}

template <class ElemType>
void Matrix<ElemType>::BatchNormalizationForward(const Matrix<ElemType>& scale, const Matrix<ElemType>& bias, double expAvgFactor, double blendFactor,
                                                 Matrix<ElemType>& runMean, Matrix<ElemType>& runStdDev, Matrix<ElemType>& out, double epsilon,
void Matrix<ElemType>::BatchNormalizationForward(const Matrix<ElemType>& scale, const Matrix<ElemType>& bias, bool inferenceOnly, double expAvgFactor, double blendFactor,
                                                 Matrix<ElemType>& runMean, Matrix<ElemType>& runVariance, Matrix<ElemType>& out, double epsilon,
                                                 Matrix<ElemType>& saveMean, Matrix<ElemType>& saveInvStdDev) const
{
    DecideAndMoveToRightDevice(*this, out);
|
@ -4283,11 +4283,11 @@ void Matrix<ElemType>::BatchNormalizationForward(const Matrix<ElemType>& scale,
|||
    // REVIEW alexeyk: add sparse version.
    DISPATCH_MATRIX_ON_FLAG(this,
                            this,
                            m_CPUMatrix->BatchNormalizationForward(*(scale.m_CPUMatrix), *(bias.m_CPUMatrix), expAvgFactor, blendFactor,
                                                                   *(runMean.m_CPUMatrix), *(runStdDev.m_CPUMatrix),
                            m_CPUMatrix->BatchNormalizationForward(*(scale.m_CPUMatrix), *(bias.m_CPUMatrix), inferenceOnly, expAvgFactor, blendFactor,
                                                                   *(runMean.m_CPUMatrix), *(runVariance.m_CPUMatrix),
                                                                   *(out.m_CPUMatrix), epsilon, *(saveMean.m_CPUMatrix), *(saveInvStdDev.m_CPUMatrix)),
                            m_GPUMatrix->BatchNormalizationForward(*(scale.m_GPUMatrix), *(bias.m_GPUMatrix), expAvgFactor, blendFactor,
                                                                   *(runMean.m_GPUMatrix), *(runStdDev.m_GPUMatrix),
                            m_GPUMatrix->BatchNormalizationForward(*(scale.m_GPUMatrix), *(bias.m_GPUMatrix), inferenceOnly, expAvgFactor, blendFactor,
                                                                   *(runMean.m_GPUMatrix), *(runVariance.m_GPUMatrix),
                                                                   *(out.m_GPUMatrix), epsilon, *(saveMean.m_GPUMatrix), *(saveInvStdDev.m_GPUMatrix)),
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);
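
For readers unfamiliar with the macro: DISPATCH_MATRIX_ON_FLAG routes the call to whichever concrete matrix currently backs the wrapper object. A loose sketch of that idea follows (the enum and function names are made up for illustration, and the real macro differs in detail):

    #include <stdexcept>

    enum class MatrixKind { DenseCPU, DenseGPU, SparseCPU, SparseGPU }; // hypothetical

    void DispatchBatchNormForwardSketch(MatrixKind kind)
    {
        switch (kind)
        {
        case MatrixKind::DenseCPU: /* forward to the CPUMatrix implementation */ break;
        case MatrixKind::DenseGPU: /* forward to the GPUMatrix implementation */ break;
        default: // the two sparse cases are NOT_IMPLEMENTED above
            throw std::logic_error("BatchNormalizationForward: sparse matrices not implemented");
        }
    }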
|
|
|
@ -500,8 +500,8 @@ public:
|||
    void AveragePoolingForward(const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIndices, const Matrix<int>& indices, Matrix<ElemType>& output) const;
    void AveragePoolingBackward(const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIndices, const Matrix<int>& indices, Matrix<ElemType>& grad) const;

    void BatchNormalizationForward(const Matrix<ElemType>& scale, const Matrix<ElemType>& bias, double expAvgFactor, double blendFactor,
                                   Matrix<ElemType>& runMean, Matrix<ElemType>& runStdDev, Matrix<ElemType>& out, double epsilon,
    void BatchNormalizationForward(const Matrix<ElemType>& scale, const Matrix<ElemType>& bias, bool inferenceOnly, double expAvgFactor, double blendFactor,
                                   Matrix<ElemType>& runMean, Matrix<ElemType>& runVariance, Matrix<ElemType>& out, double epsilon,
                                   Matrix<ElemType>& saveMean, Matrix<ElemType>& saveInvStdDev) const;
    void BatchNormalizationBackward(const Matrix<ElemType>& in, Matrix<ElemType>& grad, const Matrix<ElemType>& scale, double blendFactor, const Matrix<ElemType>& saveMean, const Matrix<ElemType>& saveInvStdDev,
                                    Matrix<ElemType>& scaleGrad, Matrix<ElemType>& biasGrad) const;
|
|
|
@ -1827,7 +1827,7 @@ void GPUMatrix<ElemType>::AveragePoolingBackward(const GPUMatrix<int>& mpRowCol,
|||

template <class ElemType>
void GPUMatrix<ElemType>::BatchNormalizationForward(const GPUMatrix<ElemType>& scale, const GPUMatrix<ElemType>& bias, double expAvgFactor, double blendFactor,
                                                    GPUMatrix<ElemType>& runMean, GPUMatrix<ElemType>& runStdDev, GPUMatrix<ElemType>& out, double epsilon,
                                                    GPUMatrix<ElemType>& runMean, GPUMatrix<ElemType>& runVariance, GPUMatrix<ElemType>& out, double epsilon,
                                                    GPUMatrix<ElemType>& saveMean, GPUMatrix<ElemType>& saveInvStdDev) const
{
}
|
|
|
@ -1,3 +1,7 @@
|||
# Synthetic test for non-spatial batch normalization, (almost) in isolation of
# other nodes (in particular Convolution). This is not a general example, nor does
# it document best practices. The network was based on MNIST/01_OneHidden.

rootDir = ".."

configDir = "$rootDir$/Config"
|
@ -17,10 +21,6 @@ traceLevel = 1
|||

batchNormalizationEngine = "testMustOverrideBatchNormalizationEngine"

#######################################
# TRAINING CONFIG #
#######################################

train = [
    action = "train"

|
@ -54,10 +54,6 @@ train = [
|||
    ]
]

#######################################
# TEST CONFIG #
#######################################

test = [
    action = "test"
    minibatchSize = 1024 # reduce this if you run out of memory
|
|
|
@ -22,8 +22,8 @@ DNN = [
|||
    b = LearnableParameter(hiddenDim, 1, init = fixedValue, value = 0)
    sc = LearnableParameter(hiddenDim, 1, init = fixedValue, value = 1)
    m = LearnableParameter(hiddenDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
    var = LearnableParameter(hiddenDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
    y = BatchNormalization(h1, sc, b, m, var, eval=false, spatial=false, normalizationTimeConstant=64, imageLayout=cudnn, engine=$batchNormalizationEngine$)
    v = LearnableParameter(hiddenDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
    y = BatchNormalization(h1, sc, b, m, v, eval=false, spatial=false, normalizationTimeConstant=64, imageLayout=cudnn, engine=$batchNormalizationEngine$)

    ol = DNNLayer(hiddenDim, labelDim, y, 1)

|
|
|
@ -1,3 +1,7 @@
|||
# Synthetic test for spatial batch normalization, (almost) in isolation of
# other nodes (in particular Convolution). This is not a general example, nor does
# it document best practices. The network was based on CIFAR-10/02_BatchNormConv.

RootDir = "."

ConfigDir = "$RootDir$"
|
|
|
@ -22,8 +22,7 @@ ndlMnistMacros = [
|||
    scValue = 1

    # Batch normalization time constant.
    #bnTimeConst = 4096
    bnTimeConst = 2048
    bnTimeConst = 0

    fc1WScale = 12
    fc1BValue = 0
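
A side note on the bnTimeConst change above: reading normalizationTimeConstant as the time constant, in samples, of a low-pass filter over minibatch statistics (an assumption of this sketch, not something the diff states), the per-minibatch smoothing factor would be derived roughly as follows, with 0 degenerating to "use only the current minibatch":

    #include <cmath>

    // Assumption: a time constant of 0 means no smoothing, i.e. the running statistics
    // simply track the statistics of the most recent minibatch.
    double ExpAvgFactorFromTimeConstant(double timeConstant, double minibatchSize)
    {
        if (timeConstant <= 0)
            return 1.0; // only the current minibatch contributes
        return 1.0 - std::exp(-minibatchSize / timeConstant);
    }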
|
@ -42,9 +41,9 @@ DNN=[
|||
    b = LearnableParameter(cMap1, 1, init = fixedValue, value = conv1BValue)
    sc = LearnableParameter(cMap1, 1, init = fixedValue, value = scValue)
    m = LearnableParameter(cMap1, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
    var = LearnableParameter(cMap1, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
    v = LearnableParameter(cMap1, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)

    y = BatchNormalization(featScaled, sc, b, m, var, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$, engine=$batchNormalizationEngine$)
    y = BatchNormalization(featScaled, sc, b, m, v, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$, engine=$batchNormalizationEngine$)
    conv1 = RectifiedLinear(y)

    # pool1
|
|
|
@ -19,13 +19,13 @@ testCases:
|||
  Epochs must be finished with expected results:
    patterns:
      - Finished Epoch
      - CE = {{float,tolerance=2.0%}} * {{integer}}
      - Err = {{float,tolerance=2.0%}} * {{integer}}
      - CE = {{float,tolerance=0.1%}} * {{integer}}
      - Err = {{float,tolerance=0.1%}} * {{integer}}
      - totalSamplesSeen = {{integer}}
      - learningRatePerSample = {{float,tolerance=0.001%}}

  Final test results must match:
    patterns:
      - "Final Results: Minibatch[{{integer}}-{{integer}}]"
      - Err = {{float,tolerance=2.0%}} * {{integer}}
      - CE = {{float,tolerance=2.0%}} * {{integer}}
      - Err = {{float,tolerance=0.1%}} * {{integer}}
      - CE = {{float,tolerance=0.1%}} * {{integer}}
|
|
|
@ -19,13 +19,13 @@ testCases:
|||
  Epochs must be finished with expected results:
    patterns:
      - Finished Epoch
      - CE = {{float,tolerance=2.0%}} * {{integer}}
      - Err = {{float,tolerance=2.0%}} * {{integer}}
      - CE = {{float,tolerance=0.1%}} * {{integer}}
      - Err = {{float,tolerance=0.1%}} * {{integer}}
      - totalSamplesSeen = {{integer}}
      - learningRatePerSample = {{float,tolerance=0.001%}}

  Final test results must match:
    patterns:
      - "Final Results: Minibatch[{{integer}}-{{integer}}]"
      - Err = {{float,tolerance=2.0%}} * {{integer}}
      - CE = {{float,tolerance=2.0%}} * {{integer}}
      - Err = {{float,tolerance=0.1%}} * {{integer}}
      - CE = {{float,tolerance=0.1%}} * {{integer}}
|
|
|
@ -21,10 +21,10 @@ ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeCo
|||
    b = LearnableParameter(outMap, 1, init = fixedValue, value = bValue)
    sc = LearnableParameter(outMap, 1, init = fixedValue, value = scValue)
    m = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
    var = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
    v = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)

    c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = $imageLayout$)
    y = BatchNormalization(c, sc, b, m, var, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$, engine=cudnn)
    y = BatchNormalization(c, sc, b, m, v, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$, engine=cudnn)
]

ConvBNLayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, bnTimeConst)
|
@ -44,10 +44,10 @@ ProjLayer(W, inp, outMap, hStride, vStride, bValue, scValue, bnTimeConst)
|||
    b = LearnableParameter(outMap, 1, init = fixedValue, value = bValue)
    sc = LearnableParameter(outMap, 1, init = fixedValue, value = scValue)
    m = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
    var = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
    v = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)

    c = Convolution(W, inp, 1, 1, outMap, hStride, vStride, zeroPadding = false, imageLayout = $imageLayout$)
    y = BatchNormalization(c, sc, b, m, var, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$, engine=cudnn)
    y = BatchNormalization(c, sc, b, m, v, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$, engine=cudnn)
]

ResNetNode2(inp, outMap, inWCount, kW, kH, wScale, bValue, scValue, bnTimeConst)
|
@ -113,9 +113,9 @@ DnnBNReLULayer(inDim, outDim, x, wScale, bValue, scValue, bnTimeConst)
|||
    b = LearnableParameter(outDim, 1, init = fixedValue, value = bValue)
    sc = LearnableParameter(outDim, 1, init = fixedValue, value = scValue)
    m = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
    var = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
    v = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
    t = Times(W, x)
    bn = BatchNormalization(t, sc, b, m, var, spatial = false, normalizationTimeConstant = bnTimeConst, engine=cudnn)
    bn = BatchNormalization(t, sc, b, m, v, spatial = false, normalizationTimeConstant = bnTimeConst, engine=cudnn)
    y = RectifiedLinear(bn)
]

|
@ -125,9 +125,9 @@ DnnImageBNReLULayer(inW, inH, inC, outDim, x, wScale, bValue, scValue, bnTimeCon
|||
    b = LearnableParameter(outDim, 1, init = fixedValue, value = bValue)
    sc = LearnableParameter(outDim, 1, init = fixedValue, value = scValue)
    m = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
    var = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
    v = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
    t = Times(W, x)
    bn = BatchNormalization(t, sc, b, m, var, spatial = false, normalizationTimeConstant = bnTimeConst, engine=cudnn)
    bn = BatchNormalization(t, sc, b, m, v, spatial = false, normalizationTimeConstant = bnTimeConst, engine=cudnn)
    y = RectifiedLinear(bn)
]

|
|
|
@ -32,4 +32,4 @@ testCases:
|||
      - "Final Results: Minibatch[{{integer}}-{{integer}}]"
      # this part is flaky, needs to be investigated (VSO item #414)
      #- Err = {{float,tolerance=2.0%}} * {{integer}}
      - CE = {{float,tolerance=2.0%}} * {{integer}}
      - CE = {{float,tolerance=4.0%}} * {{integer}}
|
|
|
@ -100,6 +100,7 @@ BOOST_AUTO_TEST_CASE(BatchNormalizationForward)
|||
    double expAvg = std::get<3>(cfg);
    double blendFactor = 0; // cuDNN supports blendFactor == 0 (train) or 1 (eval) only.
    double eps = 1e-5; // CUDNN_BN_MIN_EPSILON
    bool inferenceOnly = false;

    auto engCudnn = BNEng::Create(baseDeviceId, inOutT, spatial, ImageLayoutKind::CHW, BatchNormEngineKind::CuDnn);
    auto engCntk = BNEng::Create(deviceId, inOutT, spatial, ImageLayoutKind::CHW, BatchNormEngineKind::Cntk);
|
@ -142,12 +143,12 @@ BOOST_AUTO_TEST_CASE(BatchNormalizationForward)
|||

    CudaTimer time1;
    time1.Start();
    engCntk->Forward(in, scale, bias, expAvg, blendFactor, runMean, runInvStdDev, out, eps, saveMean, saveInvStdDev);
    engCntk->Forward(in, scale, bias, inferenceOnly, expAvg, blendFactor, runMean, runInvStdDev, out, eps, saveMean, saveInvStdDev);
    time1.Stop();

    CudaTimer time2;
    time2.Start();
    engCudnn->Forward(inB, scaleB, biasB, expAvg, blendFactor, runMeanB, runInvStdDevB, outB, eps, saveMeanB, saveInvStdDevB);
    engCudnn->Forward(inB, scaleB, biasB, inferenceOnly, expAvg, blendFactor, runMeanB, runInvStdDevB, outB, eps, saveMeanB, saveInvStdDevB);
    time2.Stop();

    std::stringstream tmsg;
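
A natural follow-up, sketched here only (it is not part of this commit; the variable names are reused from the test above), would be to exercise the new inference path as well, which the GPU implementation constrains to expAvgFactor == 0 and blendFactor == 1:

    // Sketch only: run both engines once more through the inference-only path.
    bool inferenceOnlyEval = true;
    double expAvgEval = 0;      // no update of the running statistics
    double blendFactorEval = 1; // normalize with the running statistics only
    engCntk->Forward(in, scale, bias, inferenceOnlyEval, expAvgEval, blendFactorEval,
                     runMean, runInvStdDev, out, eps, saveMean, saveInvStdDev);
    engCudnn->Forward(inB, scaleB, biasB, inferenceOnlyEval, expAvgEval, blendFactorEval,
                      runMeanB, runInvStdDevB, outB, eps, saveMeanB, saveInvStdDevB);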
|
|
|
@ -28,12 +28,12 @@ inline FunctionPtr ProjLayer(Variable wProj, Variable input, size_t hStride, siz
|||
    auto b = Parameter({ outFeatureMapCount }, (float)bValue, device);
    auto sc = Parameter({ outFeatureMapCount }, (float)scValue, device);
    auto m = Constant({ outFeatureMapCount }, 0.0f, device);
    auto var = Constant({ outFeatureMapCount }, 0.0f, device);
    auto v = Constant({ outFeatureMapCount }, 0.0f, device);

    size_t numInputChannels = input.Shape()[input.Shape().NumAxes() - 1];

    auto c = Convolution(wProj, input, { hStride, vStride, numInputChannels }, { true }, { false });
    return BatchNormalization(c, sc, b, m, var, true /*spatial*/, (double)bnTimeConst);
    return BatchNormalization(c, sc, b, m, v, true /*spatial*/, (double)bnTimeConst);
}

inline FunctionPtr ResNetNode2(Variable input, size_t outFeatureMapCount, size_t kernelWidth, size_t kernelHeight, double wScale, double bValue, double scValue, size_t bnTimeConst, const DeviceDescriptor& device)