Mark Hillebrand 2016-08-23 17:57:00 +02:00
Parent 6aea378f90
Commit e1a9cabbde
29 changed files with 242 additions and 218 deletions

View file

@ -26,9 +26,9 @@ DnnBNReLULayer(inDim, outDim, x, wScale, bValue, scValue, bnTimeConst) = [
b = LearnableParameter(outDim, 1, init = fixedValue, value = bValue)
sc = LearnableParameter(outDim, 1, init = fixedValue, value = scValue)
m = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
var = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
v = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
t = Times(W, x)
bn = BatchNormalization(t, sc, b, m, var, eval = false, spatial = false, normalizationTimeConstant = bnTimeConst)
bn = BatchNormalization(t, sc, b, m, v, eval = false, spatial = false, normalizationTimeConstant = bnTimeConst)
y = RectifiedLinear(bn)
]
@ -72,10 +72,10 @@ ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeCo
b = LearnableParameter(outMap, 1, init=fixedValue, value=bValue)
sc = LearnableParameter(outMap, 1, init=fixedValue, value=scValue)
m = LearnableParameter(outMap, 1, init=fixedValue, value=0, learningRateMultiplier=0)
var = LearnableParameter(outMap, 1, init=fixedValue, value=0, learningRateMultiplier=0)
v = LearnableParameter(outMap, 1, init=fixedValue, value=0, learningRateMultiplier=0)
c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding=true, imageLayout=$imageLayout$)
y = BatchNormalization(c, sc, b, m, var, eval=false, spatial=true, normalizationTimeConstant=bnTimeConst, imageLayout=$imageLayout$)
y = BatchNormalization(c, sc, b, m, v, eval=false, spatial=true, normalizationTimeConstant=bnTimeConst, imageLayout=$imageLayout$)
]
ConvBNLayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, bnTimeConst) = [

View file

@ -26,9 +26,9 @@ DnnBNReLULayer (inDim, outDim, x, wScale, bValue, scValue, bnTimeConst) = [
b = Parameter (outDim, 1, init = "fixedValue", value = bValue)
sc = Parameter (outDim, 1, init = "fixedValue", value = scValue)
m = Parameter (outDim, 1, init = "fixedValue", value = 0, learningRateMultiplier = 0)
var = Parameter (outDim, 1, init = "fixedValue", value = 0, learningRateMultiplier = 0)
v = Parameter (outDim, 1, init = "fixedValue", value = 0, learningRateMultiplier = 0)
t = Times(W, x) # TODO: W * x
bn = BatchNormalization(t, sc, b, m, var, eval = false, spatial = false, normalizationTimeConstant = bnTimeConst)
bn = BatchNormalization(t, sc, b, m, v, eval = false, spatial = false, normalizationTimeConstant = bnTimeConst)
y = RectifiedLinear(bn)
].y
@ -61,10 +61,10 @@ ConvBNLayerW (W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeC
b = Parameter(outMap, 1, init="fixedValue", value=bValue)
sc = Parameter(outMap, 1, init="fixedValue", value=scValue)
m = Parameter(outMap, 1, init="fixedValue", value=0, learningRateMultiplier=0)
var = Parameter(outMap, 1, init="fixedValue", value=0, learningRateMultiplier=0)
v = Parameter(outMap, 1, init="fixedValue", value=0, learningRateMultiplier=0)
c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding=true /* , imageLayout=$imageLayout$*/)
y = BatchNormalization(c, sc, b, m, var, eval=false, spatial=true, normalizationTimeConstant=bnTimeConst /* , imageLayout=$imageLayout$*/)
y = BatchNormalization(c, sc, b, m, v, eval=false, spatial=true, normalizationTimeConstant=bnTimeConst /* , imageLayout=$imageLayout$*/)
].y
ConvBNLayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, bnTimeConst) = [

View file

@ -21,10 +21,10 @@ ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeCo
b = LearnableParameter(outMap, 1, init = fixedValue, value = bValue)
sc = LearnableParameter(outMap, 1, init = fixedValue, value = scValue)
m = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
var = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
v = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = $imageLayout$)
y = BatchNormalization(c, sc, b, m, var, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$)
y = BatchNormalization(c, sc, b, m, v, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$)
]
ConvBNLayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, bnTimeConst)
@ -44,10 +44,10 @@ ProjLayer(W, inp, outMap, hStride, vStride, bValue, scValue, bnTimeConst)
b = LearnableParameter(outMap, 1, init = fixedValue, value = bValue)
sc = LearnableParameter(outMap, 1, init = fixedValue, value = scValue)
m = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
var = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
v = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
c = Convolution(W, inp, 1, 1, outMap, hStride, vStride, zeroPadding = false, imageLayout = $imageLayout$)
y = BatchNormalization(c, sc, b, m, var, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$)
y = BatchNormalization(c, sc, b, m, v, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$)
]
ResNetNode2(inp, outMap, inWCount, kW, kH, wScale, bValue, scValue, bnTimeConst)
@ -113,9 +113,9 @@ DnnBNReLULayer(inDim, outDim, x, wScale, bValue, scValue, bnTimeConst)
b = LearnableParameter(outDim, 1, init = fixedValue, value = bValue)
sc = LearnableParameter(outDim, 1, init = fixedValue, value = scValue)
m = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
var = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
v = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
t = Times(W, x)
bn = BatchNormalization(t, sc, b, m, var, spatial = false, normalizationTimeConstant = bnTimeConst)
bn = BatchNormalization(t, sc, b, m, v, spatial = false, normalizationTimeConstant = bnTimeConst)
y = RectifiedLinear(bn)
]
@ -125,9 +125,9 @@ DnnImageBNReLULayer(inW, inH, inC, outDim, x, wScale, bValue, scValue, bnTimeCon
b = LearnableParameter(outDim, 1, init = fixedValue, value = bValue)
sc = LearnableParameter(outDim, 1, init = fixedValue, value = scValue)
m = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
var = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
v = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
t = Times(W, x)
bn = BatchNormalization(t, sc, b, m, var, spatial = false, normalizationTimeConstant = bnTimeConst)
bn = BatchNormalization(t, sc, b, m, v, spatial = false, normalizationTimeConstant = bnTimeConst)
y = RectifiedLinear(bn)
]

View file

@ -8,9 +8,9 @@ BN(inp, mapCount, bValue, scValue, bnTimeConst)
b = Parameter(mapCount, 1, init = fixedValue, value = bValue)
sc = Parameter(mapCount, 1, init = fixedValue, value = scValue)
m = Parameter(mapCount, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
var = Parameter(mapCount, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
v = Parameter(mapCount, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
y = BatchNormalization(inp, sc, b, m, var, spatial = true, normalizationTimeConstant = bnTimeConst, epsilon = 0.000000001, imageLayout = "cudnn")
y = BatchNormalization(inp, sc, b, m, v, spatial = true, normalizationTimeConstant = bnTimeConst, epsilon = 0.000000001, imageLayout = "cudnn")
]
ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeConst)

View file

@ -15,9 +15,9 @@ DnnBNReLULayer(inDim, outDim, x, wScale, bValue)
b = Parameter(outDim, 1, init = fixedValue, value = bValue)
sc = Parameter(outDim, 1, init = Gaussian, initValueScale = 0.01)
m = Parameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
var = Parameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
v = Parameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
t = Times(W, x)
bn = BatchNormalization(t, sc, b, m, var, spatial = false)
bn = BatchNormalization(t, sc, b, m, v, spatial = false)
y = RectifiedLinear(bn)
]
@ -47,9 +47,9 @@ ConvBNReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue,
b = Parameter(outMap, 1, init = fixedValue, value = bValue)
sc = Parameter(outMap, 1, init = Gaussian, initValueScale = scValue)
m = Parameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
var = Parameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
v = Parameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = "cudnn")
bn = BatchNormalization(c, sc, b, m, var, spatial = true, imageLayout = "cudnn")
bn = BatchNormalization(c, sc, b, m, v, spatial = true, imageLayout = "cudnn")
y = RectifiedLinear(bn);
]

View file

@ -1547,7 +1547,7 @@ template class DropoutNode<double>;
//
// m = mean(input)
// var = variance(input)
// input_norm = (input - mean) / sqrt(var)
// input_norm = (input - mean) / sqrt(epsilon + var)
// output = gamma * input_norm + beta
//
// where gamma and beta are trainable parameters (represented as LearnableParameter).
@ -1570,7 +1570,7 @@ template class DropoutNode<double>;
// * blendTimeConstant is the time constant which specifies how much of the running mean / var should be "blended" into the mean / var of the current minibatch.
// Value 0 (default) means no blending will happen and only the current minibatch statistics will be used.
// Value 1#INF (infinity) means only running mean / var will be used (this is used, for example, in the evaluation phase).
// * epsilon is a conditioner constant used in computing inverted standard deviation
// * epsilon is a conditioner constant used in computing inverse standard deviation
// * useCntkEngine is a Boolean flag that specifies which batch normalization implementation to use: CNTK or cuDNN-based.
// * imageLayout is the image layout. Only cudnn is supported at present.
// -----------------------------------------------------------------------
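As an aside on the math documented in the comment block above: the whole per-activation computation condenses to a few lines. The sketch below is illustrative only (standalone toy code, not CNTK source); it uses the biased variance (normalized by N), matching the m2[k] / batchSize computation in the CNTK engine kernels later in this commit.

```cpp
// Toy per-activation batch normalization over a minibatch of 4 samples.
#include <cmath>
#include <cstdio>
#include <vector>

int main()
{
    std::vector<double> input = {1.0, 2.0, 3.0, 4.0}; // one activation, 4 samples
    const double gamma = 1.5, beta = 0.5, epsilon = 1e-5;

    // m = mean(input)
    double m = 0;
    for (double x : input) m += x;
    m /= input.size();

    // var = variance(input), biased (divide by N)
    double var = 0;
    for (double x : input) var += (x - m) * (x - m);
    var /= input.size();

    // output = gamma * (input - m) / sqrt(epsilon + var) + beta
    for (double x : input)
        std::printf("%g -> %g\n", x, gamma * (x - m) / std::sqrt(epsilon + var) + beta);
    return 0;
}
```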
@ -1584,14 +1584,14 @@ public:
BatchNormalizationNode(DEVICEID_TYPE deviceId, const wstring& name) :
Base(deviceId, name), m_spatial(false), m_normTimeConst(0), m_blendTimeConst(0), m_epsilon(0), m_useCntkEngine(true),
m_samplesSeen(0), m_imageLayoutKind(ImageLayoutKind::CHW),
m_convertRunningVariance(false)
m_convertRunningVariancePending(false)
{
}
BatchNormalizationNode(DEVICEID_TYPE deviceId, const wstring& name, bool spatial, double normalizationTimeConstant, double blendTimeConstant,
double epsilon, bool useCntkEngine, ImageLayoutKind imageLayoutKind) :
Base(deviceId, name), m_spatial(spatial), m_normTimeConst(normalizationTimeConstant), m_blendTimeConst(blendTimeConstant),
m_epsilon(epsilon), m_useCntkEngine(useCntkEngine), m_imageLayoutKind(imageLayoutKind), m_samplesSeen(0),
m_convertRunningVariance(false)
m_convertRunningVariancePending(false)
{
}
BatchNormalizationNode(const ScriptableObjects::IConfigRecordPtr configp) :
@ -1627,7 +1627,9 @@ public:
fstream >> m_normTimeConst;
fstream >> m_blendTimeConst;
fstream >> m_imageLayoutKind;
#ifdef _DEBUG
fprintf(stderr, "INFO: %ls: initialized samplesSeen from mbCount when loading pre-CuDNNv5 model\n", NodeName().c_str());
#endif
fstream >> mbCount;
m_samplesSeen = mbCount;
fstream >> m_epsilon;
@ -1666,7 +1668,9 @@ public:
{
size_t mbCount;
fstream >> m_imageLayoutKind;
#ifdef _DEBUG
fprintf(stderr, "INFO: %ls: initialized samplesSeen from mbCount when loading pre-CuDNNv5 model\n", NodeName().c_str());
#endif
fstream >> mbCount;
m_samplesSeen = mbCount;
}
@ -1681,8 +1685,9 @@ public:
{
// Prior to CNTK_MODEL_VERSION_12, running inverse standard
// deviation was stored in Input 4. Now variance is used.
// We (approximately) convert it during validation later.
m_convertRunningVariance = true;
// We (approximately) convert it during validation later
// (and then clear the flag).
m_convertRunningVariancePending = true;
}
}
@ -1713,15 +1718,13 @@ private: // time-constant conversions
// in inference mode, only use long-term mean and do not update running estimates
if (!Environment().IsTraining())
{
assert(0 < m_samplesSeen); // something must have been trained
if (m_samplesSeen == 0)
RuntimeError("%ls: inference mode is used, but nothing has been trained.", NodeName().c_str());
return 0; // (m_normTimeConst == infinity) no new contribution from current minibatch
}
// Initialization case: only use current minibatch.
if (m_samplesSeen == 0)
{
return 1.0;
}
if (m_samplesSeen == 0) return 1.0;
double numSamples = (double)GetMBLayout()->GetActualNumSamples();
@ -1747,15 +1750,13 @@ private: // time-constant conversions
// in inference mode, only use long-term mean and do not update running estimates
if (!Environment().IsTraining())
{
assert(0 < m_samplesSeen); // something must have been trained
if (m_samplesSeen == 0)
RuntimeError("%ls: inference mode is used, but nothing has been trained.", NodeName().c_str());
return 1.0; // (m_blendTimeConst == infinity) estimate is taken 100% from the long-term running estimate
}
// Initialization case: only use current minibatch.
if (m_samplesSeen == 0)
{
return 0;
}
if (m_samplesSeen == 0) return 0;
// convert to blend factor (= weight for running stats)
// The code below special-cases two boundary cases, but those are just the limit cases of the main formula.
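For orientation: the factors above are derived from the user-facing time constants, and the conversion code itself lies outside this hunk. The equations below are only a plausible reading of the documented semantics (minibatch size N, time constants T_norm and T_blend), chosen to match the boundary cases handled above; they are not quoted from the source.

\[
\text{expAvgFactor} = 1 - e^{-N / T_{\text{norm}}},
\qquad
\text{blendFactor} = \frac{T_{\text{blend}}}{T_{\text{blend}} + N}.
\]

With T_norm = infinity this yields expAvgFactor = 0 (no contribution from the current minibatch); with T_blend = 0 it yields blendFactor = 0 (minibatch statistics only), and with T_blend = infinity it yields blendFactor = 1 (running statistics only), as described in the comments.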
@ -1771,7 +1772,8 @@ public:
virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override
{
assert(!m_convertRunningVariance);
if (m_convertRunningVariancePending)
LogicError("%ls: Failed to convert running variance until forward prop", NodeName().c_str());
FrameRange fr(Input(0)->GetMBLayout());
Matrix<ElemType> sliceInputValue = Input(0)->ValueFor(fr);
@ -1792,12 +1794,16 @@ public:
double expAvgFactor = ComputeExpAvgFactor(); // weight for the new MB statistics in the running estimate. The previous value of the running statistics is kept with weight (1-this)
double blendFactor = ComputeBlendFactor(); // interpolation weight for the running statistics (the current MB statistics are weighted with 1-this)
// In inference-only mode, m_savedMean and m_savedInvStdDev will not be
// produced and BackpropToNonLooping() may not be called. In
// non-inference (training) mode, saved statistics must be produced.
bool inferenceOnly = !Environment().IsTraining();
m_bnEng->Forward(/*in=*/ sliceInputValue, scale, bias, // (in)
expAvgFactor, blendFactor,
inferenceOnly, expAvgFactor, blendFactor,
runMean, runVariance, // (in/out) running estimates, updated from the current MB mean/variance
/*out=*/ sliceOutputValue, // (out) batch-normalized output value
m_epsilon,
*m_saveMean, *m_saveInvStdDev); // (out) actual interpolated mean/stddev values. Note: unused/empty for blendFactor==1 for CNTK engine
*m_savedMean, *m_savedInvStdDev); // (out) actual interpolated mean/stddev values. Note: unused/empty for blendFactor==1 for CNTK engine
}
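A hedged illustration of how the two factors are applied, per the comments on expAvgFactor and blendFactor above (standalone sketch; the function names are made up and this is not the engine code):

```cpp
#include <cstdio>

// expAvgFactor weights the current minibatch statistic when updating the running estimate;
// the previous running value is kept with weight (1 - expAvgFactor).
double UpdateRunningStat(double runStat, double mbStat, double expAvgFactor)
{
    return expAvgFactor * mbStat + (1 - expAvgFactor) * runStat;
}

// blendFactor weights the running estimate in the statistic actually used for normalization;
// the current minibatch statistic is weighted with (1 - blendFactor).
double StatUsedForNormalization(double runStat, double mbStat, double blendFactor)
{
    return blendFactor * runStat + (1 - blendFactor) * mbStat;
}

int main()
{
    double runMean = 0.2, mbMean = 1.0;
    std::printf("updated running mean: %g\n", UpdateRunningStat(runMean, mbMean, 0.1));
    std::printf("mean used in forward: %g\n", StatUsedForNormalization(runMean, mbMean, 0.0));
    return 0;
}
```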
// Note: This function assumes that inputIndex=0 is called before the others.
@ -1805,6 +1811,16 @@ public:
// BUGBUG: If the input has no learnables (e.g. using BN instead of corpus mean/var norm), this will not be called for inputIndex=0 at all.
virtual void BackpropToNonLooping(size_t inputIndex) override
{
// Must be in training mode.
if (!Environment().IsTraining())
LogicError("%ls: BackpropToNonLooping() cannot be called in inference mode", NodeName().c_str());
// In non-inference mode, the batch normalization engine must provide
// saved statistics, m_savedMean and m_savedInvStdDev
if (m_savedMean->IsEmpty())
LogicError("%ls: m_savedMean cannot be empty", NodeName().c_str());
if (m_savedInvStdDev->IsEmpty())
LogicError("%ls: m_savedInvStdDev cannot be empty", NodeName().c_str());
FrameRange fr(Input(0)->GetMBLayout());
if (inputIndex == 0) // derivative with respect to the input.
@ -1813,16 +1829,8 @@ public:
auto sliceInputValue = Input(0)->ValueFor(fr);
const Matrix<ElemType>& scale = Input(1)->Value();
const Matrix<ElemType>& bias = Input(2)->Value();
const Matrix<ElemType>& runMean = Input(3)->Value();
const Matrix<ElemType>& runVariance = Input(4)->Value();
auto sliceInputGrad = Input(0)->GradientFor(fr);
// The mean used in Forward() are either saveMean or runMean.
// This is decided by the engine, which communicates back the decision by returning
// an empty saveMean in case runMean should be used. Likewise for variance / inverted standard deviation.
let& actualMean = !m_saveMean->IsEmpty() ? *m_saveMean : runMean; // empty if only the running mean is used
if (m_saveInvStdDev->IsEmpty()) RuntimeError("TODO m_saveInvStdDev <-> runVariance not the same:");
let& actualInvStdDev = !m_saveInvStdDev->IsEmpty() ? *m_saveInvStdDev : runVariance;
m_dScale->Resize(scale); // gradients for scale and bias get stored here
m_dBias->Resize(bias);
@ -1833,7 +1841,7 @@ public:
sliceInputGrad, // (out) gradient for data input goes here
scale, // (in) out of scale and bias, only scale is needed in gradient propagation
blendFactor, // (in) smoothing weight for running stats (1=use only running stats)
actualMean, actualInvStdDev, // (in) actual mean/stddev values used in ForwardProp()
*m_savedMean, *m_savedInvStdDev, // (in) saved mean/invstddev values used in ForwardProp()
*m_dScale, *m_dBias); // (out) gradients for scale and bias
}
else if (inputIndex == 1) // derivative with respect to the scale
@ -1855,6 +1863,7 @@ public:
virtual void EndBackprop() override
{
// TODO don't update in locked mode
auto numSamples = GetMBLayout()->GetActualNumSamples();
m_samplesSeen += numSamples;
Base::EndBackprop();
@ -1892,19 +1901,22 @@ public:
if (isFinalValidationPass)
{
if (m_convertRunningVariance)
if (m_convertRunningVariancePending)
{
// Input 4 is still inverse standard deviation. We convert it to variance, approximately,
// and output a warning.
fprintf(stderr, "WARNING: %ls: approximately converting inverse standard deviation into variance when loading pre-CuDNNv5 model\n",
// Prior to CNTK CuDNN v5 support (and the CNTK engine of the same time), mean and inverse standard deviation
// statistics were computed and stored. With CuDNN v5 (and the corresponding CNTK engine update), this was changed
// to mean and variance.
// To load an old model for further training or inference, Input(4) (which is inverse standard deviation) needs to
// be converted to variance, via v = 1/(isd^2) + epsilon, where 'v' is variance and 'isd' is inverse standard deviation.
// Since this is an approximation, we output a warning.
fprintf(stderr, "WARNING: %ls: loading pre-CuDNNv5 model and approximately converting variance statistics format\n",
NodeName().c_str());
Matrix<ElemType>& runInvStdDev = Input(4)->Value();
runInvStdDev.AssignElementPowerOf(runInvStdDev, 2);
runInvStdDev.ElementInverse();
runInvStdDev += (float) m_epsilon;
fprintf(stderr, "--- %ls converted runVariance after loading\n", NodeName().c_str());
runInvStdDev.Print();
m_convertRunningVariance = false;
m_convertRunningVariancePending = false;
}
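For reference, the algebra behind the approximate conversion above, assuming the old models stored the inverse standard deviation as isd = 1 / sqrt(v + epsilon) (the RSqrt(... + epsilon) form used by the CNTK engine kernels later in this commit):

\[
\mathrm{isd} = \frac{1}{\sqrt{v + \epsilon}}
\;\Longrightarrow\;
v = \frac{1}{\mathrm{isd}^{2}} - \epsilon
\approx \frac{1}{\mathrm{isd}^{2}} + \epsilon,
\]

so squaring, inverting, and adjusting by epsilon recovers the variance only up to a term on the order of epsilon, which is why the warning calls the conversion approximate.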
// check inputs
@ -1949,8 +1961,8 @@ public:
void RequestMatricesBeforeForwardProp(MatrixPool& matrixPool) override
{
Base::RequestMatricesBeforeForwardProp(matrixPool);
RequestMatrixFromPool(m_saveMean, matrixPool);
RequestMatrixFromPool(m_saveInvStdDev, matrixPool);
RequestMatrixFromPool(m_savedMean, matrixPool);
RequestMatrixFromPool(m_savedInvStdDev, matrixPool);
}
void RequestMatricesBeforeBackprop(MatrixPool& matrixPool) override
@ -1963,8 +1975,8 @@ public:
void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool) override
{
Base::ReleaseMatricesAfterBackprop(matrixPool);
ReleaseMatrixToPool(m_saveMean, matrixPool);
ReleaseMatrixToPool(m_saveInvStdDev, matrixPool);
ReleaseMatrixToPool(m_savedMean, matrixPool);
ReleaseMatrixToPool(m_savedInvStdDev, matrixPool);
ReleaseMatrixToPool(m_dScale, matrixPool);
ReleaseMatrixToPool(m_dBias, matrixPool);
}
@ -2035,7 +2047,7 @@ private:
// REVIEW alexeyk: if this works, document it properly in Wiki.
double m_blendTimeConst;
// Epsilon used to compute inverse standard deviation (m_saveInvStdDev).
// Epsilon used to compute inverse standard deviation (m_savedInvStdDev).
double m_epsilon;
// Whether to use CNTK or cuDNN BN implementation.
bool m_useCntkEngine;
@ -2048,8 +2060,8 @@ private:
size_t m_samplesSeen;
// Interpolated actual mean/inverse stddev values. Pre-computed on forward pass, also used in gradient computation.
shared_ptr<Matrix<ElemType>> m_saveMean;
shared_ptr<Matrix<ElemType>> m_saveInvStdDev;
shared_ptr<Matrix<ElemType>> m_savedMean;
shared_ptr<Matrix<ElemType>> m_savedInvStdDev;
// Temp buffer for scale and bias derivatives. Only used in BackpropTo(), carrying info from first call to subsequent calls.
// Not used for blendFactor=1 in CNTK engine.
shared_ptr<Matrix<ElemType>> m_dScale;
@ -2057,7 +2069,7 @@ private:
std::unique_ptr<BatchNormEngine<ElemType>> m_bnEng;
bool m_convertRunningVariance;
bool m_convertRunningVariancePending;
};
template class BatchNormalizationNode<float>;

View file

@ -10,14 +10,16 @@
namespace Microsoft { namespace MSR { namespace CNTK {
template <class ElemType>
void BatchNormEngine<ElemType>::Forward(const Mat& in, const Mat& scale, const Mat& bias, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance,
Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev)
void BatchNormEngine<ElemType>::Forward(const Mat& in, const Mat& scale, const Mat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance,
Mat& out, double epsilon, Mat& savedMean, Mat& savedInvStdDev)
{
assert(in.GetNumRows() == m_inOutT.GetNumElements());
assert(out.GetNumRows() == m_inOutT.GetNumElements());
assert(in.GetNumCols() == out.GetNumCols());
assert(std::isfinite(expAvgFactor) && (0 <= expAvgFactor && expAvgFactor <= 1));
assert(std::isfinite(blendFactor) && (0 <= blendFactor && blendFactor <= 1));
// In inference mode, must only use running statistics
assert(!inferenceOnly || ((expAvgFactor == 0.0) && (blendFactor == 1.0)));
assert(std::isfinite(epsilon) && epsilon > 0);
if (!m_spatial)
{
@ -39,28 +41,35 @@ void BatchNormEngine<ElemType>::Forward(const Mat& in, const Mat& scale, const M
assert(runVariance.GetNumCols() == 1);
EnsureCompatible();
ForwardCore(in, scale, bias, expAvgFactor, blendFactor, runMean, runVariance, out, epsilon, saveMean, saveInvStdDev);
ForwardCore(in, scale, bias, inferenceOnly, expAvgFactor, blendFactor, runMean, runVariance, out, epsilon, savedMean, savedInvStdDev);
if (!inferenceOnly)
{
assert(!savedMean.IsEmpty());
assert(!savedInvStdDev.IsEmpty());
if (!m_spatial)
{
assert(saveMean.GetNumElements() == 0 || m_inOutT.GetNumElements() == saveMean.GetNumRows());
assert(saveInvStdDev.GetNumElements() == 0 || m_inOutT.GetNumElements() == saveInvStdDev.GetNumRows());
assert(m_inOutT.GetNumElements() == savedMean.GetNumRows());
assert(m_inOutT.GetNumElements() == savedInvStdDev.GetNumRows());
}
else
{
assert(saveMean.GetNumElements() == 0 || (m_inOutT.GetNumElements() % saveMean.GetNumRows()) == 0);
assert(saveInvStdDev.GetNumElements() == 0 || (m_inOutT.GetNumElements() % saveInvStdDev.GetNumRows()) == 0);
assert((m_inOutT.GetNumElements() % savedMean.GetNumRows()) == 0);
assert((m_inOutT.GetNumElements() % savedInvStdDev.GetNumRows()) == 0);
}
assert(savedMean.GetNumCols() == 1);
assert(savedInvStdDev.GetNumCols() == 1);
}
assert(saveMean.GetNumElements() == 0 || saveMean.GetNumCols() == 1);
assert(saveInvStdDev.GetNumElements() == 0 || saveInvStdDev.GetNumCols() == 1);
}
template <class ElemType>
void BatchNormEngine<ElemType>::Backward(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor,
const Mat& saveMean, const Mat& saveInvStdDev, Mat& scaleGrad, Mat& biasGrad)
const Mat& savedMean, const Mat& savedInvStdDev, Mat& scaleGrad, Mat& biasGrad)
{
assert(!savedMean.IsEmpty());
assert(!savedInvStdDev.IsEmpty());
EnsureCompatible();
BackwardCore(in, srcGrad, grad, scale, blendFactor, saveMean, saveInvStdDev, scaleGrad, biasGrad);
BackwardCore(in, srcGrad, grad, scale, blendFactor, savedMean, savedInvStdDev, scaleGrad, biasGrad);
}
template <class ElemType>
@ -89,24 +98,23 @@ protected:
InvalidArgument("CNTK batch normalization supports only cudnn(CHW) layout.");
}
void ForwardCore(const Mat& in, const Mat& scale, const Mat& bias, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance,
Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev) override
void ForwardCore(const Mat& in, const Mat& scale, const Mat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance,
Mat& out, double epsilon, Mat& savedMean, Mat& savedInvStdDev) override
{
in.BatchNormalizationForward(scale, bias, expAvgFactor, blendFactor, runMean, runVariance, out, epsilon, saveMean, saveInvStdDev);
in.BatchNormalizationForward(scale, bias, inferenceOnly, expAvgFactor, blendFactor, runMean, runVariance, out, epsilon, savedMean, savedInvStdDev);
}
void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, const Mat& saveMean, const Mat& saveInvStdDev,
void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, const Mat& savedMean, const Mat& savedInvStdDev,
Mat& scaleGrad, Mat& biasGrad) override
{
srcGrad.BatchNormalizationBackward(in, grad, scale, blendFactor, saveMean, saveInvStdDev, scaleGrad, biasGrad);
srcGrad.BatchNormalizationBackward(in, grad, scale, blendFactor, savedMean, savedInvStdDev, scaleGrad, biasGrad);
}
};
template class CntkBatchNormEngine<float>;
template class CntkBatchNormEngine<double>;
template <typename T>
bool HasFlag(T src, T testFlag)
template <typename T> bool HasFlag(T src, T testFlag)
{
return ((int)src & (int)testFlag) != 0;
}

View file

@ -34,7 +34,7 @@ public:
public:
virtual ~BatchNormEngine() = default;
void Forward(const Mat& in, const Mat& scale, const Mat& bias, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance,
void Forward(const Mat& in, const Mat& scale, const Mat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance,
Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev);
void Backward(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, const Mat& saveMean, const Mat& saveInvStdDev,
@ -56,7 +56,7 @@ protected:
virtual void EnsureCompatible() = 0;
// saveMean/saveInvStdDev return the actual mean/stddev used for normalization, except for blendFactor=1, these are unused and untouched
virtual void ForwardCore(const Mat& in, const Mat& scale, const Mat& bias, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance,
virtual void ForwardCore(const Mat& in, const Mat& scale, const Mat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance,
Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev) = 0;
virtual void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, const Mat& saveMean, const Mat& saveInvStdDev,

View file

@ -4372,20 +4372,19 @@ void CPUMatrix<ElemType>::AveragePoolingBackward(const CPUMatrix<int>& mpRowCol,
}
template <class ElemType>
void CPUMatrix<ElemType>::BatchNormalizationForward(const CPUMatrix<ElemType>& scale, const CPUMatrix<ElemType>& bias, double expAvgFactor, double blendFactor,
void CPUMatrix<ElemType>::BatchNormalizationForward(const CPUMatrix<ElemType>& scale, const CPUMatrix<ElemType>& bias, bool inferenceOnly, double expAvgFactor, double blendFactor,
CPUMatrix<ElemType>& runMean, CPUMatrix<ElemType>& runVariance, CPUMatrix<ElemType>& out, double epsilon,
CPUMatrix<ElemType>& saveMean, CPUMatrix<ElemType>& saveInvStdDev) const
{
assert((GetNumRows() % scale.GetNumRows()) == 0);
if (expAvgFactor != 0 || blendFactor != 1)
if (!inferenceOnly || expAvgFactor != 0 || blendFactor != 1)
RuntimeError("Batch normalization training on CPU is not yet implemented.");
saveMean.Resize(0, 0); // only doing inference: these two are not produced
saveInvStdDev.Resize(0, 0);
bool spatial = GetNumRows() != scale.GetNumRows();
size_t batchSize = GetNumCols();
if (spatial)
{
size_t spatialSize = GetNumRows() / scale.GetNumRows();
@ -4395,7 +4394,7 @@ void CPUMatrix<ElemType>::BatchNormalizationForward(const CPUMatrix<ElemType>& s
for (long irow = 0; irow < out.GetNumRows(); irow++)
{
size_t imap = irow / spatialSize;
ElemType stdDev = sqrt(runVariance(imap, 0) * (batchSize - 1) / batchSize + epsilon);
ElemType stdDev = sqrt(runVariance(imap, 0) + epsilon);
out(irow, icol) = scale(imap, 0) * ((*this)(irow, icol) - runMean(imap, 0)) / stdDev + bias(imap, 0);
}
}
@ -4407,7 +4406,7 @@ void CPUMatrix<ElemType>::BatchNormalizationForward(const CPUMatrix<ElemType>& s
{
for (long irow = 0; irow < out.GetNumRows(); irow++)
{
ElemType stdDev = sqrt(runVariance(irow, 0) * (batchSize - 1) / batchSize + epsilon);
ElemType stdDev = sqrt(runVariance(irow, 0) + epsilon);
out(irow, icol) = scale(irow, 0) * ((*this)(irow, icol) - runMean(irow, 0)) / stdDev + bias(irow, 0);
}
}

View file

@ -27,8 +27,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
double logadd(double x, double y);
//To comply with BLAS libraries matrices are stored in ColMajor. However, by default C/C++/C# use RowMajor
//conversion is need when passing data between CPUMatrix and C++ matrices
// To comply with BLAS libraries, matrices are stored in ColMajor. However, by default C/C++/C# use RowMajor;
// a conversion is needed when passing data between CPUMatrix and C++ matrices.
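A small self-contained illustration of the layout difference described in the comment above (illustrative only; array names are made up):

```cpp
// The same 2x3 matrix in the two storage orders.
// ColMajor (BLAS / CPUMatrix): element (row, col) lives at data[col * numRows + row].
// RowMajor (typical C/C++ arrays): element (row, col) lives at data[row * numCols + col].
#include <cstdio>

int main()
{
    const int rows = 2, cols = 3;
    double colMajor[rows * cols] = {1, 4, 2, 5, 3, 6}; // columns stored contiguously
    double rowMajor[rows * cols] = {1, 2, 3, 4, 5, 6}; // rows stored contiguously
    for (int r = 0; r < rows; ++r)
        for (int c = 0; c < cols; ++c)
            std::printf("(%d,%d): colMajor=%g rowMajor=%g\n",
                        r, c, colMajor[c * rows + r], rowMajor[r * cols + c]);
    return 0;
}
```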
template <class ElemType>
class MATH_API CPUMatrix : public BaseMatrix<ElemType>
{
@ -375,7 +375,7 @@ public:
void AveragePoolingBackward(const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIndices, const CPUMatrix<int>& indices,
CPUMatrix<ElemType>& grad) const;
void BatchNormalizationForward(const CPUMatrix<ElemType>& scale, const CPUMatrix<ElemType>& bias, double expAvgFactor, double blendFactor, CPUMatrix<ElemType>& runMean, CPUMatrix<ElemType>& runStdDev,
void BatchNormalizationForward(const CPUMatrix<ElemType>& scale, const CPUMatrix<ElemType>& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, CPUMatrix<ElemType>& runMean, CPUMatrix<ElemType>& runVariance,
CPUMatrix<ElemType>& out, double epsilon, CPUMatrix<ElemType>& saveMean, CPUMatrix<ElemType>& saveInvStdDev) const;
void BatchNormalizationBackward(const CPUMatrix<ElemType>& in, CPUMatrix<ElemType>& grad, const CPUMatrix<ElemType>& scale, double blendFactor, const CPUMatrix<ElemType>& saveMean, const CPUMatrix<ElemType>& saveInvStdDev,
CPUMatrix<ElemType>& scaleGrad, CPUMatrix<ElemType>& biasGrad) const;

View file

@ -203,7 +203,7 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize,
assert(::isfinite(epsilon) && epsilon > 0);
assert(::isfinite(expAvgFactor) && 0 <= expAvgFactor && expAvgFactor <= 1);
assert(::isfinite(blendFactor) && 0 <= blendFactor && blendFactor <= 1);
assert(expAvgFactor != 0 || blendFactor != 1); // otherwise no need call (no update)
// TODO optimize for (expAvgFactor == 0) && (blendFactor == 1)
int irowSrcBase = (blockIdx.x * BlockDimX + threadIdx.x) * U;
if (irowSrcBase >= vectorSize)
@ -346,7 +346,7 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize,
im2[k] = Operations::RSqrt(static_cast<ElemType>(m2[k] / batchSize + epsilon));
if (blendFactor != 0)
{
ElemType runInvStdDev = Operations::RSqrt(static_cast<ElemType>(run[k] * (batchSize - 1) / batchSize + epsilon));
ElemType runInvStdDev = Operations::RSqrt(static_cast<ElemType>(run[k] + epsilon));
im2[k] = blendFactor * runInvStdDev + (1.0 - blendFactor) * im2[k];
}
}
@ -376,7 +376,7 @@ __global__ void kComputeSpatialBatchMeanAndInvStdDev(int vectorSize, int spatial
assert((vectorSize % spatialSize) == 0);
assert(::isfinite(expAvgFactor) && 0 <= expAvgFactor && expAvgFactor <= 1);
assert(::isfinite(blendFactor) && 0 <= blendFactor && blendFactor <= 1);
assert(expAvgFactor != 0 || blendFactor != 1); // otherwise no need call (no update)
// TODO optimize for (expAvgFactor == 0) && (blendFactor == 1)
assert(::isfinite(epsilon) && epsilon > 0);
int irowSrcBase = blockIdx.x * spatialSize + threadIdx.x * U;
@ -506,7 +506,7 @@ __global__ void kComputeSpatialBatchMeanAndInvStdDev(int vectorSize, int spatial
xInvStdDev[blockIdx.x] = Operations::RSqrt(static_cast<ElemType>(m2[0] / (batchSize * spatialSize) + epsilon));
if (blendFactor != 0)
{
ElemType runInvStdDev = Operations::RSqrt(static_cast<ElemType>(runVariance[blockIdx.x] * (batchSize - 1) / batchSize + epsilon));
ElemType runInvStdDev = Operations::RSqrt(static_cast<ElemType>(runVariance[blockIdx.x] + epsilon));
xInvStdDev[blockIdx.x] = blendFactor * runInvStdDev + (1.0 - blendFactor) * xInvStdDev[blockIdx.x];
}
}
@ -729,7 +729,7 @@ struct NormalizeBatchTraining
template <int BlockDimX, int BlockDimY, int U, typename ElemType>
__global__ void kComputeScaleAndBiasGradients(int vectorSize, int batchSize, const ElemType* x, const ElemType* dy, ElemType* dScale, ElemType* dBias,
const ElemType* saveMean, const ElemType* saveInvStdDev)
const ElemType* savedMean, const ElemType* savedInvStdDev)
{
static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32).");
@ -754,8 +754,8 @@ __global__ void kComputeScaleAndBiasGradients(int vectorSize, int batchSize, con
// Read mean and inv std dev.
if (threadIdx.y == 0)
{
LoadValues<U>(saveMean + irowSrcBase, mean);
LoadValues<U>(saveInvStdDev + irowSrcBase, invStdDev);
LoadValues<U>(savedMean + irowSrcBase, mean);
LoadValues<U>(savedInvStdDev + irowSrcBase, invStdDev);
StoreValues<U>(mean, &meanS[threadIdx.x * U]);
StoreValues<U>(invStdDev, &invStdDevS[threadIdx.x * U]);
}
@ -833,7 +833,7 @@ __global__ void kComputeScaleAndBiasGradients(int vectorSize, int batchSize, con
template <int BlockDimX, int BlockDimY, int U, typename ElemType>
__global__ void kComputeSpatialScaleAndBiasGradients(int vectorSize, int spatialSize, int batchSize, const ElemType* x, const ElemType* dy,
ElemType* dScale, ElemType* dBias, const ElemType* saveMean, const ElemType* saveInvStdDev)
ElemType* dScale, ElemType* dBias, const ElemType* savedMean, const ElemType* savedInvStdDev)
{
static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32).");
@ -859,8 +859,8 @@ __global__ void kComputeSpatialScaleAndBiasGradients(int vectorSize, int spatial
// Read mean and inv std dev.
if (tid == 0)
{
meanS = saveMean[blockIdx.x];
invStdDevS = saveInvStdDev[blockIdx.x];
meanS = savedMean[blockIdx.x];
invStdDevS = savedInvStdDev[blockIdx.x];
}
__syncthreads();
if (tid != 0)
@ -922,7 +922,7 @@ struct ComputeScaleAndBiasGradients
{
template <typename ElemType>
static void Call(size_t vectorSize, size_t batchSize, const ElemType* x, const ElemType* dy,
ElemType* dScale, ElemType* dBias, const ElemType* saveMean, const ElemType* saveInvStdDev, cudaStream_t stream)
ElemType* dScale, ElemType* dBias, const ElemType* savedMean, const ElemType* savedInvStdDev, cudaStream_t stream)
{
assert((vectorSize % U) == 0);
const int BlockDimX = 32 / U;
@ -931,7 +931,7 @@ struct ComputeScaleAndBiasGradients
// Create a grid that uses striding in the y-dimension to cover the whole minibatch.
auto gdim = dim3(static_cast<unsigned int>(RoundUpToMultiple(vectorSize, BlockDimX * U)));
kComputeScaleAndBiasGradients<BlockDimX, BlockDimY, U><<<gdim, bdim, 0, stream>>>(
static_cast<int>(vectorSize), static_cast<int>(batchSize), x, dy, dScale, dBias, saveMean, saveInvStdDev);
static_cast<int>(vectorSize), static_cast<int>(batchSize), x, dy, dScale, dBias, savedMean, savedInvStdDev);
}
};
@ -940,7 +940,7 @@ struct ComputeSpatialScaleAndBiasGradients
{
template <typename ElemType>
static void Call(size_t vectorSize, size_t spatialSize, size_t batchSize, const ElemType* x, const ElemType* dy,
ElemType* dScale, ElemType* dBias, const ElemType* saveMean, const ElemType* saveInvStdDev, cudaStream_t stream)
ElemType* dScale, ElemType* dBias, const ElemType* savedMean, const ElemType* savedInvStdDev, cudaStream_t stream)
{
assert((spatialSize % U) == 0);
assert((vectorSize % spatialSize) == 0);
@ -951,7 +951,7 @@ struct ComputeSpatialScaleAndBiasGradients
// Create a grid that uses striding in the y-dimension to cover the whole minibatch.
auto gdim = dim3(static_cast<unsigned int>(vectorSize / spatialSize));
kComputeSpatialScaleAndBiasGradients<BlockDimX, BlockDimY, U><<<gdim, bdim, 0, stream>>>(
static_cast<int>(vectorSize), static_cast<int>(spatialSize), static_cast<int>(batchSize), x, dy, dScale, dBias, saveMean, saveInvStdDev);
static_cast<int>(vectorSize), static_cast<int>(spatialSize), static_cast<int>(batchSize), x, dy, dScale, dBias, savedMean, savedInvStdDev);
}
};
@ -959,7 +959,7 @@ struct ComputeSpatialScaleAndBiasGradients
template <int BlockDimX, int BlockDimY, bool Spatial, int U, typename ElemType>
__global__ void kBackpropagateBatchNormGradients(int vectorSize, int spatialSize, int batchSize, const ElemType* x, const ElemType* dy, ElemType* dx,
const ElemType* bnScale, ElemType mbStatsWeight, const ElemType* dScale, const ElemType* dBias,
const ElemType* saveMean, const ElemType* saveInvStdDev)
const ElemType* savedMean, const ElemType* savedInvStdDev)
{
static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32).");
@ -991,8 +991,8 @@ __global__ void kBackpropagateBatchNormGradients(int vectorSize, int spatialSize
scale[k] = bnScale[imap];
ds[k] = dScale[imap];
db[k] = dBias[imap];
mean[k] = saveMean[imap];
invStdDev[k] = saveInvStdDev[imap];
mean[k] = savedMean[imap];
invStdDev[k] = savedInvStdDev[imap];
}
}
else
@ -1000,8 +1000,8 @@ __global__ void kBackpropagateBatchNormGradients(int vectorSize, int spatialSize
LoadValues<U>(bnScale + irowBase, scale);
LoadValues<U>(dScale + irowBase, ds);
LoadValues<U>(dBias + irowBase, db);
LoadValues<U>(saveMean + irowBase, mean);
LoadValues<U>(saveInvStdDev + irowBase, invStdDev);
LoadValues<U>(savedMean + irowBase, mean);
LoadValues<U>(savedInvStdDev + irowBase, invStdDev);
}
int icol = blockIdx.y * BlockDimY + threadIdx.y;
@ -1053,7 +1053,7 @@ struct BackpropagateBatchNormGradients
template <typename ElemType>
static void Call(size_t vectorSize, size_t spatialSize, size_t batchSize, bool spatial, const ElemType* x, const ElemType* dy, ElemType* dx,
const ElemType* bnScale, ElemType mbStatsWeight, const ElemType* dScale,
const ElemType* dBias, const ElemType* saveMean, const ElemType* saveInvStdDev, cudaStream_t stream)
const ElemType* dBias, const ElemType* savedMean, const ElemType* savedInvStdDev, cudaStream_t stream)
{
assert((vectorSize % U) == 0);
const int BlockDimX = 32 / U;
@ -1064,12 +1064,12 @@ struct BackpropagateBatchNormGradients
if (spatial)
{
kBackpropagateBatchNormGradients<BlockDimX, BlockDimY, true/*spatial*/, U><<<gdim, bdim, 0, stream>>>(
static_cast<int>(vectorSize), static_cast<int>(spatialSize), static_cast<int>(batchSize), x, dy, dx, bnScale, mbStatsWeight, dScale, dBias, saveMean, saveInvStdDev);
static_cast<int>(vectorSize), static_cast<int>(spatialSize), static_cast<int>(batchSize), x, dy, dx, bnScale, mbStatsWeight, dScale, dBias, savedMean, savedInvStdDev);
}
else
{
kBackpropagateBatchNormGradients<BlockDimX, BlockDimY, false/*not spatial*/, U><<<gdim, bdim, 0, stream>>>(
static_cast<int>(vectorSize), static_cast<int>(spatialSize), static_cast<int>(batchSize), x, dy, dx, bnScale, mbStatsWeight, dScale, dBias, saveMean, saveInvStdDev);
static_cast<int>(vectorSize), static_cast<int>(spatialSize), static_cast<int>(batchSize), x, dy, dx, bnScale, mbStatsWeight, dScale, dBias, savedMean, savedInvStdDev);
}
}
};

View file

@ -42,9 +42,11 @@ protected:
InvalidArgument("cuDNN batch normalization supports tensors of max 4 dimensions.");
}
void ForwardCore(const Mat& in, const Mat& scale, const Mat& bias, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runStdDev,
Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev) override
void ForwardCore(const Mat& in, const Mat& scale, const Mat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance,
Mat& out, double epsilon, Mat& savedMean, Mat& savedInvStdDev) override
{
UNUSED(inferenceOnly); // TODO
// REVIEW alexeyk: there might be a way to do this in cuDNN.
if (blendFactor != 0 && (blendFactor != 1 || expAvgFactor > 0))
InvalidArgument("cuDNN batch normalization engine currently supports blendTimeConstant of 0 or 1 only.");
@ -53,25 +55,25 @@ protected:
cudnnBatchNormMode_t mode = m_spatial ? CUDNN_BATCHNORM_SPATIAL : CUDNN_BATCHNORM_PER_ACTIVATION;
// cuDNN will fail with BAD_PARAM if epsilon < CUDNN_BN_MIN_EPSILON.
epsilon = max(epsilon, CUDNN_BN_MIN_EPSILON);
// expAvgFactor == 0 && blendFactor == 1 means we are in inference mode.
if (expAvgFactor == 0 && blendFactor == 1)
if (inferenceOnly)
{
saveMean.Resize(0, 0); // (these are not produced in this case)
saveInvStdDev.Resize(0, 0);
assert(expAvgFactor == 0 && blendFactor == 1);
savedMean.Resize(0, 0); // (these are not produced in this case)
savedInvStdDev.Resize(0, 0);
CUDNN_CALL(cudnnBatchNormalizationForwardInference(*m_cudnn, mode, &C::One, &C::Zero, m_inOutCuDnnT, ptr(in), m_inOutCuDnnT, ptr(out),
m_scaleBiasCuDnnT, ptr(scale), ptr(bias), ptr(runMean), ptr(runStdDev), epsilon));
m_scaleBiasCuDnnT, ptr(scale), ptr(bias), ptr(runMean), ptr(runVariance), epsilon));
}
else
{
saveMean.Resize(runMean);
saveInvStdDev.Resize(runMean);
savedMean.Resize(runMean);
savedInvStdDev.Resize(runMean);
CUDNN_CALL(cudnnBatchNormalizationForwardTraining(*m_cudnn, mode, &C::One, &C::Zero, m_inOutCuDnnT, ptr(in),
m_inOutCuDnnT, ptr(out), m_scaleBiasCuDnnT, ptr(scale), ptr(bias), expAvgFactor, ptr(runMean), ptr(runStdDev),
epsilon, ptr(saveMean), ptr(saveInvStdDev)));
m_inOutCuDnnT, ptr(out), m_scaleBiasCuDnnT, ptr(scale), ptr(bias), expAvgFactor, ptr(runMean), ptr(runVariance),
epsilon, ptr(savedMean), ptr(savedInvStdDev)));
}
}
void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, const Mat& saveMean, const Mat& saveInvStdDev,
void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, const Mat& savedMean, const Mat& savedInvStdDev,
Mat& scaleGrad, Mat& biasGrad) override
{
UNUSED(blendFactor); // BUGBUG: It should be used.
@ -79,7 +81,7 @@ protected:
cudnnBatchNormMode_t mode = m_spatial ? CUDNN_BATCHNORM_SPATIAL : CUDNN_BATCHNORM_PER_ACTIVATION;
// REVIEW alexeyk: change betaParamDiff to 1 and update CNTK BN engine.
CUDNN_CALL(cudnnBatchNormalizationBackward(*m_cudnn, mode, &C::One, &C::One, &C::One, &C::Zero, m_inOutCuDnnT, ptr(in), m_inOutCuDnnT, ptr(srcGrad), m_inOutCuDnnT, ptr(grad),
m_scaleBiasCuDnnT, ptr(scale), ptr(scaleGrad), ptr(biasGrad), CUDNN_BN_MIN_EPSILON, ptr(saveMean), ptr(saveInvStdDev)));
m_scaleBiasCuDnnT, ptr(scale), ptr(scaleGrad), ptr(biasGrad), CUDNN_BN_MIN_EPSILON, ptr(savedMean), ptr(savedInvStdDev)));
}
private:

View file

@ -9,7 +9,7 @@
#include "TensorShape.h"
#include <cudnn.h>
#if CUDNN_MAJOR < 5
#error CNTK needs CuDNN version 5.0 or higher, cf. https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Windows#cudnn or https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Linux#cudnn.
#error CNTK requires the NVIDIA cuDNN library 5.0 or higher to build, cf. https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Windows#cudnn or https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Linux#cudnn for installation instructions.
#endif
#include <memory>

View file

@ -3160,12 +3160,13 @@ void GPUMatrix<ElemType>::AveragePoolingBackward(const GPUMatrix<int>& mpRowCol,
Data(), (int)GetNumRows(), grad.Data(), (int)grad.GetNumRows());
}
// returns saveMean/saveInvStdDev which are the actual values used to perform the normalization, except for blendFactor 1, in which case they are unused and set to empty
// returns savedMean/savedInvStdDev which are the actual values used to perform the normalization, except for blendFactor 1, in which case they are unused and set to empty
template <class ElemType>
void GPUMatrix<ElemType>::BatchNormalizationForward(const GPUMatrix<ElemType>& scale, const GPUMatrix<ElemType>& bias, double expAvgFactor, double blendFactor,
void GPUMatrix<ElemType>::BatchNormalizationForward(const GPUMatrix<ElemType>& scale, const GPUMatrix<ElemType>& bias, bool inferenceOnly, double expAvgFactor, double blendFactor,
GPUMatrix<ElemType>& runMean, GPUMatrix<ElemType>& runVariance, GPUMatrix<ElemType>& out, double epsilon,
GPUMatrix<ElemType>& saveMean, GPUMatrix<ElemType>& saveInvStdDev) const
GPUMatrix<ElemType>& savedMean, GPUMatrix<ElemType>& savedInvStdDev) const
{
UNUSED(inferenceOnly); // TODO
assert((GetNumRows() % scale.GetNumRows()) == 0);
bool spatial = GetNumRows() != scale.GetNumRows();
@ -3178,52 +3179,54 @@ void GPUMatrix<ElemType>::BatchNormalizationForward(const GPUMatrix<ElemType>& s
assert(0 < batchSize && batchSize <= std::numeric_limits<int>::max());
SyncGuard syncGuard;
if (expAvgFactor > 0 || blendFactor < 1)
//if (expAvgFactor > 0 || blendFactor < 1)
if (inferenceOnly)
{
// Compute data mean and inverse standard deviation (into saveMean and
// saveInvStdDev), and update running mean and variance.
// Pick running statistics for normalizing. No update required, and
// saved statistics do not need to be produced.
assert(expAvgFactor == 0 && blendFactor == 1);
normalizeRunningStats = true;
savedMean.RequireSize(0, 0);
savedInvStdDev.RequireSize(0, 0);
}
else
{
// Compute data mean and inverse standard deviation (into savedMean and
// savedInvStdDev), and update running mean and variance.
// TODO expAvgFactor == 0 && blendFactor == 1 can be optimized (no need for update).
normalizeRunningStats = false;
saveMean.RequireSize(runMean);
saveInvStdDev.RequireSize(runMean);
savedMean.RequireSize(runMean);
savedInvStdDev.RequireSize(runMean);
if (spatial)
{
Call<ComputeSpatialBatchMeanAndInvStdDev, ElemType>(spatialSize, vectorSize, spatialSize, batchSize, Data(),
expAvgFactor, blendFactor,
runMean.Data(), runVariance.Data(), epsilon,
saveMean.Data(), saveInvStdDev.Data(), GetStream());
savedMean.Data(), savedInvStdDev.Data(), GetStream());
}
else
{
Call<ComputeBatchMeanAndInvStdDev, ElemType>(vectorSize, vectorSize, batchSize, Data(),
expAvgFactor, blendFactor,
runMean.Data(), runVariance.Data(), epsilon,
saveMean.Data(), saveInvStdDev.Data(), GetStream());
savedMean.Data(), savedInvStdDev.Data(), GetStream());
}
}
else
{
// With expAvgFactor == 0 and blendFactor == 1 the running statistics
// do not need to be updated. CNTK engine in this case returns saveMean
// and saveInvStdDev empty, but cuDNN engine does not.
normalizeRunningStats = true;
saveMean.RequireSize(0, 0);
saveInvStdDev.RequireSize(0, 0);
}
Call<NormalizeBatchTraining, ElemType>(spatial ? spatialSize : vectorSize, vectorSize, spatialSize, batchSize, spatial,
normalizeRunningStats, epsilon,
Data(), out.Data(),
scale.Data(), bias.Data(),
runMean.Data(), runVariance.Data(),
saveMean.Data(), saveInvStdDev.Data(),
savedMean.Data(), savedInvStdDev.Data(),
GetStream());
}
// saveMean/saveInvStdDev are the interpolated mean/inverse standard deviation as used in ForwardProp().
// savedMean/savedInvStdDev are the interpolated mean/inverse standard deviation as used in ForwardProp().
// For blendFactor=1, they are not used and can be uninitialized or empty.
template <class ElemType>
void GPUMatrix<ElemType>::BatchNormalizationBackward(const GPUMatrix<ElemType>& in, GPUMatrix<ElemType>& grad, const GPUMatrix<ElemType>& scale, double blendFactor,
const GPUMatrix<ElemType>& saveMean, const GPUMatrix<ElemType>& saveInvStdDev,
const GPUMatrix<ElemType>& savedMean, const GPUMatrix<ElemType>& savedInvStdDev,
GPUMatrix<ElemType>& scaleGrad, GPUMatrix<ElemType>& biasGrad) const
{
assert((GetNumRows() % scale.GetNumRows()) == 0);
@ -3240,16 +3243,16 @@ void GPUMatrix<ElemType>::BatchNormalizationBackward(const GPUMatrix<ElemType>&
if (spatial)
{
Call<ComputeSpatialScaleAndBiasGradients, ElemType>(spatialSize, vectorSize, spatialSize, batchSize, in.Data(), Data(), scaleGrad.Data(), biasGrad.Data(),
saveMean.Data(), saveInvStdDev.Data(), GetStream());
savedMean.Data(), savedInvStdDev.Data(), GetStream());
}
else
{
Call<ComputeScaleAndBiasGradients, ElemType>(vectorSize, vectorSize, batchSize, in.Data(), Data(), scaleGrad.Data(), biasGrad.Data(),
saveMean.Data(), saveInvStdDev.Data(), GetStream());
savedMean.Data(), savedInvStdDev.Data(), GetStream());
}
ElemType mbStatsWeight = (ElemType)(1 - blendFactor); // weight for contribution from actual MB stats (0 if none, e.g. locked BN node)
Call<BackpropagateBatchNormGradients, ElemType>(spatial ? spatialSize : vectorSize, vectorSize, spatialSize, batchSize, spatial,
in.Data(), Data(), grad.Data(), scale.Data(), mbStatsWeight, scaleGrad.Data(), biasGrad.Data(), saveMean.Data(), saveInvStdDev.Data(), GetStream());
in.Data(), Data(), grad.Data(), scale.Data(), mbStatsWeight, scaleGrad.Data(), biasGrad.Data(), savedMean.Data(), savedInvStdDev.Data(), GetStream());
}
#pragma region Static BLAS Functions

View file

@ -467,8 +467,8 @@ public:
void AveragePoolingForward(const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIndices, const GPUMatrix<int>& indices, GPUMatrix<ElemType>& output) const;
void AveragePoolingBackward(const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIndices, const GPUMatrix<int>& indices, GPUMatrix<ElemType>& grad) const;
void BatchNormalizationForward(const GPUMatrix<ElemType>& scale, const GPUMatrix<ElemType>& bias, double expAvgFactor, double blendFactor,
GPUMatrix<ElemType>& runMean, GPUMatrix<ElemType>& runStdDev, GPUMatrix<ElemType>& out, double epsilon,
void BatchNormalizationForward(const GPUMatrix<ElemType>& scale, const GPUMatrix<ElemType>& bias, bool inferenceOnly, double expAvgFactor, double blendFactor,
GPUMatrix<ElemType>& runMean, GPUMatrix<ElemType>& runVariance, GPUMatrix<ElemType>& out, double epsilon,
GPUMatrix<ElemType>& saveMean, GPUMatrix<ElemType>& saveInvStdDev) const;
void BatchNormalizationBackward(const GPUMatrix<ElemType>& in, GPUMatrix<ElemType>& grad, const GPUMatrix<ElemType>& scale, double blendFactor,
const GPUMatrix<ElemType>& saveMean, const GPUMatrix<ElemType>& saveInvStdDev,

View file

@ -197,6 +197,6 @@ if exist "$(CuDnnDll)" xcopy /D /Y "$(CuDnnDll)" "$(OutputPath)"
</ImportGroup>
<Target Name="CheckDependencies">
<Error Condition="!Exists('$(CUB_PATH)')" Text="CNTK requires the NVIDIA CUB library to build. Please see https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Windows#cuda for installation instructions." />
<Error Condition="!Exists('$(CUDNN_PATH)')" Text="CNTK requires the NVIDIA cuDNN library to build. Please see https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Windows#cuda for installation instructions." />
<Error Condition="!Exists('$(CUDNN_PATH)')" Text="CNTK requires the NVIDIA cuDNN library 5.0 or higher to build. Please see https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Windows#cuda for installation instructions." />
</Target>
</Project>

View file

@ -4274,8 +4274,8 @@ void Matrix<ElemType>::AveragePoolingBackward(const Matrix<int>& mpRowCol, const
}
template <class ElemType>
void Matrix<ElemType>::BatchNormalizationForward(const Matrix<ElemType>& scale, const Matrix<ElemType>& bias, double expAvgFactor, double blendFactor,
Matrix<ElemType>& runMean, Matrix<ElemType>& runStdDev, Matrix<ElemType>& out, double epsilon,
void Matrix<ElemType>::BatchNormalizationForward(const Matrix<ElemType>& scale, const Matrix<ElemType>& bias, bool inferenceOnly, double expAvgFactor, double blendFactor,
Matrix<ElemType>& runMean, Matrix<ElemType>& runVariance, Matrix<ElemType>& out, double epsilon,
Matrix<ElemType>& saveMean, Matrix<ElemType>& saveInvStdDev) const
{
DecideAndMoveToRightDevice(*this, out);
@ -4283,11 +4283,11 @@ void Matrix<ElemType>::BatchNormalizationForward(const Matrix<ElemType>& scale,
// REVIEW alexeyk: add sparse version.
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->BatchNormalizationForward(*(scale.m_CPUMatrix), *(bias.m_CPUMatrix), expAvgFactor, blendFactor,
*(runMean.m_CPUMatrix), *(runStdDev.m_CPUMatrix),
m_CPUMatrix->BatchNormalizationForward(*(scale.m_CPUMatrix), *(bias.m_CPUMatrix), inferenceOnly, expAvgFactor, blendFactor,
*(runMean.m_CPUMatrix), *(runVariance.m_CPUMatrix),
*(out.m_CPUMatrix), epsilon, *(saveMean.m_CPUMatrix), *(saveInvStdDev.m_CPUMatrix)),
m_GPUMatrix->BatchNormalizationForward(*(scale.m_GPUMatrix), *(bias.m_GPUMatrix), expAvgFactor, blendFactor,
*(runMean.m_GPUMatrix), *(runStdDev.m_GPUMatrix),
m_GPUMatrix->BatchNormalizationForward(*(scale.m_GPUMatrix), *(bias.m_GPUMatrix), inferenceOnly, expAvgFactor, blendFactor,
*(runMean.m_GPUMatrix), *(runVariance.m_GPUMatrix),
*(out.m_GPUMatrix), epsilon, *(saveMean.m_GPUMatrix), *(saveInvStdDev.m_GPUMatrix)),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);

View file

@ -500,8 +500,8 @@ public:
void AveragePoolingForward(const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIndices, const Matrix<int>& indices, Matrix<ElemType>& output) const;
void AveragePoolingBackward(const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIndices, const Matrix<int>& indices, Matrix<ElemType>& grad) const;
void BatchNormalizationForward(const Matrix<ElemType>& scale, const Matrix<ElemType>& bias, double expAvgFactor, double blendFactor,
Matrix<ElemType>& runMean, Matrix<ElemType>& runStdDev, Matrix<ElemType>& out, double epsilon,
void BatchNormalizationForward(const Matrix<ElemType>& scale, const Matrix<ElemType>& bias, bool inferenceOnly, double expAvgFactor, double blendFactor,
Matrix<ElemType>& runMean, Matrix<ElemType>& runVariance, Matrix<ElemType>& out, double epsilon,
Matrix<ElemType>& saveMean, Matrix<ElemType>& saveInvStdDev) const;
void BatchNormalizationBackward(const Matrix<ElemType>& in, Matrix<ElemType>& grad, const Matrix<ElemType>& scale, double blendFactor, const Matrix<ElemType>& saveMean, const Matrix<ElemType>& saveInvStdDev,
Matrix<ElemType>& scaleGrad, Matrix<ElemType>& biasGrad) const;

View file

@ -1827,7 +1827,7 @@ void GPUMatrix<ElemType>::AveragePoolingBackward(const GPUMatrix<int>& mpRowCol,
template <class ElemType>
void GPUMatrix<ElemType>::BatchNormalizationForward(const GPUMatrix<ElemType>& scale, const GPUMatrix<ElemType>& bias, double expAvgFactor, double blendFactor,
GPUMatrix<ElemType>& runMean, GPUMatrix<ElemType>& runStdDev, GPUMatrix<ElemType>& out, double epsilon,
GPUMatrix<ElemType>& runMean, GPUMatrix<ElemType>& runVariance, GPUMatrix<ElemType>& out, double epsilon,
GPUMatrix<ElemType>& saveMean, GPUMatrix<ElemType>& saveInvStdDev) const
{
}

View file

@ -1,3 +1,7 @@
# Synthetic test for non-spatial batch normalization, (almost) in isolation of
# other nodes (in particular Convolution). This is not a general example and does
# not document best practices. The network was based on MNIST/01_OneHidden.
rootDir = ".."
configDir = "$rootDir$/Config"
@ -17,10 +21,6 @@ traceLevel = 1
batchNormalizationEngine = "testMustOverrideBatchNormalizationEngine"
#######################################
# TRAINING CONFIG #
#######################################
train = [
action = "train"
@ -54,10 +54,6 @@ train = [
]
]
#######################################
# TEST CONFIG #
#######################################
test = [
action = "test"
minibatchSize = 1024 # reduce this if you run out of memory

View file

@ -22,8 +22,8 @@ DNN = [
b = LearnableParameter(hiddenDim, 1, init = fixedValue, value = 0)
sc = LearnableParameter(hiddenDim, 1, init = fixedValue, value = 1)
m = LearnableParameter(hiddenDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
var = LearnableParameter(hiddenDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
y = BatchNormalization(h1, sc, b, m, var, eval=false, spatial=false, normalizationTimeConstant=64, imageLayout=cudnn, engine=$batchNormalizationEngine$)
v = LearnableParameter(hiddenDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
y = BatchNormalization(h1, sc, b, m, v, eval=false, spatial=false, normalizationTimeConstant=64, imageLayout=cudnn, engine=$batchNormalizationEngine$)
ol = DNNLayer(hiddenDim, labelDim, y, 1)

View file

@ -1,3 +1,7 @@
# Synthetic test for spatial batch normalization, (almost) in isolation from
# other nodes (in particular Convolution). This is not a general example and does
# not document best practices. The network is based on CIFAR-10/02_BatchNormConv.
RootDir = "."
ConfigDir = "$RootDir$"

View file

@ -22,8 +22,7 @@ ndlMnistMacros = [
scValue = 1
# Batch normalization time constant.
#bnTimeConst = 4096
bnTimeConst = 2048
bnTimeConst = 0
fc1WScale = 12
fc1BValue = 0
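
Changing bnTimeConst from 2048 to 0 above means the running estimates effectively track only the current minibatch (under the usual time-constant mapping). A worked example, assuming expAvgFactor = 1 - exp(-numSamples / bnTimeConst):

// Worked example (assumption: expAvgFactor = 1 - exp(-numSamples / bnTimeConst)).
#include <cmath>
#include <cstdio>

int main()
{
    const double numSamples = 32;
    const double timeConstants[] = { 2048.0, 0.0 };
    for (double bnTimeConst : timeConstants)
    {
        double expAvgFactor = bnTimeConst > 0 ? 1.0 - std::exp(-numSamples / bnTimeConst) : 1.0;
        // Prints ~0.0155 for 2048 (slow tracking) and 1.0000 for 0 (minibatch only).
        std::printf("bnTimeConst = %6.0f -> expAvgFactor = %.4f\n", bnTimeConst, expAvgFactor);
    }
    return 0;
}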
@ -42,9 +41,9 @@ DNN=[
b = LearnableParameter(cMap1, 1, init = fixedValue, value = conv1BValue)
sc = LearnableParameter(cMap1, 1, init = fixedValue, value = scValue)
m = LearnableParameter(cMap1, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
var = LearnableParameter(cMap1, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
v = LearnableParameter(cMap1, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
y = BatchNormalization(featScaled, sc, b, m, var, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$, engine=$batchNormalizationEngine$)
y = BatchNormalization(featScaled, sc, b, m, v, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$, engine=$batchNormalizationEngine$)
conv1 = RectifiedLinear(y)
# pool1

View file

@ -19,13 +19,13 @@ testCases:
Epochs must be finished with expected results:
patterns:
- Finished Epoch
- CE = {{float,tolerance=2.0%}} * {{integer}}
- Err = {{float,tolerance=2.0%}} * {{integer}}
- CE = {{float,tolerance=0.1%}} * {{integer}}
- Err = {{float,tolerance=0.1%}} * {{integer}}
- totalSamplesSeen = {{integer}}
- learningRatePerSample = {{float,tolerance=0.001%}}
Final test results must match:
patterns:
- "Final Results: Minibatch[{{integer}}-{{integer}}]"
- Err = {{float,tolerance=2.0%}} * {{integer}}
- CE = {{float,tolerance=2.0%}} * {{integer}}
- Err = {{float,tolerance=0.1%}} * {{integer}}
- CE = {{float,tolerance=0.1%}} * {{integer}}
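
The tightened {{float,tolerance=0.1%}} patterns correspond to a relative comparison against the stored baseline; a sketch of that check (the actual test driver may implement it differently):

// Sketch of a relative-tolerance comparison, e.g. tolerancePercent = 0.1 for
// a {{float,tolerance=0.1%}} pattern.
#include <cmath>

inline bool WithinRelativeTolerance(double actual, double baseline, double tolerancePercent)
{
    return std::fabs(actual - baseline) <= std::fabs(baseline) * tolerancePercent / 100.0;
}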

View file

@ -19,13 +19,13 @@ testCases:
Epochs must be finished with expected results:
patterns:
- Finished Epoch
- CE = {{float,tolerance=2.0%}} * {{integer}}
- Err = {{float,tolerance=2.0%}} * {{integer}}
- CE = {{float,tolerance=0.1%}} * {{integer}}
- Err = {{float,tolerance=0.1%}} * {{integer}}
- totalSamplesSeen = {{integer}}
- learningRatePerSample = {{float,tolerance=0.001%}}
Final test results must match:
patterns:
- "Final Results: Minibatch[{{integer}}-{{integer}}]"
- Err = {{float,tolerance=2.0%}} * {{integer}}
- CE = {{float,tolerance=2.0%}} * {{integer}}
- Err = {{float,tolerance=0.1%}} * {{integer}}
- CE = {{float,tolerance=0.1%}} * {{integer}}

View file

@ -21,10 +21,10 @@ ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeCo
b = LearnableParameter(outMap, 1, init = fixedValue, value = bValue)
sc = LearnableParameter(outMap, 1, init = fixedValue, value = scValue)
m = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
var = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
v = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = $imageLayout$)
y = BatchNormalization(c, sc, b, m, var, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$, engine=cudnn)
y = BatchNormalization(c, sc, b, m, v, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$, engine=cudnn)
]
ConvBNLayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, bnTimeConst)
@ -44,10 +44,10 @@ ProjLayer(W, inp, outMap, hStride, vStride, bValue, scValue, bnTimeConst)
b = LearnableParameter(outMap, 1, init = fixedValue, value = bValue)
sc = LearnableParameter(outMap, 1, init = fixedValue, value = scValue)
m = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
var = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
v = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
c = Convolution(W, inp, 1, 1, outMap, hStride, vStride, zeroPadding = false, imageLayout = $imageLayout$)
y = BatchNormalization(c, sc, b, m, var, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$, engine=cudnn)
y = BatchNormalization(c, sc, b, m, v, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$, engine=cudnn)
]
ResNetNode2(inp, outMap, inWCount, kW, kH, wScale, bValue, scValue, bnTimeConst)
@ -113,9 +113,9 @@ DnnBNReLULayer(inDim, outDim, x, wScale, bValue, scValue, bnTimeConst)
b = LearnableParameter(outDim, 1, init = fixedValue, value = bValue)
sc = LearnableParameter(outDim, 1, init = fixedValue, value = scValue)
m = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
var = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
v = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
t = Times(W, x)
bn = BatchNormalization(t, sc, b, m, var, spatial = false, normalizationTimeConstant = bnTimeConst, engine=cudnn)
bn = BatchNormalization(t, sc, b, m, v, spatial = false, normalizationTimeConstant = bnTimeConst, engine=cudnn)
y = RectifiedLinear(bn)
]
@ -125,9 +125,9 @@ DnnImageBNReLULayer(inW, inH, inC, outDim, x, wScale, bValue, scValue, bnTimeCon
b = LearnableParameter(outDim, 1, init = fixedValue, value = bValue)
sc = LearnableParameter(outDim, 1, init = fixedValue, value = scValue)
m = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
var = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
v = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
t = Times(W, x)
bn = BatchNormalization(t, sc, b, m, var, spatial = false, normalizationTimeConstant = bnTimeConst, engine=cudnn)
bn = BatchNormalization(t, sc, b, m, v, spatial = false, normalizationTimeConstant = bnTimeConst, engine=cudnn)
y = RectifiedLinear(bn)
]

View file

@ -32,4 +32,4 @@ testCases:
- "Final Results: Minibatch[{{integer}}-{{integer}}]"
# this part is flaky, needs to be investigated (VSO item #414)
#- Err = {{float,tolerance=2.0%}} * {{integer}}
- CE = {{float,tolerance=2.0%}} * {{integer}}
- CE = {{float,tolerance=4.0%}} * {{integer}}

View file

@ -100,6 +100,7 @@ BOOST_AUTO_TEST_CASE(BatchNormalizationForward)
double expAvg = std::get<3>(cfg);
double blendFactor = 0; // cuDNN supports blendFactor == 0 (train) or 1 (eval) only.
double eps = 1e-5; // CUDNN_BN_MIN_EPSILON
bool inferenceOnly = false;
auto engCudnn = BNEng::Create(baseDeviceId, inOutT, spatial, ImageLayoutKind::CHW, BatchNormEngineKind::CuDnn);
auto engCntk = BNEng::Create(deviceId, inOutT, spatial, ImageLayoutKind::CHW, BatchNormEngineKind::Cntk);
@ -142,12 +143,12 @@ BOOST_AUTO_TEST_CASE(BatchNormalizationForward)
CudaTimer time1;
time1.Start();
engCntk->Forward(in, scale, bias, expAvg, blendFactor, runMean, runInvStdDev, out, eps, saveMean, saveInvStdDev);
engCntk->Forward(in, scale, bias, inferenceOnly, expAvg, blendFactor, runMean, runInvStdDev, out, eps, saveMean, saveInvStdDev);
time1.Stop();
CudaTimer time2;
time2.Start();
engCudnn->Forward(inB, scaleB, biasB, expAvg, blendFactor, runMeanB, runInvStdDevB, outB, eps, saveMeanB, saveInvStdDevB);
engCudnn->Forward(inB, scaleB, biasB, inferenceOnly, expAvg, blendFactor, runMeanB, runInvStdDevB, outB, eps, saveMeanB, saveInvStdDevB);
time2.Stop();
std::stringstream tmsg;
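
As the blendFactor comment above indicates, only the two endpoints are exercised here. The common interpretation of intermediate values is a linear blend of running and minibatch statistics; a sketch of that interpretation (an assumption about the semantics, not taken from the engine code):

// Sketch: blendFactor = 0 -> minibatch statistics (training),
//         blendFactor = 1 -> running statistics (evaluation),
//         values in between interpolate linearly.
inline double BlendedStatistic(double runningValue, double minibatchValue, double blendFactor)
{
    return blendFactor * runningValue + (1.0 - blendFactor) * minibatchValue;
}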

View file

@ -28,12 +28,12 @@ inline FunctionPtr ProjLayer(Variable wProj, Variable input, size_t hStride, siz
auto b = Parameter({ outFeatureMapCount }, (float)bValue, device);
auto sc = Parameter({ outFeatureMapCount }, (float)scValue, device);
auto m = Constant({ outFeatureMapCount }, 0.0f, device);
auto var = Constant({ outFeatureMapCount }, 0.0f, device);
auto v = Constant({ outFeatureMapCount }, 0.0f, device);
size_t numInputChannels = input.Shape()[input.Shape().NumAxes() - 1];
auto c = Convolution(wProj, input, { hStride, vStride, numInputChannels }, { true }, { false });
return BatchNormalization(c, sc, b, m, var, true /*spatial*/, (double)bnTimeConst);
return BatchNormalization(c, sc, b, m, v, true /*spatial*/, (double)bnTimeConst);
}
inline FunctionPtr ResNetNode2(Variable input, size_t outFeatureMapCount, size_t kernelWidth, size_t kernelHeight, double wScale, double bValue, double scValue, size_t bnTimeConst, const DeviceDescriptor& device)
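
Independent of engine and API, the BatchNormalization calls above all reduce to the same per-channel computation; a plain reference sketch (epsilon placement as in the standard formulation):

// Reference sketch of the batch-normalization output for a single value x of a
// channel with statistics (mean, variance) and learned scale/bias parameters.
#include <cmath>

inline float BatchNormValue(float x, float mean, float variance, float scale, float bias, float eps = 1e-5f)
{
    return scale * (x - mean) / std::sqrt(variance + eps) + bias;
}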