new option initOutputRank to LearnableParameter to control how fanIn and fanOut for random init are determined

Frank Seide 2016-08-19 19:04:43 -07:00
Parent 832a685d29
Commit f24b4481ca
3 changed files with 52 additions and 22 deletions

View file

@@ -171,7 +171,7 @@ LayerNormalizationLayer {dim = BS.Constants.None, initScale = 1, initBias = 0} =
}.apply
# StabilizerLayer -- create a scalar stabilizer [J. Droppo, 2014 -- TODO: get the reference]
StabilizerLayer {} =
StabilizerLayer{} =
{
# BUGBUG: Calling f(x) twice will create a second set of parameters. Needs to refactor Stabilize() for this.
apply (x) = BS.Parameters.Stabilize (x)
@@ -180,7 +180,13 @@ StabilizerLayer {} =
# FeatureMVNLayer -- create a corpus-level feature-normalization layer
# This can only be applied to features. Statistics are not shared across invocations,
# which is semantically OK because the values are the same. However, it is not efficient.
FeatureMVNLayer {} = MeanVarNorm
FeatureMVNLayer{} = MeanVarNorm
# LogPriorLayer -- create a corpus-level label-prior layer
# This can only be applied to labels. Statistics are not shared across invocations,
# which is semantically OK because the values are the same. However, it is not efficient.
# TODO: document on Wiki
LogPriorLayer{} = LogPrior
# Layers that exist in other tools that we will not have:
# FlattenLayer{}: Not needed since DenseLayer() can handle tensors just fine.
@@ -285,7 +291,7 @@ CNTK2 = [
// TODO: The API for Parameter is different in current 2.0 design, getting a constant as input for the initial values.
// This needs to be fixed to follow the way the Constant() is exposed in Python
// Making this an internal node with "_" until we agree on the final interface:
_Parameter(shape, value = 0, initValue = '', learningRateMultiplier = 1.0, init = ''/*|uniform|fixedValue|gaussian|fromFile|fromLiteral*/, initValueScale = 1, initFromFilePath = '', initFromLiteral = '', initOnCPUOnly=true, randomSeed=-1, tag='') = new ComputationNode [ operation = 'LearnableParameter' ; shape = new TensorShape [ /*shape */ ] /*plus the function args*/ ]
_Parameter(shape, value = 0, initValue = '', learningRateMultiplier = 1.0, init = ''/*|uniform|fixedValue|gaussian|fromFile|fromLiteral*/, initValueScale = 1, initOutputRank = 1, initFromFilePath = '', initFromLiteral = '', initOnCPUOnly=true, randomSeed=-1, tag='') = new ComputationNode [ operation = 'LearnableParameter' ; shape = new TensorShape [ /*shape */ ] /*plus the function args*/ ]
// 3. Shape operations
// Changes: NewReshape -> Reshape, input -> _, dims -> shape
@@ -395,12 +401,12 @@ CNTK2 = [
# - initFromLiteral="..." (deprecated) --> parse a string literal (obsolete with value=array form)
# - init="fixedValue", value from 'value'
# Warning: Current config will behave unexpectedly if the user mistypes 'initValue' as 'value' (which will be ignored, defaulting to "uniform" init)
Parameter {outputDim, inputDim, learningRateMultiplier = 1.0, init = ''/*|uniform|fixedValue|gaussian|fromFile|fromLiteral*/, initValueScale = 1, value = 0/*deprecated*/, initValue = '', initFromFilePath = '', initFromLiteral = ''/*deprecated*/, initOnCPUOnly=true, randomSeed=-1, tag=''} = new ComputationNode [ operation = 'LearnableParameter' ; shape = new TensorShape [ dims = (outputDim : inputDim) ] /*plus the function args*/ ]
Parameter {outputDim, inputDim, learningRateMultiplier = 1.0, init = ''/*|uniform|fixedValue|gaussian|fromFile|fromLiteral*/, initValueScale = 1, value = 0/*deprecated*/, initValue = '', initFromFilePath = '', initFromLiteral = ''/*deprecated*/, initOnCPUOnly=true, randomSeed=-1, tag=''} = new ComputationNode [ operation = 'LearnableParameter' ; initOutputRank = 1 ; shape = new TensorShape [ dims = (outputDim : inputDim) ] /*plus the function args*/ ]
LearnableParameter = Parameter // deprecated
# TODO: make Parameter take tensor dims?
ParameterTensor {dims, learningRateMultiplier = 1.0, init = ''/*|uniform|fixedValue|gaussian|fromFile|fromLiteral*/, initValueScale = 1, value = 0, initValue = '', initFromFilePath = '', initFromLiteral = '', initOnCPUOnly=true, randomSeed=-1, tag=''} = new ComputationNode [ operation = 'LearnableParameter' ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ]
ParameterTensor {dims, learningRateMultiplier = 1.0, init = ''/*|uniform|fixedValue|gaussian|fromFile|fromLiteral*/, initValueScale = 1, value = 0, initValue = '', initOutputRank = 1, initFromFilePath = '', initFromLiteral = '', initOnCPUOnly=true, randomSeed=-1, tag=''} = new ComputationNode [ operation = 'LearnableParameter' ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ]
ConstantFromString(literal, tag='') = ParameterTensor((0)/*dim, will be inferred*/, initFromLiteral = literal, learningRateMultiplier = 0.0)
# TODO: Deprecate ConstantFromString() in favor of Constant(array expression)
DynamicAxis(tag='') = new ComputationNode [ operation = 'DynamicAxis' ; /*plus the function args*/ ]
@@ -553,16 +559,22 @@ IntDiv(x, y) = new NumericFunction [ what = 'IntDiv' ; args = (x:y) ]
# macros from NDL book
##############################################################################
# deprecated--use LinearLayer{} and DenseLayer{} instead
BFF(in, rows, cols) = [ B = Parameter(rows, 1, initValue = 0) ; W = Parameter(rows, cols) ; z = W*in+B ]
SBFF(in, rows, cols) = [ Eh = Sigmoid(BFF(in, rows, cols).z) ]
# deprecated--use FeatureMVNLayer{} instead
MeanVarNorm(feat) = PerDimMeanVarNormalization(feat, Mean(feat), InvStdDev(feat))
# deprecated--use LogPriorLayer{} instead
LogPrior(labels) = Log(Mean(labels))
# specify one of these two for initialization:
# - init = "uniform"|"gaussian"
# - embeddingFile = PATHNAME
# deprecated--use EmbeddingLayer{} instead
Embedding (embeddingDim, input, inputDim=input.dim, initFrom=''/*|fromFile|gaussian|uniform*/, embeddingPath = '', sparseInput = false, learningRateWeight = 0.0) = [
embedding = Transpose (LearnableParameter (inputDim, embeddingDim, learningRateMultiplier = learningRateWeight, init = initFrom, initFromFilePath = embeddingPath))
embedding = Transpose (Parameter (inputDim, embeddingDim, learningRateMultiplier = learningRateWeight, init = initFrom, initFromFilePath = embeddingPath))
lookup = if sparseInput then embedding * input
else GatherPacked (input, embedding)
].lookup
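
For intuition, the two branches of 'lookup' above compute the same thing: multiplying the [embeddingDim x inputDim] embedding matrix by a one-hot input column selects one column, which is exactly a gather by index. A minimal C++ sketch of the gather view (illustrative types and function name, not CNTK code):

    #include <cstddef>
    #include <vector>

    // Embedding lookup as a gather: for each token id, copy the id-th embedding
    // vector from the table. Equivalent to multiplying the table by a one-hot
    // vector, but without materializing the one-hot. Illustrative only.
    std::vector<std::vector<float>> EmbedByGather(
        const std::vector<std::vector<float>>& table, // [inputDim][embeddingDim]
        const std::vector<size_t>& tokenIds)
    {
        std::vector<std::vector<float>> out;
        out.reserve(tokenIds.size());
        for (size_t id : tokenIds)
            out.push_back(table[id]);
        return out;
    }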

View file

@@ -41,6 +41,11 @@ static pair<bool/*uniform*/, double/*stddev or range*/> ParseRandomizationType(c
// - init="fixedValue", value from 'value' --deprecated in favor of just specifying initValue
// - init="fromFile", value from 'initFromFilePath' --deprecated in favor of just specifying 'initFromFilePath'
// - init="fromLiteral", value from 'initFromLiteral' --deprecated in favor of initValue=array expression
// Random initialization takes an additional optional parameter initOutputRank, default 1.
// All dimensions that are not amongst the first 'initOutputRank' are considered inputs.
// This is necessary e.g. for convolution.
// 'initOutputRank' can also be negative to denote output dims on the right, to cater to the needs
// of convolution kernels, where the output dimension is the right-most axis (initOutputRank=-1).
// The forms that infer the dimensions have different BrainScript names. TODO: need one for fromFile
// TODO: All forms that require specified dimensions but contain zeroes (to be updated by graph)
// will need to do deferred initialization, or have a way to repeat it.
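
To restate the rule in the comment above as a self-contained sketch: the dimensions not claimed by 'initOutputRank' are multiplied together to form fanIn, and the remaining elements form fanOut. This is a hypothetical helper for illustration, not code from this commit:

    #include <cstdlib>
    #include <stdexcept>
    #include <vector>

    // Hypothetical helper illustrating the initOutputRank rule described above.
    static void DetermineFanInFanOut(const std::vector<size_t>& dims, int initOutputRank,
                                     size_t& fanIn, size_t& fanOut)
    {
        if ((size_t)std::abs(initOutputRank) > dims.size())
            throw std::invalid_argument("initOutputRank exceeds tensor rank");
        // initOutputRank >= 0: input dims are [initOutputRank, rank)
        // initOutputRank <  0: input dims are [0, rank - |initOutputRank|)
        size_t inBegin = initOutputRank >= 0 ? (size_t)initOutputRank : 0;
        size_t inEnd   = initOutputRank >= 0 ? dims.size() : dims.size() - (size_t)(-initOutputRank);
        size_t numElements = 1;
        for (size_t d : dims)
            numElements *= d;
        fanIn = 1;
        for (size_t k = inBegin; k < inEnd; k++)
            fanIn *= dims[k];
        fanOut = numElements / fanIn; // product of the output dimensions
    }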
@@ -91,7 +96,8 @@ LearnableParameter<ElemType>::LearnableParameter(const ScriptableObjects::IConfi
int forcedRandomSeed = configp->Get(L"randomSeed"); // forcing a specific random seed is useful for testing to get repeatable initialization independent of evaluation order
m_randomSeed = forcedRandomSeed < 0 ? randomSeed++ : (unsigned long)forcedRandomSeed;
m_initValueScale = configp->Get(L"initValueScale");
m_initOnCPUOnly = configp->Get(L"initOnCPUOnly");
m_initOutputRank = configp->Get(L"initOutputRank");
m_initOnCPUOnly = configp->Get(L"initOnCPUOnly");
}
else if (initString == L"zero")
{
@@ -155,6 +161,7 @@ void LearnableParameter<ElemType>::PostInitParameters(const wstring& initString,
m_initString = initString;
m_randomSeed = randomSeed;
m_initValueScale = initValue;
m_initOutputRank = 1; // default. NDL (deprecated) cannot specify a different value.
m_initOnCPUOnly = initOnCPUOnly;
}
else if (initString == L"fixedValue") // from constant value
@@ -200,23 +207,30 @@ template <class ElemType>
void LearnableParameter<ElemType>::InitRandom(const std::wstring& type,
const unsigned long randomSeed,
const ElemType initValueScale,
bool initOnCPUOnly)
const int initOutputRank,
const bool initOnCPUOnly)
{
// fprintf(stderr, "%d x %d: %d %ls\n", (int)GetNumRows(), (int)GetNumCols(), (int)randomSeed, NodeName().c_str());
let& sampleLayout = GetSampleLayout();
#if 1 // this more complex version is needed to repro test cases generated with an older version
auto& value = sampleLayout.GetRank() > 2 ? Value() : ValueAsMatrix();
#else
auto& value = Value();
#endif
let numElements = sampleLayout.GetNumElements();
if (numElements == 0)
return;
// We assume that the matrix row dimension is the output dimension. This is wrong in case of ND biases, convolution filters, and BatchNorm.
size_t fanIn = value.GetNumCols(); // fan-in
size_t fanOut = numElements / fanIn; // remaining dimensions
// determine fan-in and fan-out
// This is controlled by initOutputRank.
// For a normal matrix [I x J], fanOut = I, fanIn = J=inDim --> initOutputRank = +1
// For a convolution kernel [w x h x C x K], fanOut = K, fanIn = w*h*C. --> initOutputRank = -1, meaning count from back
if (abs(initOutputRank) > sampleLayout.GetRank())
InvalidArgument("InitRandom: initOutputRank=%d exceeds sampleLayout rank %d", initOutputRank, (int)sampleLayout.GetRank());
// fanIn is determined by multiplying a range of dimensions:
// - initOutputRank >= 0: [ initOutputRank, rank )
// - initOutputRank < 0: [ 0, rank-abs(initOutputRank) )
let inDimsBegin = (initOutputRank >= 0) ? (size_t)initOutputRank : 0;
let inDimsEnd = (initOutputRank >= 0) ? sampleLayout.GetRank() : (size_t)((int)sampleLayout.GetRank() + initOutputRank);
size_t fanIn = 1;
for (size_t k = inDimsBegin; k < inDimsEnd; k++)
fanIn *= sampleLayout[k];
let fanOut = numElements / fanIn; // remaining dimensions
let opts = ParseRandomizationType(type, fanOut, fanIn);
let isUniform = opts.first;
ElemType range = (ElemType)opts.second;
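
As a concrete check of the index arithmetic above, with illustrative dimensions (not taken from the commit):

    // Convolution kernel sampleLayout = [3 x 3 x 64 x 96], i.e. [w x h x C x K], rank 4,
    // initialized with initOutputRank = -1 (output maps K on the right-most axis):
    //   inDimsBegin = 0, inDimsEnd = 4 + (-1) = 3
    //   fanIn  = 3 * 3 * 64 = 576
    //   fanOut = numElements / fanIn = 55296 / 576 = 96
    // Ordinary weight matrix [512 x 1024] with the default initOutputRank = +1:
    //   inDimsBegin = 1, inDimsEnd = 2
    //   fanIn = 1024, fanOut = 512 -- matching the old behavior of taking fanIn
    //   from the number of matrix columns.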
@@ -224,14 +238,16 @@ void LearnableParameter<ElemType>::InitRandom(const std::wstring& type,
LogicError("InitRandom: Invalid initialization type '%ls'", type.c_str());
// the random seed offset is set via the "randomSeedOffset" parameter in config
fprintf(stderr, "%ls: Initializing Parameter[%s] <- %ls(seed=%d, range=%f*%f, onCPU=%s).\n", NodeDescription().c_str(), string(GetSampleLayout()).c_str(), m_initString.c_str(), (int)m_randomSeed, range, m_initValueScale, m_initOnCPUOnly ? "true" : "false");
fprintf(stderr, "%ls: Initializing Parameter[%s] <- %ls(seed=%d, init dims=[%d x %d], range=%f*%f, onCPU=%s).\n",
NodeDescription().c_str(), string(GetSampleLayout()).c_str(), m_initString.c_str(),
(int)m_randomSeed, (int)fanOut, (int)fanIn, range, m_initValueScale, m_initOnCPUOnly ? "true" : "false");
range *= initValueScale;
if (initOnCPUOnly)
Value().TransferToDeviceIfNotThere(CPUDEVICE, true);
if (isUniform)
value.SetUniformRandomValue(-range, range, randomSeed);
Value().SetUniformRandomValue(-range, range, randomSeed);
else
value.SetGaussianRandomValue(0, range, randomSeed);
Value().SetGaussianRandomValue(0, range, randomSeed);
if (initOnCPUOnly)
Value().TransferToDeviceIfNotThere(m_deviceId, true);
}
@@ -365,6 +381,7 @@ template <class ElemType>
node->m_initString = m_initString;
node->m_randomSeed = m_randomSeed;
node->m_initValueScale = m_initValueScale;
node->m_initOutputRank = m_initOutputRank;
node->m_initOnCPUOnly = m_initOnCPUOnly;
node->m_initValue = m_initValue;
}
@@ -439,7 +456,7 @@ void LearnableParameter<ElemType>::LazyInitParameters()
}
else if (ParseRandomizationType(m_initString).second != 0)
{
InitRandom(m_initString, m_randomSeed, m_initValueScale, m_initOnCPUOnly);
InitRandom(m_initString, m_randomSeed, m_initValueScale, m_initOutputRank, m_initOnCPUOnly);
}
else
LogicError("LearnableParameter: Invalid value of m_initString '%ls' for deferred initialization for %ls.", m_initString.c_str(), NodeDescription().c_str());

Просмотреть файл

@@ -62,7 +62,7 @@ public:
private:
// initialize with random numbers
// If 'initOnCPUOnly' then always init on CPU, making initialization consistent across CPU and GPU (for testing).
void InitRandom(const std::wstring& type, const unsigned long randomSeed, const ElemType initValueScale, bool initOnCPUOnly);
void InitRandom(const std::wstring& type, const unsigned long randomSeed, const ElemType initValueScale, const int initOutputRank, const bool initOnCPUOnly);
// helper to initialize from a matrix read from a text file or a string literal
void InitFromArray(const std::vector<ElemType>& array, size_t numRows, size_t numCols);
@@ -103,6 +103,7 @@ private:
std::wstring m_initString; // if non-empty then deferred initialization is needed. Gets cleared upon completion of deferred init.
unsigned long m_randomSeed;
ElemType m_initValueScale;
int m_initOutputRank;
bool m_initOnCPUOnly;
ElemType m_initValue;
};