new option initOutputRank for LearnableParameter, to allow controlling how fanIn and fanOut for random init are determined
This commit is contained in:
Parent 832a685d29
Commit f24b4481ca
@@ -171,7 +171,7 @@ LayerNormalizationLayer {dim = BS.Constants.None, initScale = 1, initBias = 0} =
 }.apply
 
 # StabilizerLayer -- create a scalar stabilizer [J. Droppo, 2014 -- TODO: get the reference]
-StabilizerLayer {} =
+StabilizerLayer{} =
 {
     # BUGBUG: Calling f(x) twice will create a second set of parameters. Needs to refactor Stabilize() for this.
     apply (x) = BS.Parameters.Stabilize (x)
@@ -180,7 +180,13 @@ StabilizerLayer {} =
 # FeatureMVNLayer -- create a corpus-level feature-normalization layer
 # This can only be applied to features. Statistics are not shared across invocations,
 # which is semantically OK because the values are the same. However, it is not efficient.
-FeatureMVNLayer {} = MeanVarNorm
+FeatureMVNLayer{} = MeanVarNorm
 
+# LogPriorLayer -- create a corpus-level label-prior layer
+# This can only be applied to labels. Statistics are not shared across invocations,
+# which is semantically OK because the values are the same. However, it is not efficient.
+# TODO: document on Wiki
+LogPriorLayer{} = LogPrior
+
 # Layers that exist in other tools that we will not have:
 # FlattenLayer{}: Not needed since DenseLayer() can handle tensors just fine.
@@ -285,7 +291,7 @@ CNTK2 = [
     // TODO: The API for Parameter is different in current 2.0 design, getting a constant as input for the initial values.
     // This needs to be fixed to follow the way the Constant() is exposed in Python
     // Making this an internal node with "_" until we agree on the final interface:
-    _Parameter(shape, value = 0, initValue = '', learningRateMultiplier = 1.0, init = ''/*|uniform|fixedValue|gaussian|fromFile|fromLiteral*/, initValueScale = 1, initFromFilePath = '', initFromLiteral = '', initOnCPUOnly=true, randomSeed=-1, tag='') = new ComputationNode [ operation = 'LearnableParameter' ; shape = new TensorShape [ /*shape */ ] /*plus the function args*/ ]
+    _Parameter(shape, value = 0, initValue = '', learningRateMultiplier = 1.0, init = ''/*|uniform|fixedValue|gaussian|fromFile|fromLiteral*/, initValueScale = 1, initOutputRank = 1, initFromFilePath = '', initFromLiteral = '', initOnCPUOnly=true, randomSeed=-1, tag='') = new ComputationNode [ operation = 'LearnableParameter' ; shape = new TensorShape [ /*shape */ ] /*plus the function args*/ ]
 
     // 3. Shape operations
     // Changes: NewReshape -> Reshape, input -> _, dims -> shape
@@ -395,12 +401,12 @@ CNTK2 = [
 # - initFromLiteral="..." (deprecated) --> parse a string literal (obsolete with value=array form)
 # - init="fixedValue", value from 'value'
 # Warning: Current config will behave unexpected if user mistypes 'initValue' as 'value' (which will be ignored, defaulting to "uniform" init)
-Parameter {outputDim, inputDim, learningRateMultiplier = 1.0, init = ''/*|uniform|fixedValue|gaussian|fromFile|fromLiteral*/, initValueScale = 1, value = 0/*deprecated*/, initValue = '', initFromFilePath = '', initFromLiteral = ''/*deprecated*/, initOnCPUOnly=true, randomSeed=-1, tag=''} = new ComputationNode [ operation = 'LearnableParameter' ; shape = new TensorShape [ dims = (outputDim : inputDim) ] /*plus the function args*/ ]
+Parameter {outputDim, inputDim, learningRateMultiplier = 1.0, init = ''/*|uniform|fixedValue|gaussian|fromFile|fromLiteral*/, initValueScale = 1, value = 0/*deprecated*/, initValue = '', initFromFilePath = '', initFromLiteral = ''/*deprecated*/, initOnCPUOnly=true, randomSeed=-1, tag=''} = new ComputationNode [ operation = 'LearnableParameter' ; initOutputRank = 1 ; shape = new TensorShape [ dims = (outputDim : inputDim) ] /*plus the function args*/ ]
 
 LearnableParameter = Parameter // deprecated
 
 # TODO: make Parameter take tensor dims?
-ParameterTensor {dims, learningRateMultiplier = 1.0, init = ''/*|uniform|fixedValue|gaussian|fromFile|fromLiteral*/, initValueScale = 1, value = 0, initValue = '', initFromFilePath = '', initFromLiteral = '', initOnCPUOnly=true, randomSeed=-1, tag=''} = new ComputationNode [ operation = 'LearnableParameter' ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ]
+ParameterTensor {dims, learningRateMultiplier = 1.0, init = ''/*|uniform|fixedValue|gaussian|fromFile|fromLiteral*/, initValueScale = 1, value = 0, initValue = '', initOutputRank = 1, initFromFilePath = '', initFromLiteral = '', initOnCPUOnly=true, randomSeed=-1, tag=''} = new ComputationNode [ operation = 'LearnableParameter' ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ]
 ConstantFromString(literal, tag='') = ParameterTensor((0)/*dim, will be inferred*/, initFromLiteral = literal, learningRateMultiplier = 0.0)
 # TODO: Deprecate ConstantFromString() in favor of Constant(array expression)
 DynamicAxis(tag='') = new ComputationNode [ operation = 'DynamicAxis' ; /*plus the function args*/ ]
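The BrainScript surface change above is just the new optional initOutputRank argument (default 1) threaded through to the LearnableParameter node. To make the rule it controls concrete, here is a minimal standalone C++ sketch of the fan-in/fan-out split; computeFans and the plain std::vector shape are illustrative stand-ins for the member logic in the C++ hunks below, not CNTK code:

#include <cassert>
#include <cstdlib>
#include <vector>

// Illustrative sketch of the initOutputRank rule: with a positive value, the
// first 'initOutputRank' axes of the sample layout are the outputs; with a
// negative value, the last 'abs(initOutputRank)' axes are. The product of all
// remaining axes is the fan-in.
static void computeFans(const std::vector<size_t>& shape, int initOutputRank,
                        size_t& fanIn, size_t& fanOut)
{
    const size_t rank = shape.size();
    assert((size_t)std::abs(initOutputRank) <= rank); // the real code raises InvalidArgument here

    // fan-in axes: [initOutputRank, rank) for >= 0, [0, rank - abs(initOutputRank)) for < 0
    const size_t inBegin = initOutputRank >= 0 ? (size_t)initOutputRank : 0;
    const size_t inEnd   = initOutputRank >= 0 ? rank : (size_t)((int)rank + initOutputRank);

    size_t numElements = 1;
    for (size_t k = 0; k < rank; k++)
        numElements *= shape[k];

    fanIn = 1;
    for (size_t k = inBegin; k < inEnd; k++)
        fanIn *= shape[k];
    fanOut = numElements / fanIn; // product of the output axes
}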
@@ -553,16 +559,22 @@ IntDiv(x, y) = new NumericFunction [ what = 'IntDiv' ; args = (x:y) ]
 # macros from NDL book
 ##############################################################################
 
+# deprecated--use LinearLayer{} and DenseLayer{} instead
 BFF(in, rows, cols) = [ B = Parameter(rows, 1, initValue = 0) ; W = Parameter(rows, cols) ; z = W*in+B ]
 SBFF(in, rows, cols) = [ Eh = Sigmoid(BFF(in, rows, cols).z) ]
 
+# deprecated--use FeatureMVNLayer{} instead
 MeanVarNorm(feat) = PerDimMeanVarNormalization(feat, Mean(feat), InvStdDev(feat))
 
+# deprecated--use LogPriorLayer{} instead
 LogPrior(labels) = Log(Mean(labels))
 
 # specify one of these two for initialization:
 # - init = "uniform"|"gaussian"
 # - embeddingFile = PATHNAME
+# deprecated--use EmbeddingLayer{} instead
 Embedding (embeddingDim, input, inputDim=input.dim, initFrom=''/*|fromFile|gaussian|uniform*/, embeddingPath = '', sparseInput = false, learningRateWeight = 0.0) = [
-    embedding = Transpose (LearnableParameter (inputDim, embeddingDim, learningRateMultiplier = learningRateWeight, init = initFrom, initFromFilePath = embeddingPath))
+    embedding = Transpose (Parameter (inputDim, embeddingDim, learningRateMultiplier = learningRateWeight, init = initFrom, initFromFilePath = embeddingPath))
     lookup = if sparseInput then embedding * input
              else GatherPacked (input, embedding)
 ].lookup

@@ -41,6 +41,11 @@ static pair<bool/*uniform*/, double/*stddev or range*/> ParseRandomizationType(c
 // - init="fixedValue", value from 'value' --deprecated in favor of just specifying initValue
 // - init="fromFile", value from 'initFromFilePath' --deprecated in favor of just specifying 'initFromFilePath'
 // - init="fromLiteral", value from 'initFromLiteral' --deprecated in favor of initValue=array expression
+// Random initialization takes an additional optional parameter initOutputRank, default 1.
+// All dimensions that are not amongst the first 'initOutputRank' are considered inputs.
+// This is necessary e.g. for convolution.
+// 'initOutputRank' can also be negative to denote output dims on the right, to cater to the needs
+// of convolution kernels where the output rank is the right-most axis (initOutputRank=-1).
 // The forms that infer the dimensions have different BrainScript names. TODO: need one for fromFile
 // TODO: All forms that require specified dimensions but contain zeroes (to be updated by graph)
 // will need to do deferred initialization, or have a way to repeat it.
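The two cases named in these comments can be checked against the illustrative computeFans sketch introduced after the BrainScript hunks above (again a sketch, not CNTK code):

#include <cassert>
// ... computeFans as sketched earlier ...

int main()
{
    size_t fanIn = 0, fanOut = 0;

    // Dense weight [I x J] = [512 x 256] with initOutputRank = +1:
    // the first axis is the output, so fanOut = I and fanIn = J, the old behavior.
    computeFans({512, 256}, +1, fanIn, fanOut);
    assert(fanOut == 512 && fanIn == 256);

    // Convolution kernel [w x h x C x K] = [5 x 5 x 3 x 64] with initOutputRank = -1:
    // the right-most axis is the output, so fanOut = K = 64 and fanIn = w*h*C = 75.
    computeFans({5, 5, 3, 64}, -1, fanIn, fanOut);
    assert(fanOut == 64 && fanIn == 75);
    return 0;
}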
@@ -91,7 +96,8 @@ LearnableParameter<ElemType>::LearnableParameter(const ScriptableObjects::IConfi
         int forcedRandomSeed = configp->Get(L"randomSeed"); // forcing a specific random seed is useful for testing to get repeatable initialization independent of evaluation order
         m_randomSeed = forcedRandomSeed < 0 ? randomSeed++ : (unsigned long)forcedRandomSeed;
         m_initValueScale = configp->Get(L"initValueScale");
-        m_initOnCPUOnly = configp->Get(L"initOnCPUOnly");
+        m_initOutputRank = configp->Get(L"initOutputRank");
+        m_initOnCPUOnly = configp->Get(L"initOnCPUOnly");
     }
     else if (initString == L"zero")
     {
@@ -155,6 +161,7 @@ void LearnableParameter<ElemType>::PostInitParameters(const wstring& initString,
         m_initString = initString;
         m_randomSeed = randomSeed;
         m_initValueScale = initValue;
+        m_initOutputRank = 1; // default. NDL (deprecated) cannot specify a different value.
         m_initOnCPUOnly = initOnCPUOnly;
     }
     else if (initString == L"fixedValue") // from constant value
@@ -200,23 +207,30 @@ template <class ElemType>
 void LearnableParameter<ElemType>::InitRandom(const std::wstring& type,
                                               const unsigned long randomSeed,
                                               const ElemType initValueScale,
-                                              bool initOnCPUOnly)
+                                              const int initOutputRank,
+                                              const bool initOnCPUOnly)
 {
     // fprintf(stderr, "%d x %d: %d %ls\n", (int)GetNumRows(), (int)GetNumCols(), (int)randomSeed, NodeName().c_str());
 
     let& sampleLayout = GetSampleLayout();
 #if 1 // this more complex version is needed to repro test cases generated with an older version
     auto& value = sampleLayout.GetRank() > 2 ? Value() : ValueAsMatrix();
 #else
     auto& value = Value();
 #endif
 
     let numElements = sampleLayout.GetNumElements();
     if (numElements == 0)
         return;
-    // We assume that the matrix row dimension is the output dimension. This is wrong in case of ND biases, convolution filters, and BatchNorm.
-    size_t fanIn = value.GetNumCols();   // fan-in
-    size_t fanOut = numElements / fanIn; // remaining dimensions
+    // determine fan-in and fan-out
+    // This is controlled by initOutputRank.
+    // For a normal matrix [I x J], fanOut = I, fanIn = J=inDim --> initOutputRank = +1
+    // For a convolution kernel [w x h x C x K], fanOut = K, fanIn = w*h*C. --> initOutputRank = -1, meaning count from back
+    if (abs(initOutputRank) > sampleLayout.GetRank())
+        InvalidArgument("InitRandom: initOutputRank=%d exceeds sampleLayout rank %d", initOutputRank, (int)sampleLayout.GetRank());
+    // fanIn is determined by multiplying a range of dimensions:
+    //  - initOutputRank >= 0: [ initOutputRank, rank )
+    //  - initOutputRank <  0: [ 0, rank - abs(initOutputRank) )
+    let inDimsBegin = (initOutputRank >= 0) ? (size_t)initOutputRank : 0;
+    let inDimsEnd   = (initOutputRank >= 0) ? sampleLayout.GetRank() : (size_t)((int)sampleLayout.GetRank() + initOutputRank);
+    size_t fanIn = 1;
+    for (size_t k = inDimsBegin; k < inDimsEnd; k++)
+        fanIn *= sampleLayout[k];
+    let fanOut = numElements / fanIn; // remaining dimensions
     let opts = ParseRandomizationType(type, fanOut, fanIn);
     let isUniform = opts.first;
     ElemType range = (ElemType)opts.second;
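Two edge cases fall out of this loop directly, and they cover the "ND biases" named in the removed comment: an empty fan-in range degenerates to a product of 1, and initOutputRank = 0 makes every axis a fan-in axis. Checked with the same illustrative computeFans sketch from earlier (not CNTK code):

// ... computeFans as sketched earlier ...
int main()
{
    size_t fanIn = 0, fanOut = 0;

    // ND bias [I] = [128] with the default initOutputRank = 1: the fan-in
    // range [1, 1) is empty, so fanIn = 1 and fanOut = 128.
    computeFans({128}, 1, fanIn, fanOut);
    assert(fanIn == 1 && fanOut == 128);

    // initOutputRank = 0: the fan-in range [0, rank) covers every axis, so
    // every dimension counts as an input and fanOut = 1.
    computeFans({5, 5, 3, 64}, 0, fanIn, fanOut);
    assert(fanIn == 5 * 5 * 3 * 64 && fanOut == 1);
    return 0;
}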
@@ -224,14 +238,16 @@ void LearnableParameter<ElemType>::InitRandom(const std::wstring& type,
         LogicError("InitRandom: Invalid initialization type '%ls'", type.c_str());
 
     // the random seed offset is set via the "randomSeedOffset" parameter in config
-    fprintf(stderr, "%ls: Initializing Parameter[%s] <- %ls(seed=%d, range=%f*%f, onCPU=%s).\n", NodeDescription().c_str(), string(GetSampleLayout()).c_str(), m_initString.c_str(), (int)m_randomSeed, range, m_initValueScale, m_initOnCPUOnly ? "true" : "false");
+    fprintf(stderr, "%ls: Initializing Parameter[%s] <- %ls(seed=%d, init dims=[%d x %d], range=%f*%f, onCPU=%s).\n",
+            NodeDescription().c_str(), string(GetSampleLayout()).c_str(), m_initString.c_str(),
+            (int)m_randomSeed, (int)fanOut, (int)fanIn, range, m_initValueScale, m_initOnCPUOnly ? "true" : "false");
     range *= initValueScale;
     if (initOnCPUOnly)
         Value().TransferToDeviceIfNotThere(CPUDEVICE, true);
     if (isUniform)
-        value.SetUniformRandomValue(-range, range, randomSeed);
+        Value().SetUniformRandomValue(-range, range, randomSeed);
     else
-        value.SetGaussianRandomValue(0, range, randomSeed);
+        Value().SetGaussianRandomValue(0, range, randomSeed);
     if (initOnCPUOnly)
         Value().TransferToDeviceIfNotThere(m_deviceId, true);
 }
@@ -365,6 +381,7 @@ template <class ElemType>
         node->m_initString = m_initString;
         node->m_randomSeed = m_randomSeed;
         node->m_initValueScale = m_initValueScale;
+        node->m_initOutputRank = m_initOutputRank;
         node->m_initOnCPUOnly = m_initOnCPUOnly;
         node->m_initValue = m_initValue;
     }
@@ -439,7 +456,7 @@ void LearnableParameter<ElemType>::LazyInitParameters()
     }
     else if (ParseRandomizationType(m_initString).second != 0)
     {
-        InitRandom(m_initString, m_randomSeed, m_initValueScale, m_initOnCPUOnly);
+        InitRandom(m_initString, m_randomSeed, m_initValueScale, m_initOutputRank, m_initOnCPUOnly);
     }
     else
         LogicError("LearnableParameter: Invalid value of m_initString '%ls' for deferred initialization for %ls.", m_initString.c_str(), NodeDescription().c_str());

@@ -62,7 +62,7 @@ public:
 private:
     // initialize with random numbers
     // If 'initOnCPUOnly' then always init on CPU, making initialization consistent across both (for testing).
-    void InitRandom(const std::wstring& type, const unsigned long randomSeed, const ElemType initValueScale, bool initOnCPUOnly);
+    void InitRandom(const std::wstring& type, const unsigned long randomSeed, const ElemType initValueScale, const int initOutputRank, const bool initOnCPUOnly);
 
     // helper to initialize from a matrix read from a text file or a string literal
     void InitFromArray(const std::vector<ElemType>& array, size_t numRows, size_t numCols);
@@ -103,6 +103,7 @@ private:
     std::wstring m_initString; // if non-empty then deferred initialization is needed. Gets cleared upon completion of deferred init.
     unsigned long m_randomSeed;
     ElemType m_initValueScale;
+    int m_initOutputRank;
     bool m_initOnCPUOnly;
     ElemType m_initValue;
 };