new option initOutputRank for LearnableParameter, to allow controlling how fanIn and fanOut for random init are determined
This commit is contained in:
Parent 832a685d29
Commit f24b4481ca
@@ -171,7 +171,7 @@ LayerNormalizationLayer {dim = BS.Constants.None, initScale = 1, initBias = 0} =
 }.apply
 
 # StabilizerLayer -- create a scalar stabilizer [J. Droppo, 2014 -- TODO: get the reference]
-StabilizerLayer {} =
+StabilizerLayer{} =
 {
     # BUGBUG: Calling f(x) twice will create a second set of parameters. Needs to refactor Stabilize() for this.
     apply (x) = BS.Parameters.Stabilize (x)
@@ -180,7 +180,13 @@ StabilizerLayer {} =
 # FeatureMVNLayer -- create a corpus-level feature-normalization layer
 # This can only be applied to features. Statistics are not shared across invocations,
 # which is semantically OK because the values are the same. However, it is not efficient.
-FeatureMVNLayer {} = MeanVarNorm
+FeatureMVNLayer{} = MeanVarNorm
 
+# LogPriorLayer -- create a corpus-level label-prior layer
+# This can only be applied to labels. Statistics are not shared across invocations,
+# which is semantically OK because the values are the same. However, it is not efficient.
+# TODO: document on Wiki
+LogPriorLayer{} = LogPrior
+
 # Layers that exist in other tools that we will not have:
 # FlattenLayer{}: Not needed since DenseLayer() can handle tensors just fine.
@@ -285,7 +291,7 @@ CNTK2 = [
     // TODO: The API for Parameter is different in current 2.0 design, getting a constant as input for the initial values.
     // This needs to be fixed to follow the way the Constant() is exposed in Python
     // Making this an internal node with "_" until we agree on the final interface:
-    _Parameter(shape, value = 0, initValue = '', learningRateMultiplier = 1.0, init = ''/*|uniform|fixedValue|gaussian|fromFile|fromLiteral*/, initValueScale = 1, initFromFilePath = '', initFromLiteral = '', initOnCPUOnly=true, randomSeed=-1, tag='') = new ComputationNode [ operation = 'LearnableParameter' ; shape = new TensorShape [ /*shape */ ] /*plus the function args*/ ]
+    _Parameter(shape, value = 0, initValue = '', learningRateMultiplier = 1.0, init = ''/*|uniform|fixedValue|gaussian|fromFile|fromLiteral*/, initValueScale = 1, initOutputRank = 1, initFromFilePath = '', initFromLiteral = '', initOnCPUOnly=true, randomSeed=-1, tag='') = new ComputationNode [ operation = 'LearnableParameter' ; shape = new TensorShape [ /*shape */ ] /*plus the function args*/ ]
 
     // 3. Shape operations
     // Changes: NewReshape -> Reshape, input -> _, dims -> shape
@@ -395,12 +401,12 @@ CNTK2 = [
 # - initFromLiteral="..." (deprecated) --> parse a string literal (obsolete with value=array form)
 # - init="fixedValue", value from 'value'
 # Warning: Current config will behave unexpected if user mistypes 'initValue' as 'value' (which will be ignored, defaulting to "uniform" init)
-Parameter {outputDim, inputDim, learningRateMultiplier = 1.0, init = ''/*|uniform|fixedValue|gaussian|fromFile|fromLiteral*/, initValueScale = 1, value = 0/*deprecated*/, initValue = '', initFromFilePath = '', initFromLiteral = ''/*deprecated*/, initOnCPUOnly=true, randomSeed=-1, tag=''} = new ComputationNode [ operation = 'LearnableParameter' ; shape = new TensorShape [ dims = (outputDim : inputDim) ] /*plus the function args*/ ]
+Parameter {outputDim, inputDim, learningRateMultiplier = 1.0, init = ''/*|uniform|fixedValue|gaussian|fromFile|fromLiteral*/, initValueScale = 1, value = 0/*deprecated*/, initValue = '', initFromFilePath = '', initFromLiteral = ''/*deprecated*/, initOnCPUOnly=true, randomSeed=-1, tag=''} = new ComputationNode [ operation = 'LearnableParameter' ; initOutputRank = 1 ; shape = new TensorShape [ dims = (outputDim : inputDim) ] /*plus the function args*/ ]
 
 LearnableParameter = Parameter // deprecated
 
 # TODO: make Parameter take tensor dims?
-ParameterTensor {dims, learningRateMultiplier = 1.0, init = ''/*|uniform|fixedValue|gaussian|fromFile|fromLiteral*/, initValueScale = 1, value = 0, initValue = '', initFromFilePath = '', initFromLiteral = '', initOnCPUOnly=true, randomSeed=-1, tag=''} = new ComputationNode [ operation = 'LearnableParameter' ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ]
+ParameterTensor {dims, learningRateMultiplier = 1.0, init = ''/*|uniform|fixedValue|gaussian|fromFile|fromLiteral*/, initValueScale = 1, value = 0, initValue = '', initOutputRank = 1, initFromFilePath = '', initFromLiteral = '', initOnCPUOnly=true, randomSeed=-1, tag=''} = new ComputationNode [ operation = 'LearnableParameter' ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ]
 ConstantFromString(literal, tag='') = ParameterTensor((0)/*dim, will be inferred*/, initFromLiteral = literal, learningRateMultiplier = 0.0)
 # TODO: Deprecate ConstantFromString() in favor of Constant(array expression)
 DynamicAxis(tag='') = new ComputationNode [ operation = 'DynamicAxis' ; /*plus the function args*/ ]
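The BrainScript surface change above is just the new optional initOutputRank argument (default 1) threaded through to the LearnableParameter node. To make the rule it controls concrete, here is a minimal standalone C++ sketch of the fan-in/fan-out split; computeFans and the plain std::vector shape are illustrative stand-ins for the member logic in the C++ hunks below, not CNTK code:

#include <cassert>
#include <cstdlib>
#include <vector>

// Illustrative sketch of the initOutputRank rule: with a positive value, the
// first 'initOutputRank' axes of the sample layout are the outputs; with a
// negative value, the last 'abs(initOutputRank)' axes are. The product of all
// remaining axes is the fan-in.
static void computeFans(const std::vector<size_t>& shape, int initOutputRank,
                        size_t& fanIn, size_t& fanOut)
{
    const size_t rank = shape.size();
    assert((size_t)std::abs(initOutputRank) <= rank); // the real code raises InvalidArgument here

    // fan-in axes: [initOutputRank, rank) for >= 0, [0, rank - abs(initOutputRank)) for < 0
    const size_t inBegin = initOutputRank >= 0 ? (size_t)initOutputRank : 0;
    const size_t inEnd   = initOutputRank >= 0 ? rank : (size_t)((int)rank + initOutputRank);

    size_t numElements = 1;
    for (size_t k = 0; k < rank; k++)
        numElements *= shape[k];

    fanIn = 1;
    for (size_t k = inBegin; k < inEnd; k++)
        fanIn *= shape[k];
    fanOut = numElements / fanIn; // product of the output axes
}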
@@ -553,16 +559,22 @@ IntDiv(x, y) = new NumericFunction [ what = 'IntDiv' ; args = (x:y) ]
 # macros from NDL book
 ##############################################################################
 
+# deprecated--use LinearLayer{} and DenseLayer{} instead
 BFF(in, rows, cols) = [ B = Parameter(rows, 1, initValue = 0) ; W = Parameter(rows, cols) ; z = W*in+B ]
 SBFF(in, rows, cols) = [ Eh = Sigmoid(BFF(in, rows, cols).z) ]
 
+# deprecated--use FeatureMVNLayer{} instead
 MeanVarNorm(feat) = PerDimMeanVarNormalization(feat, Mean(feat), InvStdDev(feat))
 
+# deprecated--use LogPriorLayer{} instead
 LogPrior(labels) = Log(Mean(labels))
 
 # specify one of these two for initialization:
 # - init = "uniform"|"gaussian"
 # - embeddingFile = PATHNAME
+# deprecated--use EmbeddingLayer{} instead
 Embedding (embeddingDim, input, inputDim=input.dim, initFrom=''/*|fromFile|gaussian|uniform*/, embeddingPath = '', sparseInput = false, learningRateWeight = 0.0) = [
-    embedding = Transpose (LearnableParameter (inputDim, embeddingDim, learningRateMultiplier = learningRateWeight, init = initFrom, initFromFilePath = embeddingPath))
+    embedding = Transpose (Parameter (inputDim, embeddingDim, learningRateMultiplier = learningRateWeight, init = initFrom, initFromFilePath = embeddingPath))
     lookup = if sparseInput then embedding * input
              else GatherPacked (input, embedding)
 ].lookup

@@ -41,6 +41,11 @@ static pair<bool/*uniform*/, double/*stddev or range*/> ParseRandomizationType(c
 // - init="fixedValue", value from 'value' --deprecated in favor of just specifying initValue
 // - init="fromFile", value from 'initFromFilePath' --deprecated in favor of just specifying 'initFromFilePath'
 // - init="fromLiteral", value from 'initFromLiteral' --deprecated in favor of initValue=array expression
+// Random initialization takes an additional optional parameter initOutputRank, default 1.
+// All dimensions that are not amongst the first 'initOutputRank' are considered inputs.
+// This is necessary e.g. for convolution.
+// 'initOutputRank' can also be negative to denote output dims on the right, to cater to the needs
+// of convolution kernels where the output rank is the right-most axis (initOutputRank=-1).
 // The forms that infer the dimensions have different BrainScript names. TODO: need one for fromFile
 // TODO: All forms that require specified dimensions but contain zeroes (to be updated by graph)
 // will need to do deferred initialization, or have a way to repeat it.
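The two cases named in these comments can be checked against the illustrative computeFans sketch introduced after the BrainScript hunks above (again a sketch, not CNTK code):

#include <cassert>
// ... computeFans as sketched earlier ...

int main()
{
    size_t fanIn = 0, fanOut = 0;

    // Dense weight [I x J] = [512 x 256] with initOutputRank = +1:
    // the first axis is the output, so fanOut = I and fanIn = J, the old behavior.
    computeFans({512, 256}, +1, fanIn, fanOut);
    assert(fanOut == 512 && fanIn == 256);

    // Convolution kernel [w x h x C x K] = [5 x 5 x 3 x 64] with initOutputRank = -1:
    // the right-most axis is the output, so fanOut = K = 64 and fanIn = w*h*C = 75.
    computeFans({5, 5, 3, 64}, -1, fanIn, fanOut);
    assert(fanOut == 64 && fanIn == 75);
    return 0;
}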
@@ -91,7 +96,8 @@ LearnableParameter<ElemType>::LearnableParameter(const ScriptableObjects::IConfi
         int forcedRandomSeed = configp->Get(L"randomSeed"); // forcing a specific random seed is useful for testing to get repeatable initialization independent of evaluation order
         m_randomSeed = forcedRandomSeed < 0 ? randomSeed++ : (unsigned long)forcedRandomSeed;
         m_initValueScale = configp->Get(L"initValueScale");
-        m_initOnCPUOnly = configp->Get(L"initOnCPUOnly");
+        m_initOutputRank = configp->Get(L"initOutputRank");
+        m_initOnCPUOnly = configp->Get(L"initOnCPUOnly");
     }
     else if (initString == L"zero")
     {
@@ -155,6 +161,7 @@ void LearnableParameter<ElemType>::PostInitParameters(const wstring& initString,
         m_initString = initString;
         m_randomSeed = randomSeed;
         m_initValueScale = initValue;
+        m_initOutputRank = 1; // default. NDL (deprecated) cannot specify a different value.
         m_initOnCPUOnly = initOnCPUOnly;
     }
     else if (initString == L"fixedValue") // from constant value
@@ -200,23 +207,30 @@ template <class ElemType>
 void LearnableParameter<ElemType>::InitRandom(const std::wstring& type,
                                               const unsigned long randomSeed,
                                               const ElemType initValueScale,
-                                              bool initOnCPUOnly)
+                                              const int initOutputRank,
+                                              const bool initOnCPUOnly)
 {
     // fprintf(stderr, "%d x %d: %d %ls\n", (int)GetNumRows(), (int)GetNumCols(), (int)randomSeed, NodeName().c_str());
 
     let& sampleLayout = GetSampleLayout();
 #if 1 // this more complex version is needed to repro test cases generated with an older version
     auto& value = sampleLayout.GetRank() > 2 ? Value() : ValueAsMatrix();
 #else
     auto& value = Value();
 #endif
 
     let numElements = sampleLayout.GetNumElements();
     if (numElements == 0)
         return;
-    // We assume that the matrix row dimension is the output dimension. This is wrong in case of ND biases, convolution filters, and BatchNorm.
-    size_t fanIn = value.GetNumCols();   // fan-in
-    size_t fanOut = numElements / fanIn; // remaining dimensions
+    // determine fan-in and fan-out
+    // This is controlled by initOutputRank.
+    // For a normal matrix [I x J], fanOut = I, fanIn = J=inDim --> initOutputRank = +1
+    // For a convolution kernel [w x h x C x K], fanOut = K, fanIn = w*h*C. --> initOutputRank = -1, meaning count from back
+    if (abs(initOutputRank) > sampleLayout.GetRank())
+        InvalidArgument("InitRandom: initOutputRank=%d exceeds sampleLayout rank %d", initOutputRank, (int)sampleLayout.GetRank());
+    // fanIn is determined by multiplying a range of dimensions:
+    //  - initOutputRank >= 0: [ initOutputRank, rank )
+    //  - initOutputRank <  0: [ 0, rank - abs(initOutputRank) )
+    let inDimsBegin = (initOutputRank >= 0) ? (size_t)initOutputRank : 0;
+    let inDimsEnd   = (initOutputRank >= 0) ? sampleLayout.GetRank() : (size_t)((int)sampleLayout.GetRank() + initOutputRank);
+    size_t fanIn = 1;
+    for (size_t k = inDimsBegin; k < inDimsEnd; k++)
+        fanIn *= sampleLayout[k];
+    let fanOut = numElements / fanIn; // remaining dimensions
     let opts = ParseRandomizationType(type, fanOut, fanIn);
     let isUniform = opts.first;
     ElemType range = (ElemType)opts.second;
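Two edge cases fall out of this loop directly, and they cover the "ND biases" named in the removed comment: an empty fan-in range degenerates to a product of 1, and initOutputRank = 0 makes every axis a fan-in axis. Checked with the same illustrative computeFans sketch from earlier (not CNTK code):

// ... computeFans as sketched earlier ...
int main()
{
    size_t fanIn = 0, fanOut = 0;

    // ND bias [I] = [128] with the default initOutputRank = 1: the fan-in
    // range [1, 1) is empty, so fanIn = 1 and fanOut = 128.
    computeFans({128}, 1, fanIn, fanOut);
    assert(fanIn == 1 && fanOut == 128);

    // initOutputRank = 0: the fan-in range [0, rank) covers every axis, so
    // every dimension counts as an input and fanOut = 1.
    computeFans({5, 5, 3, 64}, 0, fanIn, fanOut);
    assert(fanIn == 5 * 5 * 3 * 64 && fanOut == 1);
    return 0;
}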
@@ -224,14 +238,16 @@ void LearnableParameter<ElemType>::InitRandom(const std::wstring& type,
         LogicError("InitRandom: Invalid initialization type '%ls'", type.c_str());
 
     // the random seed offset is set via the "randomSeedOffset" parameter in config
-    fprintf(stderr, "%ls: Initializing Parameter[%s] <- %ls(seed=%d, range=%f*%f, onCPU=%s).\n", NodeDescription().c_str(), string(GetSampleLayout()).c_str(), m_initString.c_str(), (int)m_randomSeed, range, m_initValueScale, m_initOnCPUOnly ? "true" : "false");
+    fprintf(stderr, "%ls: Initializing Parameter[%s] <- %ls(seed=%d, init dims=[%d x %d], range=%f*%f, onCPU=%s).\n",
+            NodeDescription().c_str(), string(GetSampleLayout()).c_str(), m_initString.c_str(),
+            (int)m_randomSeed, (int)fanOut, (int)fanIn, range, m_initValueScale, m_initOnCPUOnly ? "true" : "false");
     range *= initValueScale;
     if (initOnCPUOnly)
         Value().TransferToDeviceIfNotThere(CPUDEVICE, true);
     if (isUniform)
-        value.SetUniformRandomValue(-range, range, randomSeed);
+        Value().SetUniformRandomValue(-range, range, randomSeed);
     else
-        value.SetGaussianRandomValue(0, range, randomSeed);
+        Value().SetGaussianRandomValue(0, range, randomSeed);
     if (initOnCPUOnly)
         Value().TransferToDeviceIfNotThere(m_deviceId, true);
 }
@@ -365,6 +381,7 @@ template <class ElemType>
         node->m_initString = m_initString;
         node->m_randomSeed = m_randomSeed;
         node->m_initValueScale = m_initValueScale;
+        node->m_initOutputRank = m_initOutputRank;
         node->m_initOnCPUOnly = m_initOnCPUOnly;
         node->m_initValue = m_initValue;
     }
@@ -439,7 +456,7 @@ void LearnableParameter<ElemType>::LazyInitParameters()
     }
     else if (ParseRandomizationType(m_initString).second != 0)
     {
-        InitRandom(m_initString, m_randomSeed, m_initValueScale, m_initOnCPUOnly);
+        InitRandom(m_initString, m_randomSeed, m_initValueScale, m_initOutputRank, m_initOnCPUOnly);
     }
     else
         LogicError("LearnableParameter: Invalid value of m_initString '%ls' for deferred initialization for %ls.", m_initString.c_str(), NodeDescription().c_str());

@@ -62,7 +62,7 @@ public:
 private:
     // initialize with random numbers
     // If 'initOnCPUOnly' then always init on CPU, making initialization consistent across both (for testing).
-    void InitRandom(const std::wstring& type, const unsigned long randomSeed, const ElemType initValueScale, bool initOnCPUOnly);
+    void InitRandom(const std::wstring& type, const unsigned long randomSeed, const ElemType initValueScale, const int initOutputRank, const bool initOnCPUOnly);
 
     // helper to initialize from a matrix read from a text file or a string literal
     void InitFromArray(const std::vector<ElemType>& array, size_t numRows, size_t numCols);
@@ -103,6 +103,7 @@ private:
     std::wstring m_initString; // if non-empty then deferred initialization is needed. Gets cleared upon completion of deferred init.
     unsigned long m_randomSeed;
     ElemType m_initValueScale;
+    int m_initOutputRank;
     bool m_initOnCPUOnly;
     ElemType m_initValue;
 };