printing matrix info no longer requires ElemType and has streamlined output;

added parameter dimension inference to BatchNormalizationNode
2016-08-10 10:47:25 -07:00 · 2016-08-10 10:47:25 -07:00 · 03143a06f7
--- a/Examples/Image/Miscellaneous/CIFAR-10/TutorialImage.cntk
+++ b/Examples/Image/Miscellaneous/CIFAR-10/TutorialImage.cntk
@ -1,7 +1,7 @@
 # Simple CIFAR-10 convnet

-command = TrainConvNet:Eval
-#command = TrainConvNetWithBN:Eval
+#command = TrainConvNet:Eval
+command = TrainConvNetWithBN:Eval

 makeMode = false ; traceLevel = 1 ; deviceId = 0

@ -87,16 +87,16 @@ TrainConvNetWithBN = [
        model = Sequential (
            Subtract128 :
            ConvolutionalLayer {32, (5:5), bias = false, init = "gaussian", initValueScale = 0.0043} :
-              BatchNormalizationLayer {outDim = 32, spatialRank = 2, normalizationTimeConstant = 4096} : ReLU :
+              BatchNormalizationLayer {spatialRank = 2, normalizationTimeConstant = 4096} : ReLU :
                MaxPoolingLayer {(3:3), stride = (2:2)} :
            ConvolutionalLayer {32, (5:5), bias = false, init = "gaussian", initValueScale = 1.414} :
-              BatchNormalizationLayer {outDim = 32, spatialRank = 2, normalizationTimeConstant = 4096} : ReLU :
+              BatchNormalizationLayer {spatialRank = 2, normalizationTimeConstant = 4096} : ReLU :
                MaxPoolingLayer {(3:3), stride = (2:2)} :
            ConvolutionalLayer {64, (5:5), bias = false, init = "gaussian", initValueScale = 1.414} :
-              BatchNormalizationLayer {outDim = 64, spatialRank = 2, normalizationTimeConstant = 4096} : ReLU :
+              BatchNormalizationLayer {spatialRank = 2, normalizationTimeConstant = 4096} : ReLU :
                MaxPoolingLayer {(3:3), stride = (2:2)} :
            LinearLayer {64, bias = false, init = "gaussian", initValueScale = 12} :
-              BatchNormalizationLayer {outDim = 64, normalizationTimeConstant = 4096} : ReLU :
+              BatchNormalizationLayer {normalizationTimeConstant = 4096} : ReLU :
            LinearLayer {labelDim, init = "gaussian", initValueScale = 1.5}
        )

--- a/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs
+++ b/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs
@ -138,14 +138,11 @@ DelayLayer {T=1, defaultHiddenActivation=0} =
 # BatchNormalizationLayer -- create a batch-normalization layer
 BatchNormalizationLayer {spatialRank = 0,  # reduce over these dims. E.g. 2 to reduce over (w,h) in a [W x H x C]-shaped input
                         initialScale = 1,
-                         outDim = BS.Constants.None,   # TODO: must be specified for now
                         normalizationTimeConstant = 0, blendTimeConstant = 0,
                         epsilon = 0.00001, useCntkEngine = true} =
 {
-    normShape =
-        if BS.Constants.IsNone (outDim) then Fail ("BatchNormalizationLayer: Currently, outDim= is required.")
-            #_ConcatArrays (Repeat (spatialRank, 1), 0) # spatial dims get a dimension of 1 (broadcasting, while all others are inferred from input)
-        else (outDim : 1)   # this is how it is currently parameterized. Clean this up to enable inference.
+    #normShape = _ConcatArrays (Repeat (spatialRank, 1), 0) # spatial dims get a dimension of 1 (broadcasting, while all others are inferred from input)
+    normShape = (0:1)  # TODO: Update this once we support broadcasting-style parameters.
    scale        = ParameterTensor {normShape, initValue = initialScale}
    bias         = ParameterTensor {normShape, initValue = 0}
    runMean      = ParameterTensor {normShape, initValue = 0, learningRateMultiplier = 0} # note: disable learning since these are updated differently
--- a/Source/ComputationNetworkLib/ComputationNetwork.h
+++ b/Source/ComputationNetworkLib/ComputationNetwork.h
@ -190,7 +190,7 @@ public:
    void AllocateAllMatrices(const std::vector<ComputationNodeBasePtr>& evalRootNodes, const std::vector<ComputationNodeBasePtr>& outValueRootNodes, ComputationNodeBasePtr trainRootNode);

 private:
-    template <class ElemType> void PrintMemorySharingStructure(const std::vector<ComputationNodeBasePtr>& nodes);
+    void PrintMemorySharingStructure(const std::vector<ComputationNodeBasePtr>& nodes);
    void ReleaseMatricesAfterEvalForChildren(ComputationNodeBasePtr n, std::unordered_map<ComputationNodeBasePtr, int>& parentCount);
    void AllocateGradientMatricesForInputs(ComputationNodeBasePtr parentNode);

--- a/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp
+++ b/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp
@ -803,35 +803,51 @@ void ComputationNetwork::MarkValueNonSharableNodes()
    }
 }

-template <class ElemType>
-void ComputationNetwork::PrintMemorySharingStructure(const std::vector<ComputationNodeBasePtr>& nodes)
+// print memory-sharing information to log
+void ComputationNetwork::PrintMemorySharingStructure(const vector<ComputationNodeBasePtr>& nodes)
 {
-    std::map <const Matrix<ElemType>*, std::set<wstring>> memSharingStructure;
-    for (auto& n : nodes)
+    map <const MatrixBase*, set<wstring>> memSharingStructure;
+    size_t numMatrices = 0;
+    for (const auto& node : nodes)
    {
-        ComputationNode<ElemType>* node = n->As<ComputationNode<ElemType>>();
-        std::set<std::pair<const Matrix<ElemType>*, const std::wstring>> matrixInfo = node->GetMatrixInfo();
-        for (const auto&item : matrixInfo)
+        set<pair<const MatrixBase*, wstring>> matrixInfo = node->GetMatrixInfo();
+        for (const auto&item : matrixInfo) // {value} or {value, gradient}
        {
-            const Matrix<ElemType>* matrix = item.first;
-            if (memSharingStructure.find(matrix) == memSharingStructure.end())
-                memSharingStructure.insert(std::pair<const Matrix<ElemType>*, std::set<wstring>>(matrix, std::set<wstring>()));
-
-            std::set<wstring>& s = memSharingStructure[matrix];
-            s.insert(item.second);
+            memSharingStructure[item.first].insert(item.second);
+            numMatrices++;
        }
    }

-    fprintf(stderr, "\nMemory Sharing Structure:\n\n");
+    // count shared/unshared
+    size_t numShared = 0;
+    size_t numUnshared = 0;
    for (const auto& item : memSharingStructure)
    {
-        const std::set<wstring>& s = item.second;
-        fprintf(stderr, "%p: {", item.first);
-        for (const auto& memShareInfo: s)
+        if (item.second.size() < 2) // referenced by a single entry only, i.e. not shared
+            numUnshared++;
+        else
+            numShared++;
+    }
+
+    fprintf(stderr, "\nMemory Sharing: Out of %d matrices, %d are shared as %d, and %d are not shared.\n\n", (int)numMatrices, (int)(numMatrices - numUnshared), (int)numShared, (int)numUnshared);
+    for (const auto& item : memSharingStructure)
+    {
+        if (item.second.size() < 2) // only print actually shared matrices
+            continue;
+        // Format:
+        // { node1
+        //   node2 }
+        // { node3
+        //   node4
+        //   node5 }
+        // where each brace group lists the entries that share one matrix; unshared matrices are not printed.
+        const char* delim = "\t{ ";
+        for (const auto& memShareInfo : item.second)
        {
-            fprintf(stderr, "[%ls] ", memShareInfo.c_str());
+            fprintf(stderr, "%s%ls", delim, memShareInfo.c_str());
+            delim = "\n\t  ";
        }
-        fprintf(stderr, "}\n");
+        fprintf(stderr, " }\n");
    }
    fprintf(stderr, "\n");
 }
@ -986,17 +1002,8 @@ void ComputationNetwork::AllocateAllMatrices(const std::vector<ComputationNodeBa

    m_areMatricesAllocated = true;

-    //print the memory sharing structure
-    std::vector<ComputationNodeBasePtr> allNodes = GetAllNodes();
-    if (allNodes.size() == 0)
-        LogicError("Network has no computation node.");
-
-    if (allNodes[0]->Is<ComputationNode<float>>())
-        PrintMemorySharingStructure<float>(allNodes);
-    else if (allNodes[0]->Is<ComputationNode<double>>())
-        PrintMemorySharingStructure<double>(allNodes);
-    else
-        LogicError("Unexpected node precision type.");
+    // print the memory sharing structure
+    PrintMemorySharingStructure(GetAllNodes());
 }

 void ComputationNetwork::ReleaseMatricesAfterEvalForChildren(ComputationNodeBasePtr n, std::unordered_map<ComputationNodeBasePtr, int>& parentCount)
@ -1009,4 +1016,5 @@ void ComputationNetwork::ReleaseMatricesAfterEvalForChildren(ComputationNodeBase
            pNode->ReleaseMatricesAfterForwardProp(m_matrixPool);
    }
 }
-} } }
+
+}}}
--- a/Source/ComputationNetworkLib/ComputationNode.h
+++ b/Source/ComputationNetworkLib/ComputationNode.h
@ -645,6 +645,8 @@ public:
    ComputationEnvironmentPtr GetEnvironmentPtr() const { return m_environment; }
    void SetEnvironment(ComputationEnvironmentPtr environment) { m_environment = environment; }

+    virtual std::set<std::pair<const MatrixBase*, std::wstring>> GetMatrixInfo() const = 0; // to be defined by <ElemType> version
+
    // -----------------------------------------------------------------------
    // validation
    // -----------------------------------------------------------------------
@ -1462,13 +1464,14 @@ public:
    // memory sharing
    // -----------------------------------------------------------------------

-    //this function is for displaying memeory sharing information
-    //TODO: customize this function for all nodes that uses temp internal matrices.
-    virtual std::set<std::pair<const Matrix<ElemType>*, const std::wstring>> GetMatrixInfo()
+    // helper function for formatting memory sharing information
+    // TODO: customize this function for all nodes that use temporary internal matrices.
+    virtual std::set<std::pair<const MatrixBase*, std::wstring>> GetMatrixInfo() const override
    {
-        std::set<std::pair<const Matrix<ElemType>*, const std::wstring>> matrixInfo;
-        matrixInfo.insert(make_pair(&Value(),    NodeName() + L" Value"    + msra::strfun::utf16(ShapeDescription())));
-        matrixInfo.insert(make_pair(&Gradient(), NodeName() + L" Gradient" + msra::strfun::utf16(ShapeDescription())));
+        std::set<std::pair<const MatrixBase*, std::wstring>> matrixInfo;
+        matrixInfo.insert    (make_pair(&Value(),    NodeName() + L" : " + msra::strfun::utf16(ShapeDescription())));
+        if (NeedsGradient())
+            matrixInfo.insert(make_pair(&Gradient(), NodeName() + L" : " + msra::strfun::utf16(ShapeDescription()) + L" (gradient)"));
        return matrixInfo;
    }

@ -1868,6 +1871,7 @@ public:
    virtual bool RequiresPreCompute() const override { return false; } // return true if the node's value should be computed before the normal training. e.g., mean and invStd of input features.
    virtual std::string FormatOperationPrototype(const std::string& extraArgs) const override { return ""; }
    virtual void DumpNodeInfo(const bool /*printValues*/, const bool /*printMetadata*/, File& fstream) const override {}
+    virtual std::set<std::pair<const MatrixBase*, std::wstring>> GetMatrixInfo() const override { NOT_IMPLEMENTED; }

 protected: public:                                     // needed in ComputationNetwork::FindInRecurrentLoops(), which really should be part of SEQTraversalFlowControlNode
    std::vector<ComputationNodeBasePtr> m_nestedNodes; // nodes tucked away in this node, in evaluation order
--- a/Source/ComputationNetworkLib/ConvolutionalNodes.h
+++ b/Source/ComputationNetworkLib/ConvolutionalNodes.h
@ -454,7 +454,7 @@ public:
                //LogicError("Convolution weight matrix %ls should have dimension [%d, %d] which is [kernelCount, kernelWidth * kernelHeight * inputChannels]",
                //           Input(0)->NodeName().c_str(), (int)m_convEng->Geometry()->KernelCount(), (int)m_kernelShape.GetNumElements());
                LogicError("Convolution weight matrix %ls should have dimension [(filter shape) x (input channels) x (output channels)]",
-                           Input(0)->NodeName().c_str(), (int)m_convEng->Geometry()->KernelCount(), (int)m_kernelShape.GetNumElements());
+                           Input(0)->NodeName().c_str());
            }
        }
    }
--- a/Source/ComputationNetworkLib/TrainingNodes.h
+++ b/Source/ComputationNetworkLib/TrainingNodes.h
@ -1556,6 +1556,8 @@ template class DropoutNode<double>;
 // * scale is a LearnableParameter that stores scale vector (gamma term in the equation above).
 // * bias is a LearnableParameter that stores bias vector (beta term). scale and bias must have the same dimensions which must be equal 
 //      to the input dimensions in case of spatial = false or number of output convolution feature maps in case of spatial = true.
+//      BUGBUG: The number of convolution feature maps is assumed to be the size of the last axis of the input.
+//              It would be more correct to infer it from the broadcasting dimensions (spatial mode is broadcasting).
 // * runMean is the running mean which is used during evaluation phase and might be used during training as well.
 //      It is represented as a LearnableParameter with the same dimensions as scale and bias.
 // * runInvStdDev is the running inverse square root of variance(so InvStdDev = 1 / sqrt(var + epsilon)).
@ -1825,10 +1827,23 @@ public:

        SetDims(Input(0));

-        // BUGBUG: Parameter dimensions are totally wrong. E.g. a valid spatial bias for [15 x 15 x 32] is currently [32 x 1].
-        //         The correct bias shape should be [1 x 1 x 32].
-#if 0   // This does not work.
+        const auto& inputLayout = Input(0)->GetSampleLayout();
+
        // infer dimensions of learnable parameters
+        // BUGBUG: Parameter dimensions are totally wrong. E.g. a valid spatial bias for [15 x 15 x 32] is currently [32 x 1].
+        //         The correct bias shape should be [1 x 1 x 32]. That can be specified but leads to different results for unknown reasons.
+        //         Until this has been corrected, we need a workaround that infers the wrong dimensions.
+#if 1   // Workaround for today's definition: Trigger on [0 x 1] and infer that 0 as the total # elements needed.
+        for (size_t i = 1; i < GetNumInputs(); i++)
+        {
+            auto paramLayout = Input(i)->GetSampleLayout();
+            if (paramLayout.GetRank() == 2 && paramLayout[0] == 0 && paramLayout[1] == 1 && inputLayout.GetNumElements() > 0) // [0 x 1]
+            {
+                size_t total = m_spatial ? inputLayout.GetDims().back() : inputLayout.GetNumElements();
+                Input(i)->ValidateInferInputDimsFrom(TensorShape(total, 1));
+            }
+        }
+#else
        // These are here only inferred like for elementwise operations. We must check more.
        ValidateNaryZip(isFinalValidationPass, /*allowBroadcast=*/ true, GetNumInputs());
 #endif
@ -1836,7 +1851,6 @@ public:
        if (isFinalValidationPass)
        {
            // check inputs
-            auto inputLayout = Input(0)->GetSampleLayout();
            for (size_t i = 1; i < GetNumInputs(); i++)
            {
                if (Input(i)->HasMBLayout())
@ -1844,7 +1858,7 @@ public:
                auto paramLayout = Input(i)->GetSampleLayout();
                if (paramLayout != Input(1)->GetSampleLayout())
                    InvalidArgument("%ls: Input[%d] has a layout different from Input[1]. All must be identical.", NodeDescription().c_str(), (int)i);
-#if 0   // This does not work. E.g. a valid spatial bias for [15 x 15 x 32] is currently [32 x 1], which is totally wrong.
+#if 0           // BUGBUG: For this to work, parameter shapes must be correct (cf. comment above on inference).
                if (paramLayout.GetRank() > inputLayout.GetRank())
                    InvalidArgument("%ls: Input[%d] has a tensor rank greated than the data input.", NodeDescription().c_str(), (int)i);
                for (size_t k = 0; k < paramLayout.size(); k++)
--- a/Source/SGDLib/SGD.cpp
+++ b/Source/SGDLib/SGD.cpp
@ -276,7 +276,7 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
            numNeedsGradient++;
    }
    fprintf(stderr, "\n");
-    LOGPRINTF(stderr, "Training %.0f parameters in %d out of %d parameter tensors and %d nodes with gradient:\n",
+    LOGPRINTF(stderr, "Training %.0f parameters in %d out of %d parameter tensors and %d nodes with gradient:\n\n",
              (double)numParameters, (int)nodesToUpdateDescriptions.size(), (int)learnableNodes.size(), (int)numNeedsGradient);
    for (let nodeDescription : nodesToUpdateDescriptions)
    {