Printing matrix info no longer requires ElemType and has streamlined output;

added parameter dimension inference to BatchNormalizationNode
This commit is contained in:
Frank Seide 2016-08-10 10:47:25 -07:00
Parent d071351c32
Commit 03143a06f7
8 changed files with 79 additions and 56 deletions

View file

@ -1,7 +1,7 @@
# Simple CIFAR-10 convnet
command = TrainConvNet:Eval
#command = TrainConvNetWithBN:Eval
#command = TrainConvNet:Eval
command = TrainConvNetWithBN:Eval
makeMode = false ; traceLevel = 1 ; deviceId = 0
@ -87,16 +87,16 @@ TrainConvNetWithBN = [
model = Sequential (
Subtract128 :
ConvolutionalLayer {32, (5:5), bias = false, init = "gaussian", initValueScale = 0.0043} :
BatchNormalizationLayer {outDim = 32, spatialRank = 2, normalizationTimeConstant = 4096} : ReLU :
BatchNormalizationLayer {spatialRank = 2, normalizationTimeConstant = 4096} : ReLU :
MaxPoolingLayer {(3:3), stride = (2:2)} :
ConvolutionalLayer {32, (5:5), bias = false, init = "gaussian", initValueScale = 1.414} :
BatchNormalizationLayer {outDim = 32, spatialRank = 2, normalizationTimeConstant = 4096} : ReLU :
BatchNormalizationLayer {spatialRank = 2, normalizationTimeConstant = 4096} : ReLU :
MaxPoolingLayer {(3:3), stride = (2:2)} :
ConvolutionalLayer {64, (5:5), bias = false, init = "gaussian", initValueScale = 1.414} :
BatchNormalizationLayer {outDim = 64, spatialRank = 2, normalizationTimeConstant = 4096} : ReLU :
BatchNormalizationLayer {spatialRank = 2, normalizationTimeConstant = 4096} : ReLU :
MaxPoolingLayer {(3:3), stride = (2:2)} :
LinearLayer {64, bias = false, init = "gaussian", initValueScale = 12} :
BatchNormalizationLayer {outDim = 64, normalizationTimeConstant = 4096} : ReLU :
BatchNormalizationLayer {normalizationTimeConstant = 4096} : ReLU :
LinearLayer {labelDim, init = "gaussian", initValueScale = 1.5}
)

View file

@ -138,14 +138,11 @@ DelayLayer {T=1, defaultHiddenActivation=0} =
# BatchNormalizationLayer -- create a batch-normalization layer
BatchNormalizationLayer {spatialRank = 0, # reduce over these dims. E.g. 2 to reduce over (w,h) in a [W x H x C]-shaped input
initialScale = 1,
outDim = BS.Constants.None, # TODO: must be specified for now
normalizationTimeConstant = 0, blendTimeConstant = 0,
epsilon = 0.00001, useCntkEngine = true} =
{
normShape =
if BS.Constants.IsNone (outDim) then Fail ("BatchNormalizationLayer: Currently, outDim= is required.")
#_ConcatArrays (Repeat (spatialRank, 1), 0) # spatial dims get a dimension of 1 (broadcasting, while all others are inferred from input)
else (outDim : 1) # this is how it is currently parameterized. Clean this up to enable inference.
#normShape = _ConcatArrays (Repeat (spatialRank, 1), 0) # spatial dims get a dimension of 1 (broadcasting, while all others are inferred from input)
normShape = (0:1) # TODO: Update this once we support broadcasting-style parameters.
scale = ParameterTensor {normShape, initValue = initialScale}
bias = ParameterTensor {normShape, initValue = 0}
runMean = ParameterTensor {normShape, initValue = 0, learningRateMultiplier = 0} # note: disable learning since these are updated differently

View file

@ -190,7 +190,7 @@ public:
void AllocateAllMatrices(const std::vector<ComputationNodeBasePtr>& evalRootNodes, const std::vector<ComputationNodeBasePtr>& outValueRootNodes, ComputationNodeBasePtr trainRootNode);
private:
template <class ElemType> void PrintMemorySharingStructure(const std::vector<ComputationNodeBasePtr>& nodes);
void PrintMemorySharingStructure(const std::vector<ComputationNodeBasePtr>& nodes);
void ReleaseMatricesAfterEvalForChildren(ComputationNodeBasePtr n, std::unordered_map<ComputationNodeBasePtr, int>& parentCount);
void AllocateGradientMatricesForInputs(ComputationNodeBasePtr parentNode);

View file

@ -803,33 +803,49 @@ void ComputationNetwork::MarkValueNonSharableNodes()
}
}
template <class ElemType>
void ComputationNetwork::PrintMemorySharingStructure(const std::vector<ComputationNodeBasePtr>& nodes)
// print memory-sharing information to log
void ComputationNetwork::PrintMemorySharingStructure(const vector<ComputationNodeBasePtr>& nodes)
{
std::map <const Matrix<ElemType>*, std::set<wstring>> memSharingStructure;
for (auto& n : nodes)
map <const MatrixBase*, set<wstring>> memSharingStructure;
size_t numMatrices = 0;
for (const auto& node : nodes)
{
ComputationNode<ElemType>* node = n->As<ComputationNode<ElemType>>();
std::set<std::pair<const Matrix<ElemType>*, const std::wstring>> matrixInfo = node->GetMatrixInfo();
for (const auto&item : matrixInfo)
set<pair<const MatrixBase*, wstring>> matrixInfo = node->GetMatrixInfo();
for (const auto&item : matrixInfo) // {value} or {value, gradient}
{
const Matrix<ElemType>* matrix = item.first;
if (memSharingStructure.find(matrix) == memSharingStructure.end())
memSharingStructure.insert(std::pair<const Matrix<ElemType>*, std::set<wstring>>(matrix, std::set<wstring>()));
std::set<wstring>& s = memSharingStructure[matrix];
s.insert(item.second);
memSharingStructure[item.first].insert(item.second);
numMatrices++;
}
}
fprintf(stderr, "\nMemory Sharing Structure:\n\n");
// count shared/unshared
size_t numShared = 0;
size_t numUnshared = 0;
for (const auto& item : memSharingStructure)
{
const std::set<wstring>& s = item.second;
fprintf(stderr, "%p: {", item.first);
for (const auto& memShareInfo: s)
if (item.second.size() < 2) // only print actually shared matrices
numUnshared++;
else
numShared++;
}
fprintf(stderr, "\nMemory Sharing: Out of %d matrices, %d are shared as %d, and %d are not shared.\n\n", (int)numMatrices, (int)(numMatrices - numUnshared), (int)numShared, (int)numUnshared);
for (const auto& item : memSharingStructure)
{
fprintf(stderr, "[%ls] ", memShareInfo.c_str());
if (item.second.size() < 2) // only print actually shared matrices
continue;
// Format:
// { node1
// node2 }
// { node3
// node4
// node5 }
// where unshared nodes are not printed.
const char* delim = "\t{ ";
for (const auto& memShareInfo : item.second)
{
fprintf(stderr, "%s%ls", delim, memShareInfo.c_str());
delim = "\n\t ";
}
fprintf(stderr, " }\n");
}
@ -987,16 +1003,7 @@ void ComputationNetwork::AllocateAllMatrices(const std::vector<ComputationNodeBa
m_areMatricesAllocated = true;
// print the memory sharing structure
std::vector<ComputationNodeBasePtr> allNodes = GetAllNodes();
if (allNodes.size() == 0)
LogicError("Network has no computation node.");
if (allNodes[0]->Is<ComputationNode<float>>())
PrintMemorySharingStructure<float>(allNodes);
else if (allNodes[0]->Is<ComputationNode<double>>())
PrintMemorySharingStructure<double>(allNodes);
else
LogicError("Unexpected node precision type.");
PrintMemorySharingStructure(GetAllNodes());
}
void ComputationNetwork::ReleaseMatricesAfterEvalForChildren(ComputationNodeBasePtr n, std::unordered_map<ComputationNodeBasePtr, int>& parentCount)
@ -1009,4 +1016,5 @@ void ComputationNetwork::ReleaseMatricesAfterEvalForChildren(ComputationNodeBase
pNode->ReleaseMatricesAfterForwardProp(m_matrixPool);
}
}
}}}

View file

@ -645,6 +645,8 @@ public:
ComputationEnvironmentPtr GetEnvironmentPtr() const { return m_environment; }
void SetEnvironment(ComputationEnvironmentPtr environment) { m_environment = environment; }
virtual std::set<std::pair<const MatrixBase*, std::wstring>> GetMatrixInfo() const = 0; // to be defined by <ElemType> version
// -----------------------------------------------------------------------
// validation
// -----------------------------------------------------------------------
@ -1462,13 +1464,14 @@ public:
// memory sharing
// -----------------------------------------------------------------------
//this function is for displaying memeory sharing information
// helper function for formatting memory sharing information
// TODO: customize this function for all nodes that uses temp internal matrices.
virtual std::set<std::pair<const Matrix<ElemType>*, const std::wstring>> GetMatrixInfo()
virtual std::set<std::pair<const MatrixBase*, std::wstring>> GetMatrixInfo() const override
{
std::set<std::pair<const Matrix<ElemType>*, const std::wstring>> matrixInfo;
matrixInfo.insert(make_pair(&Value(), NodeName() + L" Value" + msra::strfun::utf16(ShapeDescription())));
matrixInfo.insert(make_pair(&Gradient(), NodeName() + L" Gradient" + msra::strfun::utf16(ShapeDescription())));
std::set<std::pair<const MatrixBase*, std::wstring>> matrixInfo;
matrixInfo.insert (make_pair(&Value(), NodeName() + L" : " + msra::strfun::utf16(ShapeDescription())));
if (NeedsGradient())
matrixInfo.insert(make_pair(&Gradient(), NodeName() + L" : " + msra::strfun::utf16(ShapeDescription()) + L" (gradient)"));
return matrixInfo;
}
@ -1868,6 +1871,7 @@ public:
virtual bool RequiresPreCompute() const override { return false; } // return true if the node's value should be computed before the normal training. e.g., mean and invStd of input features.
virtual std::string FormatOperationPrototype(const std::string& extraArgs) const override { return ""; }
virtual void DumpNodeInfo(const bool /*printValues*/, const bool /*printMetadata*/, File& fstream) const override {}
virtual std::set<std::pair<const MatrixBase*, std::wstring>> GetMatrixInfo() const override { NOT_IMPLEMENTED; }
protected: public: // needed in ComputationNetwork::FindInRecurrentLoops(), which really should be part of SEQTraversalFlowControlNode
std::vector<ComputationNodeBasePtr> m_nestedNodes; // nodes tucked away in this node, in evaluation order

View file

@ -454,7 +454,7 @@ public:
//LogicError("Convolution weight matrix %ls should have dimension [%d, %d] which is [kernelCount, kernelWidth * kernelHeight * inputChannels]",
// Input(0)->NodeName().c_str(), (int)m_convEng->Geometry()->KernelCount(), (int)m_kernelShape.GetNumElements());
LogicError("Convolution weight matrix %ls should have dimension [(filter shape) x (input channels) x (output channels)]",
Input(0)->NodeName().c_str(), (int)m_convEng->Geometry()->KernelCount(), (int)m_kernelShape.GetNumElements());
Input(0)->NodeName().c_str());
}
}
}

View file

@ -1556,6 +1556,8 @@ template class DropoutNode<double>;
// * scale is a LearnableParameter that stores scale vector (gamma term in the equation above).
// * bias is a LearnableParameter that stores bias vector (beta term). scale and bias must have the same dimensions which must be equal
// to the input dimensions in case of spatial = false or number of output convolution feature maps in case of spatial = true.
// BUGBUG: Number of convolution feature maps are considered the last axis of the input.
// More correct would be to infer that from broadcasting dimensions (spatial mode is broadcasting).
// * runMean is the running mean which is used during evaluation phase and might be used during training as well.
// It is represented as a LearnableParameter with the same dimensions as scale and bias.
// * runInvStdDev is the running inverse square root of variance(so InvStdDev = 1 / sqrt(var + epsilon)).
@ -1825,10 +1827,23 @@ public:
SetDims(Input(0));
// BUGBUG: Parameter dimensions are totally wrong. E.g. a valid spatial bias for [15 x 15 x 32] is currently [32 x 1].
// The correct bias shape should be [1 x 1 x 32].
#if 0 // This does not work.
const auto& inputLayout = Input(0)->GetSampleLayout();
// infer dimensions of learnable parameters
// BUGBUG: Parameter dimensions are totally wrong. E.g. a valid spatial bias for [15 x 15 x 32] is currently [32 x 1].
// The correct bias shape should be [1 x 1 x 32]. That can be specified but leads to different results for unknown reasons.
// Until this has been corrected, we need a workaround that infers the wrong dimensions.
#if 1 // Workaround for today's definition: Trigger on [0 x 1] and infer that 0 as the total # elements needed.
for (size_t i = 1; i < GetNumInputs(); i++)
{
auto paramLayout = Input(i)->GetSampleLayout();
if (paramLayout.GetRank() == 2 && paramLayout[0] == 0 && paramLayout[1] == 1 && inputLayout.GetNumElements() > 0) // [0 x 1]
{
size_t total = m_spatial ? inputLayout.GetDims().back() : inputLayout.GetNumElements();
Input(i)->ValidateInferInputDimsFrom(TensorShape(total, 1));
}
}
#else
// These are here only inferred like for elementwise operations. We must check more.
ValidateNaryZip(isFinalValidationPass, /*allowBroadcast=*/ true, GetNumInputs());
#endif
@ -1836,7 +1851,6 @@ public:
if (isFinalValidationPass)
{
// check inputs
auto inputLayout = Input(0)->GetSampleLayout();
for (size_t i = 1; i < GetNumInputs(); i++)
{
if (Input(i)->HasMBLayout())
@ -1844,7 +1858,7 @@ public:
auto paramLayout = Input(i)->GetSampleLayout();
if (paramLayout != Input(1)->GetSampleLayout())
InvalidArgument("%ls: Input[%d] has a layout different from Input[1]. All must be identical.", NodeDescription().c_str(), (int)i);
#if 0 // This does not work. E.g. a valid spatial bias for [15 x 15 x 32] is currently [32 x 1], which is totally wrong.
#if 0 // BUGBUG: For this to work, parameter shapes must be correct (cf. comment above on inference).
if (paramLayout.GetRank() > inputLayout.GetRank())
InvalidArgument("%ls: Input[%d] has a tensor rank greated than the data input.", NodeDescription().c_str(), (int)i);
for (size_t k = 0; k < paramLayout.size(); k++)

View file

@ -276,7 +276,7 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
numNeedsGradient++;
}
fprintf(stderr, "\n");
LOGPRINTF(stderr, "Training %.0f parameters in %d out of %d parameter tensors and %d nodes with gradient:\n",
LOGPRINTF(stderr, "Training %.0f parameters in %d out of %d parameter tensors and %d nodes with gradient:\n\n",
(double)numParameters, (int)nodesToUpdateDescriptions.size(), (int)learnableNodes.size(), (int)numNeedsGradient);
for (let nodeDescription : nodesToUpdateDescriptions)
{