printing matrix info no longer requires ElemType and has streamlined output;
added parameter dimension inference to BatchNormalizationNode
Parent: d071351c32
Commit: 03143a06f7
@@ -1,7 +1,7 @@
 # Simple CIFAR-10 convnet

-command = TrainConvNet:Eval
-#command = TrainConvNetWithBN:Eval
+#command = TrainConvNet:Eval
+command = TrainConvNetWithBN:Eval

 makeMode = false ; traceLevel = 1 ; deviceId = 0

@@ -87,16 +87,16 @@ TrainConvNetWithBN = [
     model = Sequential (
         Subtract128 :
         ConvolutionalLayer {32, (5:5), bias = false, init = "gaussian", initValueScale = 0.0043} :
-        BatchNormalizationLayer {outDim = 32, spatialRank = 2, normalizationTimeConstant = 4096} : ReLU :
+        BatchNormalizationLayer {spatialRank = 2, normalizationTimeConstant = 4096} : ReLU :
         MaxPoolingLayer {(3:3), stride = (2:2)} :
         ConvolutionalLayer {32, (5:5), bias = false, init = "gaussian", initValueScale = 1.414} :
-        BatchNormalizationLayer {outDim = 32, spatialRank = 2, normalizationTimeConstant = 4096} : ReLU :
+        BatchNormalizationLayer {spatialRank = 2, normalizationTimeConstant = 4096} : ReLU :
         MaxPoolingLayer {(3:3), stride = (2:2)} :
         ConvolutionalLayer {64, (5:5), bias = false, init = "gaussian", initValueScale = 1.414} :
-        BatchNormalizationLayer {outDim = 64, spatialRank = 2, normalizationTimeConstant = 4096} : ReLU :
+        BatchNormalizationLayer {spatialRank = 2, normalizationTimeConstant = 4096} : ReLU :
         MaxPoolingLayer {(3:3), stride = (2:2)} :
         LinearLayer {64, bias = false, init = "gaussian", initValueScale = 12} :
-        BatchNormalizationLayer {outDim = 64, normalizationTimeConstant = 4096} : ReLU :
+        BatchNormalizationLayer {normalizationTimeConstant = 4096} : ReLU :
         LinearLayer {labelDim, init = "gaussian", initValueScale = 1.5}
     )

@@ -138,14 +138,11 @@ DelayLayer {T=1, defaultHiddenActivation=0} =
 # BatchNormalizationLayer -- create a batch-normalization layer
 BatchNormalizationLayer {spatialRank = 0,  # reduce over these dims. E.g. 2 to reduce over (w,h) in a [W x H x C]-shaped input
                          initialScale = 1,
-                         outDim = BS.Constants.None,  # TODO: must be specified for now
                          normalizationTimeConstant = 0, blendTimeConstant = 0,
                          epsilon = 0.00001, useCntkEngine = true} =
 {
-    normShape =
-        if BS.Constants.IsNone (outDim) then Fail ("BatchNormalizationLayer: Currently, outDim= is required.")
-        #_ConcatArrays (Repeat (spatialRank, 1), 0)  # spatial dims get a dimension of 1 (broadcasting, while all others are inferred from input)
-        else (outDim : 1)  # this is how it is currently parameterized. Clean this up to enable inference.
+    #normShape = _ConcatArrays (Repeat (spatialRank, 1), 0)  # spatial dims get a dimension of 1 (broadcasting, while all others are inferred from input)
+    normShape = (0:1)  # TODO: Update this once we support broadcasting-style parameters.
     scale   = ParameterTensor {normShape, initValue = initialScale}
     bias    = ParameterTensor {normShape, initValue = 0}
     runMean = ParameterTensor {normShape, initValue = 0, learningRateMultiplier = 0}  # note: disable learning since these are updated differently

@@ -190,7 +190,7 @@ public:
     void AllocateAllMatrices(const std::vector<ComputationNodeBasePtr>& evalRootNodes, const std::vector<ComputationNodeBasePtr>& outValueRootNodes, ComputationNodeBasePtr trainRootNode);

 private:
-    template <class ElemType> void PrintMemorySharingStructure(const std::vector<ComputationNodeBasePtr>& nodes);
+    void PrintMemorySharingStructure(const std::vector<ComputationNodeBasePtr>& nodes);
     void ReleaseMatricesAfterEvalForChildren(ComputationNodeBasePtr n, std::unordered_map<ComputationNodeBasePtr, int>& parentCount);
     void AllocateGradientMatricesForInputs(ComputationNodeBasePtr parentNode);

@@ -803,33 +803,49 @@ void ComputationNetwork::MarkValueNonSharableNodes()
     }
 }

-template <class ElemType>
-void ComputationNetwork::PrintMemorySharingStructure(const std::vector<ComputationNodeBasePtr>& nodes)
+// print memory-sharing information to log
+void ComputationNetwork::PrintMemorySharingStructure(const vector<ComputationNodeBasePtr>& nodes)
 {
-    std::map <const Matrix<ElemType>*, std::set<wstring>> memSharingStructure;
-    for (auto& n : nodes)
+    map <const MatrixBase*, set<wstring>> memSharingStructure;
+    size_t numMatrices = 0;
+    for (const auto& node : nodes)
     {
-        ComputationNode<ElemType>* node = n->As<ComputationNode<ElemType>>();
-        std::set<std::pair<const Matrix<ElemType>*, const std::wstring>> matrixInfo = node->GetMatrixInfo();
-        for (const auto&item : matrixInfo)
+        set<pair<const MatrixBase*, wstring>> matrixInfo = node->GetMatrixInfo();
+        for (const auto&item : matrixInfo) // {value} or {value, gradient}
         {
-            const Matrix<ElemType>* matrix = item.first;
-            if (memSharingStructure.find(matrix) == memSharingStructure.end())
-                memSharingStructure.insert(std::pair<const Matrix<ElemType>*, std::set<wstring>>(matrix, std::set<wstring>()));
-
-            std::set<wstring>& s = memSharingStructure[matrix];
-            s.insert(item.second);
+            memSharingStructure[item.first].insert(item.second);
+            numMatrices++;
         }
     }

-    fprintf(stderr, "\nMemory Sharing Structure:\n\n");
+    // count shared/unshared
+    size_t numShared = 0;
+    size_t numUnshared = 0;
     for (const auto& item : memSharingStructure)
     {
-        const std::set<wstring>& s = item.second;
-        fprintf(stderr, "%p: {", item.first);
-        for (const auto& memShareInfo: s)
-            fprintf(stderr, "[%ls] ", memShareInfo.c_str());
+        if (item.second.size() < 2) // only print actually shared matrices
+            numUnshared++;
+        else
+            numShared++;
     }

+    fprintf(stderr, "\nMemory Sharing: Out of %d matrices, %d are shared as %d, and %d are not shared.\n\n", (int)numMatrices, (int)(numMatrices - numUnshared), (int)numShared, (int)numUnshared);
+    for (const auto& item : memSharingStructure)
+    {
+        if (item.second.size() < 2) // only print actually shared matrices
+            continue;
+        // Format:
+        // { node1
+        //   node2 }
+        // { node3
+        //   node4
+        //   node5 }
+        // where unshared nodes are not printed.
+        const char* delim = "\t{ ";
+        for (const auto& memShareInfo : item.second)
+        {
+            fprintf(stderr, "%s%ls", delim, memShareInfo.c_str());
+            delim = "\n\t ";
+        }
+        fprintf(stderr, " }\n");
+    }

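The streamlined report above hinges on one data structure: a map keyed by the raw matrix pointer, so every description that lands under the same key is, by construction, backed by the same buffer. Below is a minimal standalone sketch of that grouping and the shared/unshared counting; it uses void* and std::string as stand-ins for MatrixBase* and the node descriptions, and the node names are made up for illustration, not CNTK's.

    #include <cstdio>
    #include <map>
    #include <set>
    #include <string>
    #include <utility>
    #include <vector>

    int main()
    {
        // Pretend matrix buffers; in CNTK these would be MatrixBase* owned by the nodes.
        int bufA = 0, bufB = 0, bufC = 0;

        // (matrix pointer, "node : shape") pairs, as GetMatrixInfo() reports them.
        std::vector<std::pair<const void*, std::string>> matrixInfo = {
            { &bufA, "conv1.W : [32 x 75]" },
            { &bufA, "conv2.W : [32 x 800] (gradient)" }, // same buffer -> shared
            { &bufB, "bn1.scale : [32 x 1]" },
            { &bufC, "fc.W : [10 x 64]" },
            { &bufC, "fc.b : [10 x 1] (gradient)" },      // same buffer -> shared
        };

        // Group descriptions by the buffer they live in.
        std::map<const void*, std::set<std::string>> memSharingStructure;
        size_t numMatrices = 0;
        for (const auto& item : matrixInfo)
        {
            memSharingStructure[item.first].insert(item.second);
            numMatrices++;
        }

        // Count shared vs. unshared buffers, then print only the shared groups.
        size_t numShared = 0, numUnshared = 0;
        for (const auto& item : memSharingStructure)
            (item.second.size() < 2 ? numUnshared : numShared)++;

        std::printf("Out of %d matrices, %d are shared as %d, and %d are not shared.\n",
                    (int)numMatrices, (int)(numMatrices - numUnshared), (int)numShared, (int)numUnshared);
        for (const auto& item : memSharingStructure)
        {
            if (item.second.size() < 2)
                continue;
            const char* delim = "\t{ ";
            for (const auto& name : item.second)
            {
                std::printf("%s%s", delim, name.c_str());
                delim = "\n\t  ";
            }
            std::printf(" }\n");
        }
        return 0;
    }
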
@@ -987,16 +1003,7 @@ void ComputationNetwork::AllocateAllMatrices(const std::vector<ComputationNodeBa
     m_areMatricesAllocated = true;

     // print the memory sharing structure
-    std::vector<ComputationNodeBasePtr> allNodes = GetAllNodes();
-    if (allNodes.size() == 0)
-        LogicError("Network has no computation node.");
-
-    if (allNodes[0]->Is<ComputationNode<float>>())
-        PrintMemorySharingStructure<float>(allNodes);
-    else if (allNodes[0]->Is<ComputationNode<double>>())
-        PrintMemorySharingStructure<double>(allNodes);
-    else
-        LogicError("Unexpected node precision type.");
+    PrintMemorySharingStructure(GetAllNodes());
 }

 void ComputationNetwork::ReleaseMatricesAfterEvalForChildren(ComputationNodeBasePtr n, std::unordered_map<ComputationNodeBasePtr, int>& parentCount)

@@ -1009,4 +1016,5 @@ void ComputationNetwork::ReleaseMatricesAfterEvalForChildren(ComputationNodeBase
             pNode->ReleaseMatricesAfterForwardProp(m_matrixPool);
     }
 }

 }}}

@@ -645,6 +645,8 @@ public:
     ComputationEnvironmentPtr GetEnvironmentPtr() const { return m_environment; }
     void SetEnvironment(ComputationEnvironmentPtr environment) { m_environment = environment; }

+    virtual std::set<std::pair<const MatrixBase*, std::wstring>> GetMatrixInfo() const = 0; // to be defined by <ElemType> version
+
     // -----------------------------------------------------------------------
     // validation
     // -----------------------------------------------------------------------

@@ -1462,13 +1464,14 @@ public:
     // memory sharing
     // -----------------------------------------------------------------------

-    //this function is for displaying memeory sharing information
+    // helper function for formatting memory sharing information
     // TODO: customize this function for all nodes that uses temp internal matrices.
-    virtual std::set<std::pair<const Matrix<ElemType>*, const std::wstring>> GetMatrixInfo()
+    virtual std::set<std::pair<const MatrixBase*, std::wstring>> GetMatrixInfo() const override
     {
-        std::set<std::pair<const Matrix<ElemType>*, const std::wstring>> matrixInfo;
-        matrixInfo.insert(make_pair(&Value(), NodeName() + L" Value" + msra::strfun::utf16(ShapeDescription())));
-        matrixInfo.insert(make_pair(&Gradient(), NodeName() + L" Gradient" + msra::strfun::utf16(ShapeDescription())));
+        std::set<std::pair<const MatrixBase*, std::wstring>> matrixInfo;
+        matrixInfo.insert (make_pair(&Value(), NodeName() + L" : " + msra::strfun::utf16(ShapeDescription())));
+        if (NeedsGradient())
+            matrixInfo.insert(make_pair(&Gradient(), NodeName() + L" : " + msra::strfun::utf16(ShapeDescription()) + L" (gradient)"));
         return matrixInfo;
     }

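Dropping the ElemType requirement works because GetMatrixInfo() is now declared on the non-templated node base class and returns MatrixBase* rather than Matrix<ElemType>*; only the templated override knows the element type. Here is a compilable sketch of that type-erasure pattern under simplified, hypothetical stand-ins (NodeBase, Node<E>, and MatrixBase below are illustrative, not the real CNTK classes):

    #include <iostream>
    #include <memory>
    #include <set>
    #include <string>
    #include <utility>
    #include <vector>

    // Common, non-templated base so callers can handle float and double nodes uniformly.
    struct MatrixBase { virtual ~MatrixBase() = default; };
    template <class E> struct Matrix : MatrixBase { std::vector<E> data; };

    struct NodeBase
    {
        virtual ~NodeBase() = default;
        // Non-templated interface: derived (templated) classes fill it in.
        virtual std::set<std::pair<const MatrixBase*, std::wstring>> GetMatrixInfo() const = 0;
    };

    template <class E>
    struct Node : NodeBase
    {
        std::wstring name;
        Matrix<E> value, gradient;
        bool needsGradient = true;

        std::set<std::pair<const MatrixBase*, std::wstring>> GetMatrixInfo() const override
        {
            std::set<std::pair<const MatrixBase*, std::wstring>> info;
            info.insert({ &value, name });
            if (needsGradient)
                info.insert({ &gradient, name + L" (gradient)" });
            return info;
        }
    };

    int main()
    {
        std::vector<std::unique_ptr<NodeBase>> nodes;
        nodes.push_back(std::make_unique<Node<float>>());
        nodes.push_back(std::make_unique<Node<double>>());

        // One loop, no ElemType dispatch: the caller only ever sees MatrixBase*.
        size_t numMatrices = 0;
        for (const auto& n : nodes)
            numMatrices += n->GetMatrixInfo().size();
        std::cout << "matrices reported: " << numMatrices << "\n";
        return 0;
    }
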
@@ -1868,6 +1871,7 @@ public:
     virtual bool RequiresPreCompute() const override { return false; } // return true if the node's value should be computed before the normal training. e.g., mean and invStd of input features.
     virtual std::string FormatOperationPrototype(const std::string& extraArgs) const override { return ""; }
     virtual void DumpNodeInfo(const bool /*printValues*/, const bool /*printMetadata*/, File& fstream) const override {}
+    virtual std::set<std::pair<const MatrixBase*, std::wstring>> GetMatrixInfo() const override { NOT_IMPLEMENTED; }

 protected: public: // needed in ComputationNetwork::FindInRecurrentLoops(), which really should be part of SEQTraversalFlowControlNode
     std::vector<ComputationNodeBasePtr> m_nestedNodes; // nodes tucked away in this node, in evaluation order

@@ -454,7 +454,7 @@ public:
                 //LogicError("Convolution weight matrix %ls should have dimension [%d, %d] which is [kernelCount, kernelWidth * kernelHeight * inputChannels]",
                 //           Input(0)->NodeName().c_str(), (int)m_convEng->Geometry()->KernelCount(), (int)m_kernelShape.GetNumElements());
                 LogicError("Convolution weight matrix %ls should have dimension [(filter shape) x (input channels) x (output channels)]",
-                           Input(0)->NodeName().c_str(), (int)m_convEng->Geometry()->KernelCount(), (int)m_kernelShape.GetNumElements());
+                           Input(0)->NodeName().c_str());
             }
         }
     }

@@ -1556,6 +1556,8 @@ template class DropoutNode<double>;
 // * scale is a LearnableParameter that stores scale vector (gamma term in the equation above).
 // * bias is a LearnableParameter that stores bias vector (beta term). scale and bias must have the same dimensions which must be equal
 //   to the input dimensions in case of spatial = false or number of output convolution feature maps in case of spatial = true.
+//   BUGBUG: Number of convolution feature maps are considered the last axis of the input.
+//           More correct would be to infer that from broadcasting dimensions (spatial mode is broadcasting).
 // * runMean is the running mean which is used during evaluation phase and might be used during training as well.
 //   It is represented as a LearnableParameter with the same dimensions as scale and bias.
 // * runInvStdDev is the running inverse square root of variance(so InvStdDev = 1 / sqrt(var + epsilon)).

@@ -1825,10 +1827,23 @@ public:

         SetDims(Input(0));

-        // BUGBUG: Parameter dimensions are totally wrong. E.g. a valid spatial bias for [15 x 15 x 32] is currently [32 x 1].
-        // The correct bias shape should be [1 x 1 x 32].
-#if 0 // This does not work.
+        const auto& inputLayout = Input(0)->GetSampleLayout();
+
+        // infer dimensions of learnable parameters
+        // BUGBUG: Parameter dimensions are totally wrong. E.g. a valid spatial bias for [15 x 15 x 32] is currently [32 x 1].
+        // The correct bias shape should be [1 x 1 x 32]. That can be specified but leads to different results for unknown reasons.
+        // Until this has been corrected, we need a workaround that infers the wrong dimensions.
+#if 1 // Workaround for today's definition: Trigger on [0 x 1] and infer that 0 as the total # elements needed.
+        for (size_t i = 1; i < GetNumInputs(); i++)
+        {
+            auto paramLayout = Input(i)->GetSampleLayout();
+            if (paramLayout.GetRank() == 2 && paramLayout[0] == 0 && paramLayout[1] == 1 && inputLayout.GetNumElements() > 0) // [0 x 1]
+            {
+                size_t total = m_spatial ? inputLayout.GetDims().back() : inputLayout.GetNumElements();
+                Input(i)->ValidateInferInputDimsFrom(TensorShape(total, 1));
+            }
+        }
 #else
         // These are here only inferred like for elementwise operations. We must check more.
         ValidateNaryZip(isFinalValidationPass, /*allowBroadcast=*/ true, GetNumInputs());
 #endif

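The workaround only fires on the placeholder shape produced by the BrainScript change above: normShape = (0:1) creates a rank-2 [0 x 1] parameter, and the 0 is then filled in with either the last input axis (spatial) or the full element count (non-spatial). A rough standalone sketch of that rule follows, using a simplified shape type; the names are illustrative, not the actual TensorShape API.

    #include <cstddef>
    #include <iostream>
    #include <vector>

    // Very small stand-in for a tensor shape: just a list of dimensions.
    using Shape = std::vector<size_t>;

    size_t NumElements(const Shape& s)
    {
        size_t n = 1;
        for (size_t d : s) n *= d;
        return n;
    }

    // Decide what a [0 x 1] batch-norm parameter should be inferred as.
    // spatial == true : one value per feature map, i.e. the last input axis.
    // spatial == false: one value per input element.
    Shape InferBatchNormParamShape(const Shape& inputLayout, const Shape& paramLayout, bool spatial)
    {
        bool isPlaceholder = paramLayout.size() == 2 && paramLayout[0] == 0 && paramLayout[1] == 1
                             && NumElements(inputLayout) > 0;
        if (!isPlaceholder)
            return paramLayout; // already specified; leave it alone
        size_t total = spatial ? inputLayout.back() : NumElements(inputLayout);
        return Shape{ total, 1 }; // today's [N x 1] parameterization, kept for consistency
    }

    int main()
    {
        Shape input = { 15, 15, 32 };  // [W x H x C]
        Shape placeholder = { 0, 1 };  // what normShape = (0:1) produces
        Shape spatialParam = InferBatchNormParamShape(input, placeholder, /*spatial=*/true);
        Shape denseParam   = InferBatchNormParamShape(input, placeholder, /*spatial=*/false);
        std::cout << "spatial: [" << spatialParam[0] << " x " << spatialParam[1] << "]\n"; // [32 x 1]
        std::cout << "dense:   [" << denseParam[0]   << " x " << denseParam[1]   << "]\n"; // [7200 x 1]
        return 0;
    }
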
@@ -1836,7 +1851,6 @@ public:
         if (isFinalValidationPass)
         {
             // check inputs
-            auto inputLayout = Input(0)->GetSampleLayout();
             for (size_t i = 1; i < GetNumInputs(); i++)
             {
                 if (Input(i)->HasMBLayout())

@@ -1844,7 +1858,7 @@ public:
                 auto paramLayout = Input(i)->GetSampleLayout();
                 if (paramLayout != Input(1)->GetSampleLayout())
                     InvalidArgument("%ls: Input[%d] has a layout different from Input[1]. All must be identical.", NodeDescription().c_str(), (int)i);
-#if 0 // This does not work. E.g. a valid spatial bias for [15 x 15 x 32] is currently [32 x 1], which is totally wrong.
+#if 0 // BUGBUG: For this to work, parameter shapes must be correct (cf. comment above on inference).
                 if (paramLayout.GetRank() > inputLayout.GetRank())
                     InvalidArgument("%ls: Input[%d] has a tensor rank greated than the data input.", NodeDescription().c_str(), (int)i);
                 for (size_t k = 0; k < paramLayout.size(); k++)

@@ -276,7 +276,7 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
             numNeedsGradient++;
         }
         fprintf(stderr, "\n");
-        LOGPRINTF(stderr, "Training %.0f parameters in %d out of %d parameter tensors and %d nodes with gradient:\n",
+        LOGPRINTF(stderr, "Training %.0f parameters in %d out of %d parameter tensors and %d nodes with gradient:\n\n",
                   (double)numParameters, (int)nodesToUpdateDescriptions.size(), (int)learnableNodes.size(), (int)numNeedsGradient);
         for (let nodeDescription : nodesToUpdateDescriptions)
         {