implemented dimension inference for Convolution()

This commit is contained in:
Frank Seide 2016-08-09 21:08:28 -07:00
Родитель afb0175f45
Коммит 55673988af
3 изменённых файлов: 148 добавлений и 112 удалений

Просмотреть файл

@ -15,15 +15,6 @@ Train = [
action = "train"
BrainScriptNetworkBuilder = [
ConvReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue) =
W = LearnableParameter(outMap, inWCount, init = "gaussian", initValueScale = wScale)
b = ParameterTensor(1:1:outMap, initValue = bValue)
c = Convolution(W, inp, kW:kH:(inWCount/kW/kH), mapDims=outMap, stride=hStride:vStride:(inWCount/kW/kH), autoPadding = true:true:false)
p = Plus(c, b)
y = RectifiedLinear(p)
imageShape = 32:32:3
labelDim = 10
@ -38,15 +29,40 @@ ConvReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue) =
hStride1 = 1
vStride1 = 1
# weight[cMap1, kW1 * kH1 * ImageC]
conv1_act = ConvReLULayer(featScaled, cMap1, 75, kW1, kH1, hStride1, vStride1, 0.0043, 0)
#conv1_act = ConvReLULayer1(cMap1, 75, kW1, kH1, hStride1, vStride1, 0.0043, 0) (featScaled)
#conv1_act = ConvReLULayer1(featScaled, cMap1, 75, kW1, kH1, hStride1, vStride1, 0.0043, 0)
conv1_act = ConvolutionalLayer {cMap1, (5:5), activation = ReLU, init = "gaussian", initValueScale = 0.0043} (featScaled)
ConvReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue) =
#W = LearnableParameter(outMap, inWCount, init = "gaussian", initValueScale = wScale)
W = LearnableParameter(0, 0, init = "gaussian", initValueScale = wScale)
b = ParameterTensor(1:1:outMap, initValue = bValue)
c = Convolution(W, inp, kW:kH/*:(inWCount/kW/kH)*/, mapDims=outMap, stride=hStride:vStride/*:(inWCount/kW/kH)*/, autoPadding = true/*:true:false*/)
p = Plus(c, b)
y = RectifiedLinear(p)
ConvReLULayer1(outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue) =
#W = LearnableParameter(outMap, inWCount, init = "gaussian", initValueScale = wScale)
W = LearnableParameter(0, 0, init = "gaussian", initValueScale = wScale)
b = ParameterTensor(1:1:outMap, initValue = bValue)
f(inp)= {
c = Convolution(W, inp, kW:kH/*:(inWCount/kW/kH)*/, mapDims=outMap, stride=hStride:vStride/*:(inWCount/kW/kH)*/, autoPadding = true/*:true:false*/)
p = Plus(c, b)
y = RectifiedLinear(p)
# pool1
pool1W = 3
pool1H = 3
pool1hStride = 2
pool1vStride = 2
#pool1W = 3
#pool1H = 3
#pool1hStride = 2
#pool1vStride = 2
#pool1 = MaxPooling(conv1_act, pool1W, pool1H, pool1hStride, pool1vStride)
pool1 = MaxPoolingLayer {(pool1W:pool1H), stride = (pool1hStride:pool1vStride)} (conv1_act)
pool1 = MaxPoolingLayer {(3:3), stride = (2:2)} (conv1_act)
# conv2
kW2 = 5
@ -56,14 +72,15 @@ ConvReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue) =
vStride2 = 1
# weight[cMap2, kW2 * kH2 * cMap1]
conv2_act = ConvReLULayer(pool1, cMap2, 800, kW2, kH2, hStride2, vStride2, 1.414, 0)
#conv2_act = ConvolutionalLayer {cMap2, (5:5), activation = ReLU, init = "gaussian", initValueScale = 1.414} (featScaled)
# pool2
pool2W = 3
pool2H = 3
pool2hStride = 2
pool2vStride = 2
#pool2W = 3
#pool2H = 3
#pool2hStride = 2
#pool2vStride = 2
#pool2 = MaxPooling(conv2_act, pool2W, pool2H, pool2hStride, pool2vStride)
pool2 = MaxPoolingLayer {(pool2W:pool2H), stride = (pool2hStride:pool2vStride)} (conv2_act)
pool2 = MaxPoolingLayer {(3:3), stride = (2:2)} (conv2_act)
# conv3
kW3 = 5
@ -73,47 +90,19 @@ ConvReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue) =
vStride3 = 1
# weight[cMap3, kW3 * kH3 * cMap2]
conv3_act = ConvReLULayer(pool2, cMap3, 800, kW3, kH3, hStride3, vStride3, 1.414, 0)
#conv3_act = ConvolutionalLayer {cMap3, (5:5), activation = ReLU, init = "gaussian", initValueScale = 1.414} (featScaled)
# pool3
pool3W = 3
pool3H = 3
pool3hStride = 2
pool3vStride = 2
#pool3W = 3
#pool3H = 3
#pool3hStride = 2
#pool3vStride = 2
#pool3 = MaxPooling(conv3_act, pool3W, pool3H, pool3hStride, pool3vStride)
pool3 = MaxPoolingLayer {(pool3W:pool3H), stride = (pool3hStride:pool3vStride)} (conv3_act)
pool3 = MaxPoolingLayer {(3:3), stride = (2:2)} (conv3_act)
#_PoolingLayer {poolKind, # "max" or "average"
# filterShape, # e.g. (3:3)
# stride = 1, autoPadding = true,
# lowerPad = 0, upperPad = 0} = # TODO: support this
# f(x) = Pooling (x, poolKind, kernelShape, stride = stride, autoPadding = autoPadding, lowerPad = lowerPad, upperPad = upperPad)
#DNNImageReLULayer(inW, inH, inC, outDim, x, wScale, bValue) =
# W = Parameter(outDim,inW*inH*inC, init = "gaussian", initValueScale = wScale)
# b = LearnableParameter(outDim, 1, initValue = bValue)
# t = Times(W, x)
# z = Plus(t, b)
# y = RectifiedLinear(z)
#h1 = DNNImageReLULayer(3, 3, cMap3, 64, pool3, 12, 0)
h1 = DenseLayer {64, activation = ReLU, init = "gaussian", initValueScale = 12} (pool3)
h1_d = Dropout(h1)
#DNNLastLayer(64, labelDim, x, wScale, bValue) =
# W = LearnableParameter(labelDim, 64, init = "gaussian", initValueScale = wScale)
# b = ParameterTensor(labelDim, initValue = bValue)
# t = Times(W, x)
# z = Plus(t, b)
#z = DNNLastLayer(64, labelDim, h1_d, 1.5, 0)
z = LinearLayer {labelDim, init = "gaussian", initValueScale = 1.5} (h1_d)

Просмотреть файл

@ -60,34 +60,52 @@ EmbeddingLayer {outDim, # dimension of embeddi
# out : [ (shifting dims)] | | (output dim) | (sample dims) ]
ConvolutionalLayer {numOutputChannels, # e.g. (1) or BS.Constants.None
filterShape, # e.g. (3:3)
bias = true,
activation = (x=>x),
init = "uniform",
initValueScale = 1,
#reductionRank = 1, # TODO: support this
stride = 1, autoPadding = true,
#lowerPad = 0, upperPad = 0, # TODO: support this
lowerPad = 0, upperPad = 0,
#transpose = false, # TODO: support this
maxTempMemSizeInSamples = 0} =
reductionRank = 1 # TODO: shall become an optional parameter
outputChannelsShape = Repeat (1, numOutputChannels) # Repeat(1) turns a scalar into a 1-element array
outputChannelsShape = _AsArray (numOutputChannels)
outputRank = Length (outputChannelsShape)
kernelShape = _ConcatArrays (filterShape, Repeat (reductionRank, 0)) # append reduction dims to filter dims
W = ParameterTensor{_ConcatArrays (kernelDims, outputChannelsShape), init=init}
autoPaddingPadded = _ConcatArrays (_ForceResizeArray (Length (kernelDims), autoPadding), Repeat (reductionRank, false)) # set padding flags for reduction dims to false
sharing = false # TODO: support this
f(x) = Convolution (W, x, kernelShape, mapDims = numOutputChannels, stride = stride, sharing = sharing, autoPadding = autoPaddingPadded, lowerPad = lowerPad, upperPad = upperPad, transpose = transpose, maxTempMemSizeInSamples = maxTempMemSizeInSamples)
filterRank = Length (filterShape)
kernelShape = _ConcatArrays (filterShape, Repeat (reductionRank, 0)) # kernel := filter plus reductionDims
W = ParameterTensor{_ConcatArrays ( kernelShape, outputChannelsShape), init = init, initValueScale = initValueScale} # [ W x H x C x K ]
#W = ParameterTensor{(outputChannelsShape:0), init = init, initValueScale = initValueScale} # old-style for backwards-compatible random initialization
b = ParameterTensor(_ConcatArrays (Repeat (Length (filterShape), 1), outputChannelsShape), initValue = 0) # [ 1 x 1 x K ]
#stridePadded =
# if (Length (_AsArray (stride))) == 1 then stride
# else _ConcatArrays (stride, Repeat (reductionRank, 0)) # gets inferred
#FixShapes (vec, val) = # padding vectors must be either length 1 or match kernel dim including reduction dims
# if Length (_AsArray (vec)) == 1 then vec
# else _ConcatArrays (_ForceResizeArray (Length (kernelShape), vec), Repeat (reductionRank, val)) # set padding flags for reduction dims to false
#autoPaddingPadded = FixShapes (autoPadding, false)
#lowerPadPadded = FixShapes (lowerPad, 0)
#upperPadPadded = FixShapes (upperPad, 0)
sharing = true # TODO: support this
transpose = false # TODO: support this
f(x) = {
c = Convolution (W, x, filterShape, mapDims = numOutputChannels, stride = stride, sharing = sharing, autoPadding = autoPadding, lowerPad = lowerPad, upperPad = upperPad, transpose = transpose, maxTempMemSizeInSamples = maxTempMemSizeInSamples)
res = activation (if bias then c + b else c)
# MaxPoolingLayer, AveragePoolingLayer -- create a max- or average-pooling layer
_PoolingLayer {poolKind, # "max" or "average"
filterShape, # e.g. (3:3)
stride = 1, autoPadding = true,
stride = 1, autoPadding = false,
lowerPad = 0, upperPad = 0} = # TODO: support this
f(x) = Pooling (x, poolKind, filterShape, stride = stride, autoPadding = autoPadding, lowerPad = lowerPad, upperPad = upperPad)
MaxPoolingLayer {filterShape, stride = 1, autoPadding = true, lowerPad = 0, upperPad = 0} =
MaxPoolingLayer {filterShape, stride = 1, autoPadding = false, lowerPad = 0, upperPad = 0} =
_PoolingLayer {"max", filterShape, stride = stride, autoPadding = autoPadding, lowerPad = lowerPad, upperPad = upperPad}
AveragePoolingLayer {filterShape, stride = 1, autoPadding = true, lowerPad = 0, upperPad = 0} =
AveragePoolingLayer {filterShape, stride = 1, autoPadding = false, lowerPad = 0, upperPad = 0} =
_PoolingLayer {"average", filterShape, stride = stride, autoPadding = autoPadding, lowerPad = lowerPad, upperPad = upperPad}
# RecurrentLSTMLayer -- create an LSTM layer
@ -424,7 +442,7 @@ ReconcileDynamicAxis(dataInput, layoutInput, tag='') = new ComputationNode [ ope
ReconcileMBLayout = ReconcileDynamicAxis # back compat
CastAs (type, data) = ReconcileDynamicAxis (data, type) # read as CastAs<type>(data) where the cast may consist of rearranging the data w.r.t. MBLayout or broadcasting across sequence items
Convolution(weightNode, inputValueNode, kernelDims, mapDims = 0, stride = 1, sharing = true, autoPadding = true, lowerPad = 0, upperPad = 0, transpose=false, imageLayout='CHW', maxTempMemSizeInSamples = 0, tag='') = new ComputationNode [ operation = 'Convolution' ; inputs = (weightNode : inputValueNode); kernelShape = new TensorShape [ dims = kernelDims ] ; mapCount = new TensorShape [ dims = mapDims ] ; strideShape = new TensorShape [ dims = stride ] ; dimSharing = new BoolVector [ items = sharing ] ; dimPadding = new BoolVector [ items = autoPadding ] ; dimPadLower = new TensorShape [ dims = lowerPad ] ; dimPadUpper = new TensorShape [ dims = upperPad ] /*plus the function args*/ ]
# ND pooling/unpooling
# ND pooling/unpooling --why is autoPadding true? Normally one would want to reduce dimensions, no?
Pooling(input, poolKind/*'max'|'average'*/, kernelDims, stride=1, autoPadding = true, lowerPad = 0, upperPad = 0, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'Pooling' ; inputs = (input); pool = poolKind ; kernelShape = new TensorShape [ dims = kernelDims ] ; strideShape = new TensorShape [ dims = stride ] ; dimPadding = new BoolVector [ items = autoPadding ] ; dimPadLower = new TensorShape [ dims = lowerPad ] ; dimPadUpper = new TensorShape [ dims = upperPad ] /*plus the function args*/ ]
MaxUnpooling(unpoolInput, poolInput, kernelDims, stride=1, autoPadding = true, lowerPad = 0, upperPad = 0, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'MaxUnpooling' ; inputs = (unpoolInput : poolInput); kernelShape = new TensorShape [ dims = kernelDims ] ; strideShape = new TensorShape [ dims = stride ] ; dimPadding = new BoolVector [ items = autoPadding ] ; dimPadLower = new TensorShape [ dims = lowerPad ] ; dimPadUpper = new TensorShape [ dims = upperPad ] /*plus the function args*/ ]
# 2D pooling
@ -826,7 +844,7 @@ RNNs =
# This function also takes an optional auxiliary input, e.g. for supporting attention models.
LSTMBlock (outputDim, cellShape=Constants.None, enableSelfStabilization=false) =
cellDim = if Constants.IsNone (cellShape) then outputDim else cellDim
cellDim = if Constants.IsNone (cellShape) then outputDim else cellShape
// parameter macros
# note: each invocation comes with its own set of weights
B{} = Parameters.BiasParam {cellDim}

Просмотреть файл

@ -25,14 +25,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// Each sample is stored as a column-major matrix (height, width) of float[numChannels] (r00, g00, b00, r10, g10, b10, r01, g01, b01, r11, g11, b11).
// - input : [C x W x H x T] or ARRAY[1..T] OF ARRAY[1..H] OF ARRAY[1..W] OF ARRAY[1..C]
// - output : [C' x W' x H' x T] or ARRAY[1..T] OF ARRAY[1..H'] OF ARRAY[1..W'] OF ARRAY[1..C']
// - filter : [C' x W" x H" x C ] or ARRAY[1..C] OF ARRAY[1..H"] OF ARRAY[1..W"] OF ARRAY[1..C']
// - output : [K x W' x H' x T] or ARRAY[1..T] OF ARRAY[1..H'] OF ARRAY[1..W'] OF ARRAY[1..K]
// - filter : [K x W" x H" x C ] or ARRAY[1..C] OF ARRAY[1..H"] OF ARRAY[1..W"] OF ARRAY[1..K]
// * cudnn ("CHW") mode (works both GPU and CPU): Channels are planes
// - input : [W x H x C x T] or ARRAY[1..T] OF ARRAY[1..C] OF ARRAY[1..H] OF ARRAY[1..W]
// - output : [W' x H' x C' x T] or ARRAY[1..T] OF ARRAY[1..C'] OF ARRAY[1..H'] OF ARRAY[1..W']
// - filter : [W" x H" x C x C' ] or ARRAY[1..C'] OF ARRAY[1..C] OF ARRAY[1..H"] OF ARRAY[1..W"]
// - output : [W' x H' x K x T] or ARRAY[1..T] OF ARRAY[1..K] OF ARRAY[1..H'] OF ARRAY[1..W']
// - filter : [W" x H" x C x K ] or ARRAY[1..K] OF ARRAY[1..C] OF ARRAY[1..H"] OF ARRAY[1..W"]
// where:
// - using ' for output and " for filter
@ -41,7 +41,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// - C = input channels
// - 3 for color images, 1 for B&W images
// - for hidden layer: dimension of activation vector for each pixel
// - C' = output channels = dimension of activation vector for each pixel (also called N by NVidia, inconsistently)
// - K = output channels = dimension of activation vector for each pixel (also called N by NVidia, inconsistently)
// For ND-convolution/pooling only second format ('cudnn') is supported.
@ -149,6 +149,41 @@ public:
size_t MaxTempMemSizeInSamples() const { return m_maxTempMemSizeInSamples; }
PoolKind PoolingKind() const { return m_poolKind; }
// FixVectorShape -- pad 'shape' in place, first up to filterRank, then up to inputRank.
//  - filterRank: rank to reach in the first phase (loop runs while shape.size() < filterRank)
//  - inputRank:  rank to reach in the second phase (loop runs while shape.size() < inputRank)
//  - shape:      the vector being extended; left untouched if it is empty
//  - deflt:      value appended in the second phase for an index that 'from' does not cover
//  - from:       optional source of per-dimension values for the second phase (defaults to an empty V)
template<class V, typename T>
static void FixVectorShape(size_t filterRank, size_t inputRank, V& shape, T deflt, const V& from = V())
if (shape.size() == 0)
return; // let ComputeOutputShape() deal with this special case
// repeat the last value until we have the same rank as the filter
// NOTE(review): the body of this loop is not visible in this view; per the comment above it presumably appends the last element -- confirm
while (shape.size() < filterRank)
// increase to input rank
// If 'from' is given then clone the value from there. This is meant to be the input dimensions for convolution.
while (shape.size() < inputRank)
shape.push_back(shape.size() < from.size() ? from[shape.size()] : deflt); // index being appended is the current size: use from[] if it has that index, else deflt
// FixTensorShape -- TensorShape counterpart of FixVectorShape().
// Takes the dims of 'shape', pads them via FixVectorShape() (passing the dims of 'from' as the value source),
// and assigns a TensorShape built from the padded dims back to 'shape'.
static void FixTensorShape(size_t filterRank, size_t inputRank, TensorShape& shape, size_t deflt, const TensorShape& from = TensorShape())
auto dims = shape.GetDims(); // local copy that gets padded
FixVectorShape(filterRank, inputRank, dims, deflt, from.GetDims());
shape = TensorShape(dims); // write the padded dims back
// infer reduction dimensions if not given
// Pads m_kernelShape, m_stride, m_autoPad, m_lowerPad, m_upperPad and m_sharing up to the rank of 'inputShape'.
//  - inputShape: its size() is the target rank for all of the above
//  - fromShape:  value source for the added kernel and stride dimensions only.
//                In the call sites visible here, convolution passes the input shape and pooling/unpooling pass an empty TensorShape().
void InferReductionDims(const TensorShape& inputShape, const TensorShape& fromShape)
// If kernel has a lower rank than the input then the remaining dimensions are to be reduced over.
size_t filterRank = m_kernelShape.size(); // read before the first call below modifies m_kernelShape
FixTensorShape(filterRank, inputShape.size(), m_kernelShape, 1, fromShape); // convolve over red dim; pool over 1
FixTensorShape(filterRank, inputShape.size(), m_stride, 1, fromShape); // stride for reduction dims is red dim or 1
FixVectorShape(filterRank, inputShape.size(), m_autoPad, false); // no padding for reduction dims
FixTensorShape(filterRank, inputShape.size(), m_lowerPad, 0); // added dims get lower pad 0
FixTensorShape(filterRank, inputShape.size(), m_upperPad, 0); // added dims get upper pad 0
FixVectorShape(filterRank, inputShape.size(), m_sharing, true); // added dims get sharing = true
TensorShape m_kernelShape;
TensorShape m_mapCount;
@ -369,6 +404,8 @@ public:
inputShape = GetInputSampleLayout(inputIdx);
// infer reduction dimensions if not given
InferReductionDims(inputShape, inputShape);
if (!m_transpose)
outputShape = ConvolveGeometry::ComputeOutputShape(inputShape, m_kernelShape, m_mapCount, m_stride,
@ -385,6 +422,25 @@ public:
// ConvolveGeometry always uses CHW.
SetDims(ImageDimensions(outputShape, ImageLayoutKind::CHW).AsTensorShape(m_imageLayout), HasMBLayout());
// update LearnableParameter if it has 0 dimensions (to be inferred)
// Typically this would be the #inputChannels (C).
if (Input(0)->GetSampleLayout().GetNumElements() == 0)
// BUGBUG: Inference does not support sharing. Problem is that we have the information too late.
// In this case, users will have to specify the correct dimensions. Good luck.
#if 1 // old style for back compat with previous results. Randomization will differ.
if (Input(0)->GetSampleLayout().GetRank() == 2)
Input(0)->ValidateInferInputDimsFrom(TensorShape(m_mapCount.GetNumElements(), m_kernelShape.GetNumElements()));
auto weightShape = m_kernelShape.GetDims();
for (auto outDim : m_mapCount.GetDims())
if (isFinalValidationPass)
if (m_convEng == nullptr)
@ -397,10 +453,11 @@ public:
ConvolutionEngineKind::All, NodeName());
if (Input(0)->GetAsMatrixNumCols() != m_kernelShape.GetNumElements() ||
Input(0)->GetAsMatrixNumRows() != m_convEng->Geometry()->KernelCount())
if (Input(0)->GetSampleLayout().GetNumElements() != m_kernelShape.GetNumElements() * m_convEng->Geometry()->KernelCount())
LogicError("Convolution weight matrix %ls should have dimension [%d, %d] which is [kernelCount, kernelWidth * kernelHeight * inputChannels]",
//LogicError("Convolution weight matrix %ls should have dimension [%d, %d] which is [kernelCount, kernelWidth * kernelHeight * inputChannels]",
// Input(0)->NodeName().c_str(), (int)m_convEng->Geometry()->KernelCount(), (int)m_kernelShape.GetNumElements());
LogicError("Convolution weight matrix %ls should have dimension [(filter shape) x (input channels) x (output channels)]",
Input(0)->NodeName().c_str(), (int)m_convEng->Geometry()->KernelCount(), (int)m_kernelShape.GetNumElements());
@ -489,22 +546,6 @@ public:
return m_poolKind == PoolKind::Max;
// add 'reductionDims' dimensions to 'shape', copying from 'from' or 'deflt'
template<class V, typename T>
static void FixVectorShape(size_t reductionDims, V& shape, T deflt)
size_t targetRank = shape.size() + reductionDims;
if (shape.size() < targetRank)
shape.resize(targetRank, deflt);
// else let ComputeOutputShape() deal with the failure
static void FixTensorShape(size_t reductionDims, TensorShape& shape, size_t deflt)
auto dims = shape.GetDims();
FixVectorShape(reductionDims, dims, deflt);
shape = TensorShape(dims);
void Validate(bool isFinalValidationPass) override
@ -519,26 +560,10 @@ public:
"and make sure input data layout is CHW", NodeName().c_str(), OperationName().c_str(), NodeName().c_str());
auto inputShape = GetInputSampleLayout(0);
// make kernel shape etc. look like convolution parameters, i.e. create nominal reduction dimensions
// In older versions, it was expected that pooling takes kernel shapes like convolution,
// which included the reduction dim(s). It makes more sense to not require users to
// include them for pooling, which the padding below accounts for.
if (inputShape.size() > m_kernelShape.size()) // user specified only the pooling-area shape: add the missing dims
size_t reductionDims = inputShape.size() - m_kernelShape.size(); // number of missing dims--these are reduction dims
FixTensorShape(reductionDims, m_kernelShape, 1); // pool over 1 in reduction dimension
if (m_stride.GetRank() != 1)
FixTensorShape(reductionDims, m_stride, 1); // stride for reduction dims is 1
if (m_autoPad.size() != 1)
FixVectorShape(reductionDims, m_autoPad, false); // no padding for reduction dims
if (m_lowerPad.GetRank() != 1)
FixTensorShape(reductionDims, m_lowerPad, 0);
if (m_upperPad.GetRank() != 1)
FixTensorShape(reductionDims, m_upperPad, 0);
if (m_sharing.size() != 1)
FixVectorShape(reductionDims, m_sharing, false); // dummy
const auto& inputShape = GetInputSampleLayout(0);
// infer reduction dimensions if not given
InferReductionDims(inputShape, TensorShape());
auto outDims = ConvolveGeometry::ComputeOutputShape(inputShape, m_kernelShape, m_mapCount, m_stride,
m_sharing, m_autoPad, m_lowerPad, m_upperPad);
@ -634,6 +659,10 @@ public:
auto inputShape = GetInputSampleLayout(0);
// infer reduction dimensions if not given
InferReductionDims(inputShape, TensorShape());
// Same as in case of deconvolution, node input (inputShape) is really the output of the max pooling
// and node output (outDims) is pooling input.
auto outputShape = ConvolveGeometry::ComputeInputShape(inputShape, m_kernelShape, m_mapCount, m_stride,