diff --git a/Examples/Image/Miscellaneous/CIFAR-10/TutorialImage.cntk b/Examples/Image/Miscellaneous/CIFAR-10/TutorialImage.cntk
index 914a24f07..c7f27e367 100644
--- a/Examples/Image/Miscellaneous/CIFAR-10/TutorialImage.cntk
+++ b/Examples/Image/Miscellaneous/CIFAR-10/TutorialImage.cntk
@@ -15,15 +15,6 @@ Train = [
     action = "train"
 
     BrainScriptNetworkBuilder = [
-ConvReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue) =
-[
-    W = LearnableParameter(outMap, inWCount, init = "gaussian", initValueScale = wScale)
-    b = ParameterTensor(1:1:outMap, initValue = bValue)
-    c = Convolution(W, inp, kW:kH:(inWCount/kW/kH), mapDims=outMap, stride=hStride:vStride:(inWCount/kW/kH), autoPadding = true:true:false)
-    p = Plus(c, b)
-    y = RectifiedLinear(p)
-].y
-
         imageShape = 32:32:3
         labelDim = 10
@@ -38,15 +29,40 @@ ConvReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue) =
         hStride1 = 1
         vStride1 = 1
         # weight[cMap1, kW1 * kH1 * ImageC]
-        conv1_act = ConvReLULayer(featScaled, cMap1, 75, kW1, kH1, hStride1, vStride1, 0.0043, 0)
+        #conv1_act = ConvReLULayer1(cMap1, 75, kW1, kH1, hStride1, vStride1, 0.0043, 0) (featScaled)
+        #conv1_act = ConvReLULayer1(featScaled, cMap1, 75, kW1, kH1, hStride1, vStride1, 0.0043, 0)
+        conv1_act = ConvolutionalLayer {cMap1, (5:5), activation = ReLU, init = "gaussian", initValueScale = 0.0043} (featScaled)
+
+ConvReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue) =
+[
+    #W = LearnableParameter(outMap, inWCount, init = "gaussian", initValueScale = wScale)
+    W = LearnableParameter(0, 0, init = "gaussian", initValueScale = wScale)
+    b = ParameterTensor(1:1:outMap, initValue = bValue)
+    c = Convolution(W, inp, kW:kH/*:(inWCount/kW/kH)*/, mapDims=outMap, stride=hStride:vStride/*:(inWCount/kW/kH)*/, autoPadding = true/*:true:false*/)
+    p = Plus(c, b)
+    y = RectifiedLinear(p)
+].y
+ConvReLULayer1(outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue) =
+[
+    #W = LearnableParameter(outMap, inWCount, init = "gaussian", initValueScale = wScale)
+    W = LearnableParameter(0, 0, init = "gaussian", initValueScale = wScale)
+    b = ParameterTensor(1:1:outMap, initValue = bValue)
+    f(inp) = {
+        c = Convolution(W, inp, kW:kH/*:(inWCount/kW/kH)*/, mapDims=outMap, stride=hStride:vStride/*:(inWCount/kW/kH)*/, autoPadding = true/*:true:false*/)
+        p = Plus(c, b)
+        y = RectifiedLinear(p)
+    }.y
+].f
+
+
         # pool1
-        pool1W = 3
-        pool1H = 3
-        pool1hStride = 2
-        pool1vStride = 2
+        #pool1W = 3
+        #pool1H = 3
+        #pool1hStride = 2
+        #pool1vStride = 2
         #pool1 = MaxPooling(conv1_act, pool1W, pool1H, pool1hStride, pool1vStride)
-        pool1 = MaxPoolingLayer {(pool1W:pool1H), stride = (pool1hStride:pool1vStride)} (conv1_act)
+        pool1 = MaxPoolingLayer {(3:3), stride = (2:2)} (conv1_act)
 
         # conv2
         kW2 = 5
@@ -56,14 +72,15 @@ ConvReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue) =
         vStride2 = 1
         # weight[cMap2, kW2 * kH2 * cMap1]
         conv2_act = ConvReLULayer(pool1, cMap2, 800, kW2, kH2, hStride2, vStride2, 1.414, 0)
+        #conv2_act = ConvolutionalLayer {cMap2, (5:5), activation = ReLU, init = "gaussian", initValueScale = 1.414} (featScaled)
 
         # pool2
-        pool2W = 3
-        pool2H = 3
-        pool2hStride = 2
-        pool2vStride = 2
+        #pool2W = 3
+        #pool2H = 3
+        #pool2hStride = 2
+        #pool2vStride = 2
         #pool2 = MaxPooling(conv2_act, pool2W, pool2H, pool2hStride, pool2vStride)
-        pool2 = MaxPoolingLayer {(pool2W:pool2H), stride = (pool2hStride:pool2vStride)} (conv2_act)
+        pool2 = MaxPoolingLayer {(3:3), stride = (2:2)} (conv2_act)
 
         # conv3
         kW3 = 5
@@ -73,47 +90,19 @@ ConvReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue) =
         vStride3 = 1
         # weight[cMap3, kW3 * kH3 * cMap2]
         conv3_act = ConvReLULayer(pool2, cMap3, 800, kW3, kH3, hStride3, vStride3, 1.414, 0)
+        #conv3_act = ConvolutionalLayer {cMap3, (5:5), activation = ReLU, init = "gaussian", initValueScale = 1.414} (featScaled)
 
         # pool3
-        pool3W = 3
-        pool3H = 3
-        pool3hStride = 2
-        pool3vStride = 2
+        #pool3W = 3
+        #pool3H = 3
+        #pool3hStride = 2
+        #pool3vStride = 2
         #pool3 = MaxPooling(conv3_act, pool3W, pool3H, pool3hStride, pool3vStride)
-        pool3 = MaxPoolingLayer {(pool3W:pool3H), stride = (pool3hStride:pool3vStride)} (conv3_act)
+        pool3 = MaxPoolingLayer {(3:3), stride = (2:2)} (conv3_act)
 
-#_PoolingLayer {poolKind,     # "max" or "average"
-#               filterShape,  # e.g. (3:3)
-#               stride = 1, autoPadding = true,
-#               lowerPad = 0, upperPad = 0} = # TODO: support this
-#{
-#    f(x) = Pooling (x, poolKind, kernelShape, stride = stride, autoPadding = autoPadding, lowerPad = lowerPad, upperPad = upperPad)
-#}.f
-
-
-
-#DNNImageReLULayer(inW, inH, inC, outDim, x, wScale, bValue) =
-#[
-#    W = Parameter(outDim,inW*inH*inC, init = "gaussian", initValueScale = wScale)
-#    b = LearnableParameter(outDim, 1, initValue = bValue)
-#    t = Times(W, x)
-#    z = Plus(t, b)
-#    y = RectifiedLinear(z)
-#].y
-
         #h1 = DNNImageReLULayer(3, 3, cMap3, 64, pool3, 12, 0)
         h1 = DenseLayer {64, activation = ReLU, init = "gaussian", initValueScale = 12} (pool3)
         h1_d = Dropout(h1)
 
-#DNNLastLayer(64, labelDim, x, wScale, bValue) =
-#[
-#    W = LearnableParameter(labelDim, 64, init = "gaussian", initValueScale = wScale)
-#    b = ParameterTensor(labelDim, initValue = bValue)
-#    t = Times(W, x)
-#    z = Plus(t, b)
-#].z
-
-
         #z = DNNLastLayer(64, labelDim, h1_d, 1.5, 0)
-
         z = LinearLayer {labelDim, init = "gaussian", initValueScale = 1.5} (h1_d)
 }.z
diff --git a/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs b/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs
index 720bdae58..a542e5b52 100644
--- a/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs
+++ b/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs
@@ -60,34 +60,52 @@ EmbeddingLayer {outDim,               # dimension of embeddi
 #  out : [ (shifting dims)] |   |  (output dim)  |  (sample dims) ]
 ConvolutionalLayer {numOutputChannels,      # e.g. (1) or BS.Constants.None
                     filterShape,            # e.g. (3:3)
+                    bias = true,
+                    activation = (x=>x),
                     init = "uniform",
+                    initValueScale = 1,
                     #reductionRank = 1,     # TODO: support this
                     stride = 1, autoPadding = true,
-                    #lowerPad = 0, upperPad = 0,  # TODO: support this
+                    lowerPad = 0, upperPad = 0,
                     #transpose = false,     # TODO: support this
                     maxTempMemSizeInSamples = 0} =
 {
     reductionRank = 1 # TODO: shall become an optional parameter
-    outputChannelsShape = Repeat (1, numOutputChannels)   # Repeat(1) turns a scalar into a 1-element array
+    outputChannelsShape = _AsArray (numOutputChannels)
     outputRank = Length (outputChannelsShape)
-    kernelShape = _ConcatArrays (filterShape, Repeat (reductionRank, 0)) # append reduction dims to filter dims
-    W = ParameterTensor{_ConcatArrays (kernelDims, outputChannelsShape), init=init}
-    autoPaddingPadded = _ConcatArrays (_ForceResizeArray (Length (kernelDims), autoPadding), Repeat (reductionRank, false)) # set padding flags for reduction dims to false
-    sharing = false # TODO: support this
-    f(x) = Convolution (W, x, kernelShape, mapDims = numOutputChannels, stride = stride, sharing = sharing, autoPadding = autoPaddingPadded, lowerPad = lowerPad, upperPad = upperPad, transpose = transpose, maxTempMemSizeInSamples = maxTempMemSizeInSamples)
+    filterRank = Length (filterShape)
+    kernelShape = _ConcatArrays (filterShape, Repeat (reductionRank, 0)) # kernel := filter plus reductionDims
+    W = ParameterTensor{_ConcatArrays (kernelShape, outputChannelsShape), init = init, initValueScale = initValueScale}  # [ W x H x C x K ]
+    #W = ParameterTensor{(outputChannelsShape:0), init = init, initValueScale = initValueScale}  # old-style for backwards-compatible random initialization
+    b = ParameterTensor(_ConcatArrays (Repeat (Length (filterShape), 1), outputChannelsShape), initValue = 0)  # [ 1 x 1 x K ]
+    #stridePadded =
+    #    if (Length (_AsArray (stride))) == 1 then stride
+    #    else _ConcatArrays (stride, Repeat (reductionRank, 0)) # gets inferred
+    #FixShapes (vec, val) = # padding vectors must be either length 1 or match kernel dim including reduction dims
+    #    if Length (_AsArray (vec)) == 1 then vec
+    #    else _ConcatArrays (_ForceResizeArray (Length (kernelShape), vec), Repeat (reductionRank, val)) # set padding flags for reduction dims to false
+    #autoPaddingPadded = FixShapes (autoPadding, false)
+    #lowerPadPadded    = FixShapes (lowerPad, 0)
+    #upperPadPadded    = FixShapes (upperPad, 0)
+    sharing = true    # TODO: support this
+    transpose = false # TODO: support this
+    f(x) = {
+        c = Convolution (W, x, filterShape, mapDims = numOutputChannels, stride = stride, sharing = sharing, autoPadding = autoPadding, lowerPad = lowerPad, upperPad = upperPad, transpose = transpose, maxTempMemSizeInSamples = maxTempMemSizeInSamples)
+        res = activation (if bias then c + b else c)
+    }.res
 }.f
 
 # MaxPoolingLayer, AveragePoolingLayer -- create a max- or average-pooling layer
 _PoolingLayer {poolKind,     # "max" or "average"
               filterShape,   # e.g. (3:3)
-              stride = 1, autoPadding = true,
+              stride = 1, autoPadding = false,
               lowerPad = 0, upperPad = 0} = # TODO: support this
 {
     f(x) = Pooling (x, poolKind, filterShape, stride = stride, autoPadding = autoPadding, lowerPad = lowerPad, upperPad = upperPad)
 }.f
-MaxPoolingLayer {filterShape, stride = 1, autoPadding = true, lowerPad = 0, upperPad = 0} =
+MaxPoolingLayer {filterShape, stride = 1, autoPadding = false, lowerPad = 0, upperPad = 0} =
     _PoolingLayer {"max", filterShape, stride = stride, autoPadding = autoPadding, lowerPad = lowerPad, upperPad = upperPad}
-AveragePoolingLayer {filterShape, stride = 1, autoPadding = true, lowerPad = 0, upperPad = 0} =
+AveragePoolingLayer {filterShape, stride = 1, autoPadding = false, lowerPad = 0, upperPad = 0} =
     _PoolingLayer {"average", filterShape, stride = stride, autoPadding = autoPadding, lowerPad = lowerPad, upperPad = upperPad}
 
 # RecurrentLSTMLayer -- create an LSTM layer
@@ -424,7 +442,7 @@ ReconcileDynamicAxis(dataInput, layoutInput, tag='') = new ComputationNode [ ope
 ReconcileMBLayout = ReconcileDynamicAxis # back compat
 CastAs (type, data) = ReconcileDynamicAxis (data, type) # read as CastAs(data) where the cast may consist of rearranging the data w.r.t. MBLayout or broadcasting across sequence items
 Convolution(weightNode, inputValueNode, kernelDims, mapDims = 0, stride = 1, sharing = true, autoPadding = true, lowerPad = 0, upperPad = 0, transpose=false, imageLayout='CHW', maxTempMemSizeInSamples = 0, tag='') = new ComputationNode [ operation = 'Convolution' ; inputs = (weightNode : inputValueNode); kernelShape = new TensorShape [ dims = kernelDims ] ; mapCount = new TensorShape [ dims = mapDims ] ; strideShape = new TensorShape [ dims = stride ] ; dimSharing = new BoolVector [ items = sharing ] ; dimPadding = new BoolVector [ items = autoPadding ] ; dimPadLower = new TensorShape [ dims = lowerPad ] ; dimPadUpper = new TensorShape [ dims = upperPad ] /*plus the function args*/ ]
-# ND pooling/unpooling
+# ND pooling/unpooling --why is autoPadding true? Normally one would want to reduce dimensions, no?
 Pooling(input, poolKind/*'max'|'average'*/, kernelDims, stride=1, autoPadding = true, lowerPad = 0, upperPad = 0, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'Pooling' ; inputs = (input); pool = poolKind ; kernelShape = new TensorShape [ dims = kernelDims ] ; strideShape = new TensorShape [ dims = stride ] ; dimPadding = new BoolVector [ items = autoPadding ] ; dimPadLower = new TensorShape [ dims = lowerPad ] ; dimPadUpper = new TensorShape [ dims = upperPad ] /*plus the function args*/ ]
 MaxUnpooling(unpoolInput, poolInput, kernelDims, stride=1, autoPadding = true, lowerPad = 0, upperPad = 0, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'MaxUnpooling' ; inputs = (unpoolInput : poolInput); kernelShape = new TensorShape [ dims = kernelDims ] ; strideShape = new TensorShape [ dims = stride ] ; dimPadding = new BoolVector [ items = autoPadding ] ; dimPadLower = new TensorShape [ dims = lowerPad ] ; dimPadUpper = new TensorShape [ dims = upperPad ] /*plus the function args*/ ]
 # 2D pooling
@@ -826,7 +844,7 @@ RNNs =
     # This function also takes an optional auxiliary input, e.g. for suporting attention models.
     LSTMBlock (outputDim, cellShape=Constants.None, enableSelfStabilization=false) =
     [
-        cellDim = if Constants.IsNone (cellShape) then outputDim else cellDim
+        cellDim = if Constants.IsNone (cellShape) then outputDim else cellShape
         // parameter macros
         # note: each invocation comes with its own set of weights
         B{} = Parameters.BiasParam {cellDim}
diff --git a/Source/ComputationNetworkLib/ConvolutionalNodes.h b/Source/ComputationNetworkLib/ConvolutionalNodes.h
index d50056cdd..5210cae36 100644
--- a/Source/ComputationNetworkLib/ConvolutionalNodes.h
+++ b/Source/ComputationNetworkLib/ConvolutionalNodes.h
@@ -24,15 +24,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 // This follows "high performance convolutional neural networks for document processing" by Kumar Chellapilla, Sidde Puri, and Patrice Simard.
 // Each sample is stored as a column-major matrix (height, width) of float[numChannels] (r00, g00, b00, r10, g10, b10, r01, g01, b01, r11, g11, b11).
 //
-//     - input  : [C  x W  x H  x T]  or ARRAY[1..T] OF ARRAY[1..H]  OF ARRAY[1..W]  OF ARRAY[1..C]
-//     - output : [C' x W' x H' x T]  or ARRAY[1..T] OF ARRAY[1..H'] OF ARRAY[1..W'] OF ARRAY[1..C']
-//     - filter : [C' x W" x H" x C ] or ARRAY[1..C] OF ARRAY[1..H"] OF ARRAY[1..W"] OF ARRAY[1..C']
+//     - input  : [C  x W  x H  x T]  or ARRAY[1..T] OF ARRAY[1..H]  OF ARRAY[1..W]  OF ARRAY[1..C]
+//     - output : [K  x W' x H' x T]  or ARRAY[1..T] OF ARRAY[1..H'] OF ARRAY[1..W'] OF ARRAY[1..K]
+//     - filter : [K  x W" x H" x C ] or ARRAY[1..C] OF ARRAY[1..H"] OF ARRAY[1..W"] OF ARRAY[1..K]
 //
 // * cudnn ("CHW") mode (works both GPU and CPU): Channels are planes
 //
-//     - input  : [W  x H  x C  x T]  or ARRAY[1..T] OF ARRAY[1..C]  OF ARRAY[1..H]  OF ARRAY[1..W]
-//     - output : [W' x H' x C' x T]  or ARRAY[1..T] OF ARRAY[1..C'] OF ARRAY[1..H'] OF ARRAY[1..W']
-//     - filter : [W" x H" x C  x C' ] or ARRAY[1..C'] OF ARRAY[1..C] OF ARRAY[1..H] OF ARRAY[1..W]
+//     - input  : [W  x H  x C  x T]  or ARRAY[1..T] OF ARRAY[1..C]  OF ARRAY[1..H]  OF ARRAY[1..W]
+//     - output : [W' x H' x K  x T]  or ARRAY[1..T] OF ARRAY[1..K]  OF ARRAY[1..H'] OF ARRAY[1..W']
+//     - filter : [W" x H" x C  x K ] or ARRAY[1..K] OF ARRAY[1..C] OF ARRAY[1..H] OF ARRAY[1..W]
 //
 // where:
 //  - using ' for output and " for filter
@@ -41,7 +41,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 //  - C = input channels
 //      - 3 for color images, 1 for B&W images
 //      - for hidden layer: dimension of activation vector for each pixel
-//  - C' = output channels = dimension of activation vector for each pixel (also called N by NVidia, inconsistently)
+//  - K = output channels = dimension of activation vector for each pixel (also called N by NVidia, inconsistently)
 //
 // For ND-convolution/pooling only second format ('cudnn') is supported.
 //
@@ -149,6 +149,41 @@ public:
     size_t MaxTempMemSizeInSamples() const { return m_maxTempMemSizeInSamples; }
     PoolKind PoolingKind() const { return m_poolKind; }
 
+private:
+    // bottomlessly expand shape to filterRank, then expand to inputRank using defaults or given 'from' values
+    template <class V, class T>
+    static void FixVectorShape(size_t filterRank, size_t inputRank, V& shape, T deflt, const V& from = V())
+    {
+        if (shape.size() == 0)
+            return; // let ComputeOutputShape() deal with this special case
+        // repeat the last value until we have the same rank as the filter
+        while (shape.size() < filterRank)
+            shape.push_back(shape.back());
+        // increase to input rank
+        // If 'from' is given then clone the value from there. This is meant to be the input dimensions for convolution.
+        while (shape.size() < inputRank)
+            shape.push_back(shape.size() < from.size() ? from[shape.size()] : deflt);
+    }
+    static void FixTensorShape(size_t filterRank, size_t inputRank, TensorShape& shape, size_t deflt, const TensorShape& from = TensorShape())
+    {
+        auto dims = shape.GetDims();
+        FixVectorShape(filterRank, inputRank, dims, deflt, from.GetDims());
+        shape = TensorShape(dims);
+    }
+protected:
+    // infer reduction dimensions if not given
+    void InferReductionDims(const TensorShape& inputShape, const TensorShape& fromShape)
+    {
+        // If kernel has a lower rank than the input then the remaining dimensions are to be reduced over.
+        size_t filterRank = m_kernelShape.size();
+        FixTensorShape(filterRank, inputShape.size(), m_kernelShape, 1, fromShape); // convolve over red dim; pool over 1
+        FixTensorShape(filterRank, inputShape.size(), m_stride,      1, fromShape); // stride for reduction dims is red dim or 1
+        FixVectorShape(filterRank, inputShape.size(), m_autoPad,     false);        // no padding for reduction dims
+        FixTensorShape(filterRank, inputShape.size(), m_lowerPad,    0);
+        FixTensorShape(filterRank, inputShape.size(), m_upperPad,    0);
+        FixVectorShape(filterRank, inputShape.size(), m_sharing,     true);
+    }
+
 protected:
     TensorShape m_kernelShape;
     TensorShape m_mapCount;
@@ -369,6 +404,8 @@ public:
         else
         {
             inputShape = GetInputSampleLayout(inputIdx);
+            // infer reduction dimensions if not given
+            InferReductionDims(inputShape, inputShape);
             if (!m_transpose)
             {
                 outputShape = ConvolveGeometry::ComputeOutputShape(inputShape, m_kernelShape, m_mapCount, m_stride,
@@ -385,6 +422,25 @@ public:
         // ConvolveGeometry always uses CHW.
         SetDims(ImageDimensions(outputShape, ImageLayoutKind::CHW).AsTensorShape(m_imageLayout), HasMBLayout());
 
+        // update LearnableParameter if it has 0 dimensions (to be inferred)
+        // Typically this would be the #inputChannels (C).
+        if (Input(0)->GetSampleLayout().GetNumElements() == 0)
+        {
+            // BUGBUG: Inference does not support sharing. Problem is that we have the information too late.
+            //         In this case, users will have to specify the correct dimensions. Good luck.
+#if 1       // old style for back compat with previous results. Randomization will differ.
+            if (Input(0)->GetSampleLayout().GetRank() == 2)
+                Input(0)->ValidateInferInputDimsFrom(TensorShape(m_mapCount.GetNumElements(), m_kernelShape.GetNumElements()));
+            else
+#endif
+            {
+                auto weightShape = m_kernelShape.GetDims();
+                for (auto outDim : m_mapCount.GetDims())
+                    weightShape.push_back(outDim);
+                Input(0)->ValidateInferInputDimsFrom(TensorShape(weightShape));
+            }
+        }
+
         if (isFinalValidationPass)
         {
             if (m_convEng == nullptr)
@@ -397,10 +453,11 @@ public:
                                                 ConvolutionEngineKind::All, NodeName());
             }
-            if (Input(0)->GetAsMatrixNumCols() != m_kernelShape.GetNumElements() ||
-                Input(0)->GetAsMatrixNumRows() != m_convEng->Geometry()->KernelCount())
+            if (Input(0)->GetSampleLayout().GetNumElements() != m_kernelShape.GetNumElements() * m_convEng->Geometry()->KernelCount())
             {
-                LogicError("Convolution weight matrix %ls should have dimension [%d, %d] which is [kernelCount, kernelWidth * kernelHeight * inputChannels]",
+                //LogicError("Convolution weight matrix %ls should have dimension [%d, %d] which is [kernelCount, kernelWidth * kernelHeight * inputChannels]",
+                //           Input(0)->NodeName().c_str(), (int)m_convEng->Geometry()->KernelCount(), (int)m_kernelShape.GetNumElements());
+                LogicError("Convolution weight matrix %ls should have dimension [(filter shape) x (input channels) x (output channels)]",
                            Input(0)->NodeName().c_str(), (int)m_convEng->Geometry()->KernelCount(), (int)m_kernelShape.GetNumElements());
             }
         }
@@ -489,22 +546,6 @@ public:
         return m_poolKind == PoolKind::Max;
     }
 
-private:
-    // add 'reductionDims' dimensions to 'shape', copying from 'from' or 'deflt'
-    template <class V, class T>
-    static void FixVectorShape(size_t reductionDims, V& shape, T deflt)
-    {
-        size_t targetRank = shape.size() + reductionDims;
-        if (shape.size() < targetRank)
-            shape.resize(targetRank, deflt);
-        // else let ComputeOutputShape() deal with the failure
-    }
-    static void FixTensorShape(size_t reductionDims, TensorShape& shape, size_t deflt)
-    {
-        auto dims = shape.GetDims();
-        FixVectorShape(reductionDims, dims, deflt);
-        shape = TensorShape(dims);
-    }
 public:
     void Validate(bool isFinalValidationPass) override
     {
@@ -519,26 +560,10 @@ public:
                        "and make sure input data layout is CHW", NodeName().c_str(), OperationName().c_str(), NodeName().c_str());
         }
-        auto inputShape = GetInputSampleLayout(0);
-        // make kernel shape etc. look like convolution parameters, i.e. create nominal reduction dimensions
-        // In older versions, it was expected that pooling takes kernel shapes like convolution,
-        // which included the reduction dim(s). It makes more sense to not require users to
-        // include them for pooing, which the padding below accounts for.
-        if (inputShape.size() > m_kernelShape.size()) // user specified only the pooling-area shape: add the missing dims
-        {
-            size_t reductionDims = inputShape.size() - m_kernelShape.size(); // number of missing dims--these are reduction dims
-            FixTensorShape(reductionDims, m_kernelShape, 1); // pool over 1 in reduction dimension
-            if (m_stride.GetRank() != 1)
-                FixTensorShape(reductionDims, m_stride, 1); // stride for reduction dims is 1
-            if (m_autoPad.size() != 1)
-                FixVectorShape(reductionDims, m_autoPad, false); // no padding for reduction dims
-            if (m_lowerPad.GetRank() != 1)
-                FixTensorShape(reductionDims, m_lowerPad, 0);
-            if (m_upperPad.GetRank() != 1)
-                FixTensorShape(reductionDims, m_upperPad, 0);
-            if (m_sharing.size() != 1)
-                FixVectorShape(reductionDims, m_sharing, false); // dummy
-        }
+        const auto& inputShape = GetInputSampleLayout(0);
+
+        // infer reduction dimensions if not given
+        InferReductionDims(inputShape, TensorShape());
 
         auto outDims = ConvolveGeometry::ComputeOutputShape(inputShape, m_kernelShape, m_mapCount, m_stride,
                                                             m_sharing, m_autoPad, m_lowerPad, m_upperPad);
@@ -634,6 +659,10 @@ public:
         }
 
         auto inputShape = GetInputSampleLayout(0);
+
+        // infer reduction dimensions if not given
+        InferReductionDims(inputShape, TensorShape());
+
         // Same as in case of deconvolution, node input (inputShape) is really the output of the max pooling
         // and node output (outDims) is pooling input.
         auto outputShape = ConvolveGeometry::ComputeInputShape(inputShape, m_kernelShape, m_mapCount, m_stride,
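
Usage sketch (illustrative only, not part of the patch): with these changes the reduction (input-channel) dimension no longer has to be spelled out by the user. ConvolutionalLayer passes only the filter shape to Convolution(), InferReductionDims() pads the kernel/stride/padding/sharing shapes up to the input rank during validation, and a LearnableParameter declared with 0 dimensions has its shape inferred as [(filter shape) x (input channels) x (output channels)]. A minimal BrainScript fragment in the style of the TutorialImage.cntk hunk above; cMap1, labelDim, and featScaled are assumed to be defined elsewhere in that config:

    conv1_act = ConvolutionalLayer {cMap1, (5:5), activation = ReLU, init = "gaussian", initValueScale = 0.0043} (featScaled)
    pool1     = MaxPoolingLayer {(3:3), stride = (2:2)} (conv1_act)   # pooling layers now default to autoPadding = false
    h1        = DenseLayer {64, activation = ReLU, init = "gaussian", initValueScale = 12} (pool1)
    z         = LinearLayer {labelDim, init = "gaussian", initValueScale = 1.5} (Dropout(h1))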