cleaned up ConvolutionNode vs. image interpretation of TensorShape;

TensorOp() optimization to use SGEMM disabled for 'double' in Debug builds, so we get our code path tested once in a while;
fixed ConvolutionEngineTests.cpp w.r.t. Create();
removed unused IsInputAnImage() methods
Frank Seide 2016-01-01 12:25:24 -08:00
Parent f369a8e94e
Commit 0c79c928ff
8 changed files: 146 additions and 154 deletions

View file

@@ -111,7 +111,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t numImages = parameter.size() > 3 ? ((NDLNode<ElemType>*)params[3])->GetScalar() : 1; // BUGBUG: This comes through MBLayout, and should be forbidden.
ImageLayoutKind imageLayoutKind = ImageLayoutKindFrom(node->GetOptionalParameter("imageLayout", "HWC"));
nodePtr = builder.CreateInputNode(name, ImageLayout(imageWidth, imageHeight, imageChannels, imageLayoutKind), numImages);
nodePtr = builder.CreateInputNode(name, ImageDimensions::AsTensorShape(imageWidth, imageHeight, imageChannels, imageLayoutKind), numImages);
}
}
else if (cnNodeType == L"SparseImageInput")
@@ -129,7 +129,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t numImages = parameter.size() > 3 ? ((NDLNode<ElemType>*)params[3])->GetScalar() : 1;
ImageLayoutKind imageLayoutKind = ImageLayoutKindFrom(node->GetOptionalParameter("imageLayout", "HWC"));
nodePtr = builder.CreateSparseInputNode(name, ImageLayout(imageWidth, imageHeight, imageChannels, imageLayoutKind), numImages);
nodePtr = builder.CreateSparseInputNode(name, ImageDimensions::AsTensorShape(imageWidth, imageHeight, imageChannels, imageLayoutKind), numImages);
}
}
else if (OperationNameOf(LearnableParameter) == cnNodeType)
@@ -325,7 +325,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t img_channels = node->GetOptionalParameter("imageChannels", "0");
bool needGradient = node->GetOptionalParameter("needGradient", "false");
nodePtr = builder.Reshape(NULL, num_rows, ImageLayoutWHC(img_width, img_height, img_channels), name); // BUGBUG: use a tensor descriptor instead
nodePtr = builder.Reshape(NULL, num_rows, ImageDimensions::AsTensorShape(img_width, img_height, img_channels, ImageLayoutKind::HWC/*legacy*/), name); // BUGBUG: use a tensor descriptor instead
nodePtr->SetParameterUpdateRequired(needGradient);
}
}

View file

@@ -249,12 +249,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
const SmallVector<ptrdiff_t> & GetStrides() const { return m_strides; }
// interpretation as an image tensor
size_t GetNumChannels() const { if (m_dims.empty()) return 0; else return m_dims.size() > 0 ? m_dims[0] : 1; }
size_t GetWidth() const { if (m_dims.empty()) return 0; else return m_dims.size() > 1 ? m_dims[1] : 1; }
size_t GetHeight() const { if (m_dims.empty()) return 0; else return m_dims.size() > 2 ? m_dims[2] : 1; }
// heuristics used for pretty-printing
// TODO: This will go away.
bool IsInputAnImage() const { return GetRank() == 3 && (GetWidth() != 1 || GetNumChannels() != 1); }
//size_t GetNumChannels() const { if (m_dims.empty()) return 0; else return m_dims.size() > 0 ? m_dims[0] : 1; }
//size_t GetWidth() const { if (m_dims.empty()) return 0; else return m_dims.size() > 1 ? m_dims[1] : 1; }
//size_t GetHeight() const { if (m_dims.empty()) return 0; else return m_dims.size() > 2 ? m_dims[2] : 1; }
// legacy helper function for RowSliceNode. Will go away.
bool IsVectorStoredAsImage() const { return GetRank() == 3 && m_dims[0] == 1 && m_dims[1] == 1; }
// indexing
@@ -468,19 +466,40 @@ namespace Microsoft { namespace MSR { namespace CNTK {
else if (s == L"HWC" || s == L"legacy") return ImageLayoutKind::HWC;
else InvalidArgument("ImageLayoutKindFrom: Unknown ImageLayoutKind '%ls', must be 'CHW' (cudnn) or 'HWC' (CNTK legacy)", s.c_str());
}
static inline TensorShape ImageLayout(size_t width, size_t height, size_t channels, ImageLayoutKind imageLayoutKind)
{
if (imageLayoutKind == ImageLayoutKind::CHW) return TensorShape(width, height, channels);
else if (imageLayoutKind == ImageLayoutKind::HWC) return TensorShape(channels, width, height);
else LogicError("ImageLayout: Invalid ImageLayoutKind");
}
// When constructing an image tensor with the usual W, H, C format, use the following function instead.
// This will sort the three parameters into the correct order.
// BUGBUG: This only works for ImageLayoutKind::HWC. Also the naming is bad.
static inline TensorShape ImageLayoutWHC(size_t width, size_t height, size_t channels)
// interpret TensorShape as an image descriptor
// considering that we support two ways of storing images
struct ImageDimensions
{
return TensorShape(channels, width, height);
}
size_t m_width, m_height, m_numChannels;
// interpret TensorShape as image
ImageDimensions(const TensorShape & shape, ImageLayoutKind imageLayoutKind)
{
if (shape.GetRank() != 3)
InvalidArgument("Convolution operation currently only supports 1D or 2D convolution on 3D tensors.");
if (imageLayoutKind == ImageLayoutKind::CHW)
{
m_width = shape[0];
m_height = shape[1];
m_numChannels = shape[2];
}
else if (imageLayoutKind == ImageLayoutKind::HWC)
{
m_width = shape[1];
m_height = shape[2];
m_numChannels = shape[0];
}
else LogicError("WHC: Invalid ImageLayoutKind");
}
ImageDimensions(size_t width, size_t height, size_t numChannels) : m_width(width), m_height(height), m_numChannels(numChannels) {}
// interpret image as TensorShape
static TensorShape AsTensorShape(size_t width, size_t height, size_t numChannels, ImageLayoutKind imageLayoutKind/* = ImageLayoutKind::HWC*/)
{
if (imageLayoutKind == ImageLayoutKind::CHW) return TensorShape(width, height, numChannels);
else if (imageLayoutKind == ImageLayoutKind::HWC) return TensorShape(numChannels, width, height);
else LogicError("ImageLayout: Invalid ImageLayoutKind");
}
TensorShape AsTensorShape(ImageLayoutKind imageLayoutKind) { return AsTensorShape(m_width, m_height, m_numChannels, imageLayoutKind); }
};
}}}
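
For orientation, here is a minimal standalone sketch (not the CNTK header; Shape3 stands in for TensorShape) of the dimension ordering that ImageDimensions::AsTensorShape implements: the legacy HWC kind stores an image as [C, W, H], the cudnn CHW kind as [W, H, C].

#include <array>
#include <cassert>
#include <cstddef>

enum class ImageLayoutKind { CHW, HWC };

using Shape3 = std::array<size_t, 3>; // dims in storage order, stand-in for TensorShape

static Shape3 AsTensorShape(size_t w, size_t h, size_t c, ImageLayoutKind kind)
{
    return kind == ImageLayoutKind::CHW ? Shape3{ w, h, c }  // cudnn order
                                        : Shape3{ c, w, h }; // CNTK legacy order
}

int main()
{
    auto legacy = AsTensorShape(224, 224, 3, ImageLayoutKind::HWC); // a 224 x 224 RGB image
    assert(legacy[0] == 3 && legacy[1] == 224 && legacy[2] == 224);
    auto cudnn = AsTensorShape(224, 224, 3, ImageLayoutKind::CHW);
    assert(cudnn[0] == 224 && cudnn[1] == 224 && cudnn[2] == 3);
    return 0;
}

This matches the indexing used elsewhere in the commit, e.g. shape[1] for width and shape[0] for channels under the legacy layout.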

View file

@@ -503,9 +503,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
const char * mbSizeMark = child->m_pMBLayout ? "MBSize " : "";
if (child->m_sampleLayout.GetRank() == 3 && (child->m_sampleLayout.GetWidth() != 1 || child->m_sampleLayout.GetNumChannels() != 1)) // looks like an image: use WHC notation
if (child->m_sampleLayout.GetRank() == 3 && (child->m_sampleLayout[1] != 1 || child->m_sampleLayout[0] != 1)) // looks like an image: use WHC notation
fprintf(stderr, "%ls[%lu {W=%lu, H=%lu, C=%lu}, %s%lu]", child->NodeName().c_str(), child->GetNumRows(),
child->m_sampleLayout.GetWidth(), child->m_sampleLayout.GetHeight(), child->m_sampleLayout.GetNumChannels(), mbSizeMark, child->GetNumCols());
child->m_sampleLayout[1], child->m_sampleLayout[2], child->m_sampleLayout[0], mbSizeMark, child->GetNumCols());
//BUGBUG: This ^^ will print based on the old legacy layout, and we have no way of knowing here whether that is correct.
else if (child->m_sampleLayout.GetRank() > 1) // tensor: output the tensor dimensions --TODO: there will be no numRows in the future, only the tensor
fprintf(stderr, "%ls[%lu [%s], %s%lu]", child->NodeName().c_str(), child->GetNumRows(), string(child->m_sampleLayout).c_str(), mbSizeMark, child->GetNumCols());
else
@@ -540,14 +541,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return !g_shareNodeValueMatrices || m_outputNeededDuringBackprop;
}
// TODO: Remove this.
// used from:
// - Plus/Minus/ElementTimesNode --> replace by max dim over inputs. Make this standard behavior for all binary element-wise ops.
bool IsInputAnImage(const size_t index) const
{
return m_inputs[index]->m_sampleLayout.IsInputAnImage();
}
const size_t GetNumInputs() const { return m_inputs.size(); }
virtual void SetInput(const size_t childIndex, const ComputationNodeBasePtr& node) = 0;
@@ -1528,7 +1521,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
#define UsingComputationNodeMembers /*without OperationName; needed to support inconsistent pattern of InputValue--TODO: This comment is out of date. */ \
protected: \
typedef shared_ptr<ComputationNode<ElemType>> ComputationNodePtr; \
using Base::m_deviceId; using Base::SetDims; using Base::SetDims1; using Base::SetNumCols; using Base::GetNumRows; using Base::GetNumCols; using Base::UpdateFunctionValuesSize; using Base::LoadValue; \
using Base::m_deviceId; using Base::GetDeviceId; using Base::SetDims; using Base::SetDims1; using Base::SetNumCols; using Base::GetNumRows; using Base::GetNumCols; using Base::UpdateFunctionValuesSize; using Base::LoadValue; \
using Base::m_pMBLayout; using Base::GetNumTimeSteps; using Base::GetNumParallelSequences; \
using Base::MaskMissingColumnsToZero; using Base::MaskMissingValueColumnsToZero; using Base::MaskMissingGradientColumnsToZero; using Base::InvalidateMissingValueColumns; using Base::InvalidateMissingGradientColumns; \
using Base::DataFor; using Base::ValueFor; using Base::Gradient; using Base::GradientFor; \
@@ -1547,7 +1540,7 @@ protected: \
using Base::DumpNodeInfo; using Base::EnumerateNodes; \
using Base::HasMBLayout; using Base::GetMBLayout; using Base::LinkToMBLayout; \
using Base::Input; using Base::SetInput; \
using Base::IsInputAnImage; using Base::IsEqualTo; using Base::IsOutputOlderThanInputs; using Base::IsLeaf; using Base::SetParameterUpdateRequired; \
using Base::IsEqualTo; using Base::IsOutputOlderThanInputs; using Base::IsLeaf; using Base::SetParameterUpdateRequired; \
using Base::Load; \
using Base::PrintNodeValuesToFile; using Base::PrintSelfBeforeValidation; \
using Base::Save; using Base::UpdateFunctionMBSize; \

View file

@@ -70,17 +70,18 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_zeroPadding(false), m_maxTempMemSizeInSamples(SIZE_MAX),
m_imageLayoutKind(ImageLayoutKind::HWC)
{
SetDims(ImageLayoutWHC(1, 1, 0), 0); // TODO: what is this magic #channels == 0? Can this even be initialized at this time, or only inferred?
SetDims(ImageDimensions::AsTensorShape(1, 1, 0, m_imageLayoutKind), 0);
}
ConvolutionNode(DEVICEID_TYPE deviceId, const wstring & name, const size_t kernelWidth, const size_t kernelHeight, const size_t outputChannels, const size_t horizontalSubsample, const size_t verticalSubsample,
const bool zeroPadding = false, const size_t maxTempMemSizeInSamples = 0, ImageLayoutKind imageLayoutKind = ImageLayoutKind::HWC) :
Base(deviceId, name),
m_outputChannels(outputChannels),
m_kernelWidth(kernelWidth), m_kernelHeight(kernelHeight),
m_horizontalSubsample(horizontalSubsample), m_verticalSubsample(verticalSubsample),
m_zeroPadding(zeroPadding), m_maxTempMemSizeInSamples(maxTempMemSizeInSamples),
m_imageLayoutKind(imageLayoutKind)
{
SetDims(ImageLayoutWHC(1, 1, outputChannels), 0);
SetDims(ImageDimensions::AsTensorShape(1, 1, m_outputChannels, m_imageLayoutKind), 0); // TODO: necessary?
m_factory = ConvolutionEngineFactory<ElemType>::Create(deviceId, ConvolutionEngineFactory<ElemType>::EngineType::Auto, m_imageLayoutKind);
}
ConvolutionNode(const ScriptableObjects::IConfigRecordPtr configp) :
@@ -97,7 +98,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Base::Save(fstream);
fstream << m_kernelWidth << m_kernelHeight << m_horizontalSubsample << m_verticalSubsample;
uint32_t imageLayoutKind = (uint32_t)m_imageLayoutKind;
uint32_t outputChannels = m_sampleLayout.GetNumChannels();
uint32_t outputChannels = (uint32_t)m_outputChannels;
fstream << imageLayoutKind << outputChannels;
fstream << m_zeroPadding << m_maxTempMemSizeInSamples;
}
@@ -109,7 +110,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
uint32_t imageLayoutKind, outputChannels;
fstream >> imageLayoutKind >> outputChannels;
m_imageLayoutKind = (ImageLayoutKind) imageLayoutKind;
SetDims(ImageLayoutWHC(1, 1, outputChannels), 0); // TODO: Save this separately.
m_outputChannels = outputChannels;
SetDims(ImageDimensions::AsTensorShape(1, 1, m_outputChannels, m_imageLayoutKind), 0); // TODO: needed?
fstream >> m_zeroPadding >> m_maxTempMemSizeInSamples;
m_factory = ConvolutionEngineFactory<ElemType>::Create(GetDeviceId(), ConvolutionEngineFactory<ElemType>::EngineType::Auto, m_imageLayoutKind);
}
@@ -165,20 +167,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return false;
}
// memory allocation goes here, also tensor reshaping
virtual void UpdateFunctionMBSize() override
{
Base::UpdateFunctionMBSize();
// TODO: change tensor dims here (setN())
}
void ForwardProp(const FrameRange & fr) override
{
const Matrix<ElemType>& input0 = Input(0)->Value();
Matrix<ElemType> sliceInput1Value = Input(1)->ValueFor(fr);
Matrix<ElemType> sliceOutputValue = ValueFor(fr);
// REVIEW alexeyk: setting batch size, can it be done elsewhere in a single place? TODO: Yes, in UpdateFunctionMBSize().
// update the tensor dimension w.r.t. number of samples
size_t batchSize = sliceInput1Value.GetNumCols();
m_inT->setN(batchSize);
m_outT->setN(batchSize);
@@ -212,83 +207,60 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Base::Validate(isFinalValidationPass);
InferMBLayoutFromInputsForStandardCase();
// get input tensor shape
auto inputSampleLayout = GetInputSampleLayout(1);
auto sampleLayout = m_sampleLayout;
// get input and output tensor shape and interpret as image dimensions
auto inDims = ImageDimensions(GetInputSampleLayout(1), m_imageLayoutKind);
// the GetWidth() etc. functions below assume legacy format, so convert it if needed
// TODO: The right way is to swap this--use cudnn layout, which is our natural format (W,H,C),
// adapt GetWidth() etc. to that, and eliminate all notions of ImageLayoutWHC(), instead
// switch that for a legacy layout, but only for the purpose of this mangling right here.
if (m_imageLayoutKind == ImageLayoutKind::CHW)
{
inputSampleLayout = ImageLayoutWHC(inputSampleLayout[0], inputSampleLayout[1], inputSampleLayout[2]);
sampleLayout = ImageLayoutWHC(sampleLayout[0], sampleLayout[1], sampleLayout[2]);
}
if (inputSampleLayout.GetWidth() < m_kernelWidth || inputSampleLayout.GetHeight() < m_kernelHeight)
InvalidArgument("inputWidth must >= kernelWidth and inputHeight must >= kernelHeight.");
if (inDims.m_width < m_kernelWidth || inDims.m_height < m_kernelHeight)
InvalidArgument("%ls %ls operation requires that input width be >= kernelWidth and input height >= kernelHeight.", NodeName().c_str(), OperationName().c_str());
// determine output tensor shape
// BUGBUG: For cudnn, tensor is not WHC. How can we propagate this?
// TODO: This is the point where we need to know which tensor dimension refers to W,H,C.
// We should enforce rank and then somehow know which is which. But how? It's an option to the reader. Use the same option? Must be serialized...
// WATCH OUT: Number of channels is tucked away in sampleLayout and must be propagated.
TensorShape outputSampleLayout;
if (m_zeroPadding)
{
const int kernelWidthCenter = m_kernelWidth % 2;
const int kernelHeightCenter = m_kernelHeight % 2;
outputSampleLayout = ImageLayoutWHC(
(inputSampleLayout.GetWidth() - kernelWidthCenter) / m_horizontalSubsample + 1,
(inputSampleLayout.GetHeight() - kernelHeightCenter) / m_verticalSubsample + 1,
sampleLayout.GetNumChannels());
}
else
{
outputSampleLayout = ImageLayoutWHC(
(inputSampleLayout.GetWidth() - m_kernelWidth) / m_horizontalSubsample + 1,
(inputSampleLayout.GetHeight() - m_kernelHeight) / m_verticalSubsample + 1,
sampleLayout.GetNumChannels());
}
const int kernelWidthCenter = m_zeroPadding ? m_kernelWidth % 2 : m_kernelWidth;
const int kernelHeightCenter = m_zeroPadding ? m_kernelHeight % 2 : m_kernelHeight;
auto outDims = ImageDimensions(
(inDims.m_width - kernelWidthCenter) / m_horizontalSubsample + 1,
(inDims.m_height - kernelHeightCenter) / m_verticalSubsample + 1,
m_outputChannels);
size_t weightCols = m_kernelWidth * m_kernelHeight * inputSampleLayout.GetNumChannels();
size_t weightCols = m_kernelWidth * m_kernelHeight * inDims.m_numChannels;
// check/infer input [0] (weights)
if (Input(0)->Value().HasNoElements())
ValidateInferInputDims(0, outputSampleLayout.GetNumChannels(), weightCols);
ValidateInferInputDims(0, m_outputChannels, weightCols);
if (isFinalValidationPass && (Input(0)->GetNumCols() != weightCols || Input(0)->GetNumRows() != outputSampleLayout.GetNumChannels()))
LogicError("convolutionWeight matrix %ls should have dimension [%d, %d] which is [outputChannels, kernelWidth * kernelHeight * inputChannels]", Input(0)->NodeName().c_str(), (int)outputSampleLayout.GetNumChannels(), (int)weightCols);
if (isFinalValidationPass && (Input(0)->GetNumCols() != weightCols || Input(0)->GetNumRows() != m_outputChannels))
LogicError("convolutionWeight matrix %ls should have dimension [%d, %d] which is [outputChannels, kernelWidth * kernelHeight * inputChannels]", Input(0)->NodeName().c_str(), (int)m_outputChannels, (int)weightCols);
size_t inputDim = inputSampleLayout.GetWidth() * inputSampleLayout.GetHeight() * inputSampleLayout.GetNumChannels();
// check/infer input [1] (data)
size_t inputDim = inDims.m_width * inDims.m_height * inDims.m_numChannels;
if (Input(1)->GetNumRows() == 0)
ValidateInferInputDims(1, inputDim, Input(1)->GetNumCols());
if (isFinalValidationPass && Input(1)->GetNumRows() != inputDim)
LogicError("Each column of input to the convolution node %ls is a sample and should have dimension %d, which is inputWidth * inputHeight * inputChannels.", NodeName().c_str(), (int)inputDim);
LogicError("Each column of inDims to the convolution node %ls is a sample and should have dimension %d, which is inputWidth * inputHeight * inputChannels.", NodeName().c_str(), (int)inputDim);
// that's our dimension
SetDims(ImageLayout(outputSampleLayout.GetWidth(), outputSampleLayout.GetHeight(), outputSampleLayout.GetNumChannels(), m_imageLayoutKind), Input(1)->GetNumCols());
SetDims(outDims.AsTensorShape(m_imageLayoutKind), Input(1)->GetNumCols());
// set up the various engines and descriptor objects
// REVIEW alexeyk: is there a better place to create engines?
assert(m_factory);
//if (m_factory == nullptr)
// m_factory = ConvolutionEngineFactory<ElemType>::Create(m_deviceId, ConvolutionEngineFactory<ElemType>::EngineType::Auto, m_imageLayoutKind);
// TODO: This seems to expose too much internal knowledge of the engine to the ConvolutionNode().
// Why not just pass everything to the engine creator, and get one object that holds everything.
if (m_factory == nullptr)
m_factory = ConvolutionEngineFactory<ElemType>::Create(m_deviceId, ConvolutionEngineFactory<ElemType>::EngineType::Auto, m_imageLayoutKind);
if (m_convEng == nullptr)
m_convEng = m_factory->CreateConvEngine(m_deviceId, m_maxTempMemSizeInSamples);
if (m_inT == nullptr)
m_inT = m_factory->CreateTensor(inputSampleLayout.GetWidth(), inputSampleLayout.GetHeight(), inputSampleLayout.GetNumChannels(), 1);
m_inT = m_factory->CreateTensor(inDims.m_width, inDims.m_height, inDims.m_numChannels, 1);
if (m_filterT == nullptr)
m_filterT = m_factory->CreateFilter(m_kernelWidth, m_kernelHeight, inputSampleLayout.GetNumChannels(), sampleLayout.GetNumChannels());
m_filterT = m_factory->CreateFilter(m_kernelWidth, m_kernelHeight, inDims.m_numChannels, m_outputChannels);
if (m_outT == nullptr)
m_outT = m_factory->CreateTensor(sampleLayout.GetWidth(), sampleLayout.GetHeight(), sampleLayout.GetNumChannels(), 1);
m_outT = m_factory->CreateTensor(outDims.m_width, outDims.m_height, outDims.m_numChannels, 1);
if (m_convDesc == nullptr)
m_convDesc = m_factory->CreateConvDescriptor(*m_inT, *m_filterT, m_horizontalSubsample, m_verticalSubsample, m_zeroPadding);
// REVIEW alexeyk: create per-channel (shared) bias. Consider adding other types of biases.
// REVIEW alexeyk: create per-channel bias (shared across all pixels). Consider adding other types of biases.
if (m_biasT == nullptr)
m_biasT = m_factory->CreateTensor(1, 1, sampleLayout.GetNumChannels(), 1);
m_biasT = m_factory->CreateTensor(1, 1, outDims.m_numChannels, 1);
}
void DumpNodeInfo(const bool printValues, File& fstream) const override
@@ -298,11 +270,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
auto inputSampleLayout = GetInputSampleLayout(1);
char str[4096];
sprintf(str, "Input[Width:%lu, Height:%lu, Channels:%lu] \n", inputSampleLayout.GetWidth(), inputSampleLayout.GetHeight(), inputSampleLayout.GetNumChannels());
sprintf(str, "Input[Width:%lu, Height:%lu, Channels:%lu] \n", inputSampleLayout[1], inputSampleLayout[2], inputSampleLayout[0]);
fstream << string(str);
sprintf(str, "Kernel[Width:%lu, Height:%lu] SubSample[Horizontal:%lu, Vertical:%lu]\n", m_kernelWidth, m_kernelHeight, m_horizontalSubsample, m_verticalSubsample);
fstream << string(str);
sprintf(str, "Output[Width:%lu, Height:%lu, Channels:%lu] \n", m_sampleLayout.GetWidth(), m_sampleLayout.GetHeight(), m_sampleLayout.GetNumChannels());
sprintf(str, "Output[Width:%lu, Height:%lu, Channels:%lu] \n", m_sampleLayout[1], m_sampleLayout[2], m_sampleLayout[0]);
fstream << string(str);
sprintf(str, "ZeroPadding=%ls maxTempMemSizeInSamples=%lu\n", m_zeroPadding? L"true" : L"false", m_maxTempMemSizeInSamples);
fstream << string(str);
@@ -328,15 +300,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
private:
std::unique_ptr<ConvolutionEngineFactory<ElemType>> m_factory;
std::unique_ptr<ConvolutionEngine<ElemType>> m_convEng;
std::unique_ptr<ConvolutionTensor4D> m_inT;
std::unique_ptr<ConvolutionFilter> m_filterT;
std::unique_ptr<ConvolutionTensor4D> m_outT;
std::unique_ptr<ConvolutionDescriptor> m_convDesc;
std::unique_ptr<ConvolutionTensor4D> m_biasT;
size_t m_outputChannels;
size_t m_kernelWidth, m_kernelHeight;
size_t m_horizontalSubsample, m_verticalSubsample;
bool m_zeroPadding;
@@ -346,6 +310,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t m_maxTempMemSizeInSamples; // can change during runtime
ImageLayoutKind m_imageLayoutKind; // how to interpret the tensor (which dimensions are X/Y and C)
std::unique_ptr<ConvolutionEngineFactory<ElemType>> m_factory;
std::unique_ptr<ConvolutionEngine<ElemType>> m_convEng;
std::unique_ptr<ConvolutionTensor4D> m_inT;
std::unique_ptr<ConvolutionFilter> m_filterT;
std::unique_ptr<ConvolutionTensor4D> m_outT;
std::unique_ptr<ConvolutionDescriptor> m_convDesc;
std::unique_ptr<ConvolutionTensor4D> m_biasT;
};
template class ConvolutionNode<float>;
@@ -372,7 +345,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_windowWidth(windowWidth), m_windowHeight(windowHeight),
m_horizontalSubsample(horizontalSubsample), m_verticalSubsample(verticalSubsample)
{
m_factory = ConvolutionEngineFactory<ElemType>::Create(deviceId, ConvolutionEngineFactory<ElemType>::EngineType::Auto);
m_factory = ConvolutionEngineFactory<ElemType>::Create(deviceId, ConvolutionEngineFactory<ElemType>::EngineType::Auto/*..., m_imageLayoutKind*/);
}
PoolingNodeBase(const ScriptableObjects::IConfigRecordPtr configp) :
PoolingNodeBase(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"windowWidth"), configp->Get(L"windowHeight"), configp->Get(L"horizontalSubsample"), configp->Get(L"verticalSubsample"))
@@ -445,20 +418,20 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Base::Validate(isFinalValidationPass);
InferMBLayoutFromInputsForStandardCase();
// get input tensor shape
auto inputSampleLayout = GetInputSampleLayout(0);
// get input tensor shape and interpret as image dimensions
const auto m_imageLayoutKind = ImageLayoutKind::HWC; // BUGBUG: Finish this. Must be serialized.
auto inDims = ImageDimensions(GetInputSampleLayout(0), m_imageLayoutKind);
if (inputSampleLayout.GetWidth() < m_windowWidth || inputSampleLayout.GetHeight() < m_windowHeight)
if (inDims.m_width < m_windowWidth || inDims.m_height < m_windowHeight)
InvalidArgument("PoolingNodeBase: inputWidth must >= windowWidth and inputHeight must >= windowHeight.");
// determine output tensor shape
auto outputSampleLayout = ImageLayoutWHC(
(inputSampleLayout.GetWidth() - m_windowWidth) / m_horizontalSubsample + 1,
(inputSampleLayout.GetHeight() - m_windowHeight) / m_verticalSubsample + 1,
inputSampleLayout.GetNumChannels());
auto outDims = ImageDimensions(
(inDims.m_width - m_windowWidth) / m_horizontalSubsample + 1,
(inDims.m_height - m_windowHeight) / m_verticalSubsample + 1,
inDims.m_numChannels);
m_inputSizePerSample = inputSampleLayout.GetWidth() * inputSampleLayout.GetHeight() * inputSampleLayout.GetNumChannels();
//m_outputSizePerSample = outputSampleLayout.GetWidth() * outputSampleLayout.GetHeight() * outputSampleLayout.GetNumChannels();
m_inputSizePerSample = inDims.m_width * inDims.m_height * inDims.m_numChannels;
if (Input(0)->GetNumRows() == 0)
ValidateInferInputDims(0, m_inputSizePerSample, Input(0)->GetNumCols()); // TODO: We should infer a tensor dimension for the input instead.
@@ -466,18 +439,18 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (isFinalValidationPass && Input(0)->GetNumRows() != m_inputSizePerSample) // TODO: Can be removed once tensor shape and numRows are perfectly in sync.
LogicError("each column of input to the MaxPooling node %ls is a sample and should have dimension %d, which is inputWidth * inputHeight * inputChannels", NodeName().c_str(), (int)m_inputSizePerSample);
SetDims(outputSampleLayout, Input(0)->GetNumCols());
SetDims(outDims.AsTensorShape(m_imageLayoutKind), Input(0)->GetNumCols());
// set up various engines and descriptor objects
// REVIEW alexeyk: is there a better place to create engines?
if (m_factory == nullptr)
m_factory = ConvolutionEngineFactory<ElemType>::Create(m_deviceId, ConvolutionEngineFactory<ElemType>::EngineType::Auto);
m_factory = ConvolutionEngineFactory<ElemType>::Create(m_deviceId, ConvolutionEngineFactory<ElemType>::EngineType::Auto, m_imageLayoutKind);
if (m_poolEng == nullptr)
m_poolEng = m_factory->CreatePoolEngine(m_deviceId);
if (m_inT == nullptr)
m_inT = m_factory->CreateTensor(inputSampleLayout.GetWidth(), inputSampleLayout.GetHeight(), inputSampleLayout.GetNumChannels(), 1);
m_inT = m_factory->CreateTensor(inDims.m_width, inDims.m_height, inDims.m_numChannels, 1);
if (m_outT == nullptr)
m_outT = m_factory->CreateTensor(m_sampleLayout.GetWidth(), m_sampleLayout.GetHeight(), m_sampleLayout.GetNumChannels(), 1);
m_outT = m_factory->CreateTensor(m_sampleLayout[1], m_sampleLayout[2], m_sampleLayout[0], 1);
}
void DumpNodeInfo(const bool printValues, File& fstream) const override
@@ -487,11 +460,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
auto inputSampleLayout = GetInputSampleLayout(0);
char str[4096];
sprintf(str, "Input[Width:%lu, Height:%lu, Channels:%lu] \n", inputSampleLayout.GetWidth(), inputSampleLayout.GetHeight(), inputSampleLayout.GetNumChannels());
sprintf(str, "Input[Width:%lu, Height:%lu, Channels:%lu] \n", inputSampleLayout[1], inputSampleLayout[2], inputSampleLayout[0]);
fstream << string(str);
sprintf(str, "PoolingWindow[Width:%lu, Height:%lu] SubSampling[Horizontal:%lu, Vertical:%lu]\n", m_windowWidth, m_windowHeight, m_horizontalSubsample, m_verticalSubsample);
fstream << string(str);
sprintf(str, "Output[Width:%lu, Height:%lu, Channels:%lu] \n", m_sampleLayout.GetWidth(), m_sampleLayout.GetHeight(), m_sampleLayout.GetNumChannels());
sprintf(str, "Output[Width:%lu, Height:%lu, Channels:%lu] \n", m_sampleLayout[1], m_sampleLayout[2], m_sampleLayout[0]);
fstream << string(str);
sprintf(str, "TotalSizePerSample[Input:%lu, Output:%lu] \n", m_inputSizePerSample, m_outputSizePerSample);
fstream << string(str);
@@ -740,18 +713,21 @@ namespace Microsoft { namespace MSR { namespace CNTK {
SetDims(Input(0));
const auto m_imageLayoutKind = ImageLayoutKind::HWC; // BUGBUG: Finish this. Must be serialized.
auto dims = ImageDimensions(GetSampleLayout(), m_imageLayoutKind);
if (m_factory == nullptr)
m_factory = ConvolutionEngineFactory<ElemType>::Create(m_deviceId, ConvolutionEngineFactory<ElemType>::EngineType::Auto);
m_factory = ConvolutionEngineFactory<ElemType>::Create(m_deviceId, ConvolutionEngineFactory<ElemType>::EngineType::Auto, m_imageLayoutKind);
if (m_convEng == nullptr)
m_convEng = m_factory->CreateConvEngine(m_deviceId, 0);
if (m_inT == nullptr)
m_inT = m_factory->CreateTensor(m_sampleLayout.GetWidth(), m_sampleLayout.GetHeight(), m_sampleLayout.GetNumChannels(), 1);
m_inT = m_factory->CreateTensor(dims.m_width, dims.m_height, dims.m_numChannels, 1);
if (m_scaleBiasT == nullptr)
{
if (m_spatial)
m_scaleBiasT = m_factory->CreateTensor(1, 1, m_sampleLayout.GetNumChannels(), 1);
m_scaleBiasT = m_factory->CreateTensor(1, 1, dims.m_numChannels, 1);
else
m_scaleBiasT = m_factory->CreateTensor(m_sampleLayout.GetWidth(), m_sampleLayout.GetHeight(), m_sampleLayout.GetNumChannels(), 1);
m_scaleBiasT = m_factory->CreateTensor(dims.m_width, dims.m_height, dims.m_numChannels, 1);
}
}
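
As a sanity check on the unified padding arithmetic in Validate() above, here is a small self-contained sketch with illustrative numbers (28x28 input, 5x5 kernel, stride 2; not from the commit):

#include <cstddef>
#include <cstdio>

// mirrors: (inDims.m_width - kernelWidthCenter) / m_horizontalSubsample + 1,
// where kernelWidthCenter is kernelWidth % 2 with zero-padding, else kernelWidth
static size_t OutputDim(size_t inDim, size_t kernel, size_t subsample, bool zeroPadding)
{
    const size_t center = zeroPadding ? kernel % 2 : kernel;
    return (inDim - center) / subsample + 1;
}

int main()
{
    std::printf("no padding:   %zu\n", OutputDim(28, 5, 2, false)); // (28 - 5) / 2 + 1 = 12
    std::printf("zero padding: %zu\n", OutputDim(28, 5, 2, true));  // (28 - 1) / 2 + 1 = 14
    return 0;
}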

View file

@@ -279,7 +279,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (!isImage)
Init(configp->Get(L"shape"), isSparse);
else
Init(ImageLayout(configp->Get(L"imageWidth"), configp->Get(L"imageHeight"), configp->Get(L"imageChannels"), ImageLayoutKindFrom(configp->Get(L"imageLayout"))), isSparse);
Init(ImageDimensions::AsTensorShape(configp->Get(L"imageWidth"), configp->Get(L"imageHeight"), configp->Get(L"imageChannels"), ImageLayoutKindFrom(configp->Get(L"imageLayout"))), isSparse);
}
public:

View file

@@ -170,7 +170,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_targetImageLayout(imageLayout)
{ }
ReshapeNode(const ScriptableObjects::IConfigRecordPtr configp) :
ReshapeNode(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"numRows"), ImageLayoutWHC(configp->Get(L"imageWidth"), configp->Get(L"imageHeight"), configp->Get(L"imageChannels")))
ReshapeNode(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"numRows"), ImageDimensions::AsTensorShape(configp->Get(L"imageWidth"), configp->Get(L"imageHeight"), configp->Get(L"imageChannels"), ImageLayoutKind::HWC/*legacy*/))
{
// BUGBUG: We should not operate on image layouts here, but on a proper tensor layout.
AttachInputs(configp, this->GetExpectedNumInputs());
@@ -215,7 +215,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
else
fprintf(stderr, "%ls[%lu, %lu]", child->NodeName().c_str(), child->GetNumRows(), child->GetNumCols());
}
fprintf(stderr, ", NumOfRows=%lu, imageWidth=%lu, imageHeight=%lu, imageChannels=%lu)", m_numTargetRows, m_targetImageLayout.GetWidth(), m_targetImageLayout.GetHeight(), m_targetImageLayout.GetNumChannels());
fprintf(stderr, ", NumOfRows=%lu, imageWidth=%lu, imageHeight=%lu, imageChannels=%lu)", m_numTargetRows, m_targetImageLayout[1], m_targetImageLayout[2], m_targetImageLayout[0]);
// BUGBUG: This interpretation as image dims is only correct for the 'legacy' format, not for cudnn.
}
virtual void /*ComputationNodeBase::*/Validate(bool isFinalValidationPass) override
@@ -248,7 +249,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// setting any dimension to 0 means lose the tensor, flatten to vector
// TODO: We can use 0 to indicate "infer". One value can be 0. It will be filled in to match row dim.
if (m_targetImageLayout.GetWidth() == 0 || m_targetImageLayout.GetHeight() == 0 || m_targetImageLayout.GetNumChannels() == 0)
if (m_targetImageLayout[1] == 0 || m_targetImageLayout[2] == 0 || m_targetImageLayout[0] == 0)
{
if (Input(0)->HasSampleLayout())
fprintf(stderr, "WARNING: Reshape operation cannot inherit image size information from its child. Image size info is lost.\n");
@@ -382,32 +383,32 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// TODO: Say in one sentence what this logic does.
void InferTargetSampleLayout()
{
// BUGBUG: We should not operate on image layouts here, but on a proper tensor layout.
if (m_targetImageLayout.GetWidth() > 0)
// BUGBUG: Below is the result of refactoring and only works for rank-3 tensors. Generalize.
if (m_targetImageLayout[1] > 0)
{
if (m_targetImageLayout.GetHeight() > 0)
if (m_targetImageLayout[2] > 0)
{
if (m_targetImageLayout.GetNumChannels() > 0)
if (m_targetImageLayout[0] > 0)
{
if (m_targetImageLayout.GetNumElements() != m_numTargetRows)
RuntimeError("Image dimensions do not match row size.");
}
else
{
if (m_numTargetRows % (m_targetImageLayout.GetWidth() * m_targetImageLayout.GetHeight()) > 0)
if (m_numTargetRows % (m_targetImageLayout[1] * m_targetImageLayout[2]) > 0)
RuntimeError("Image row size is not a multiple of specified image dimensions.");
else
m_targetImageLayout = ImageLayoutWHC(m_targetImageLayout.GetWidth(), m_targetImageLayout.GetHeight(), m_numTargetRows / (m_targetImageLayout.GetWidth() * m_targetImageLayout.GetHeight()));
m_targetImageLayout = TensorShape(m_numTargetRows / (m_targetImageLayout[1] * m_targetImageLayout[2]), m_targetImageLayout[1], m_targetImageLayout[2]);
}
}
else
{
if (m_targetImageLayout.GetNumChannels() > 0)
if (m_targetImageLayout[0] > 0)
{
if (m_numTargetRows % (m_targetImageLayout.GetWidth() * m_targetImageLayout.GetNumChannels()) > 0)
if (m_numTargetRows % (m_targetImageLayout[1] * m_targetImageLayout[0]) > 0)
RuntimeError("Image row size is not a multiple of specified image dimensions.");
else
m_targetImageLayout = ImageLayoutWHC(m_targetImageLayout.GetWidth(), m_numTargetRows / (m_targetImageLayout.GetWidth() * m_targetImageLayout.GetNumChannels()), m_targetImageLayout.GetNumChannels());
m_targetImageLayout = TensorShape(m_targetImageLayout[0], m_targetImageLayout[1], m_numTargetRows / (m_targetImageLayout[1] * m_targetImageLayout[0]));
}
else
{
@@ -417,22 +418,22 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
else
{
if (m_targetImageLayout.GetHeight() > 0)
if (m_targetImageLayout[2] > 0)
{
if (m_targetImageLayout.GetNumChannels() > 0)
if (m_targetImageLayout[0] > 0)
{
if (m_numTargetRows % (m_targetImageLayout.GetHeight() * m_targetImageLayout.GetNumChannels()) > 0)
if (m_numTargetRows % (m_targetImageLayout[2] * m_targetImageLayout[0]) > 0)
RuntimeError("Image row size is not a multiple of specified image dimensions.");
else
m_targetImageLayout = ImageLayoutWHC(m_numTargetRows / (m_targetImageLayout.GetHeight() * m_targetImageLayout.GetNumChannels()), m_targetImageLayout.GetHeight(), m_targetImageLayout.GetNumChannels());
m_targetImageLayout = TensorShape(m_targetImageLayout[0], m_numTargetRows / (m_targetImageLayout[2] * m_targetImageLayout[0]), m_targetImageLayout[2]);
}
else
RuntimeError("At least two image dimensions must be specified.");
}
else if (m_targetImageLayout.GetNumChannels() > 0)
else if (m_targetImageLayout[0] > 0)
RuntimeError("At least two image dimensions must be specified.");
else
m_targetImageLayout = ImageLayoutWHC(m_numTargetRows, 1, 1);
m_targetImageLayout = TensorShape(1, m_numTargetRows, 1);
}
}
};
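
The inference rule above, for the case where width and height are given and the channel dimension is 0 ("infer"), reduces to C = numTargetRows / (W * H). A minimal sketch with illustrative numbers:

#include <cstddef>
#include <cstdio>
#include <stdexcept>

static size_t InferChannels(size_t numTargetRows, size_t width, size_t height)
{
    if (numTargetRows % (width * height) != 0)
        throw std::runtime_error("Image row size is not a multiple of specified image dimensions.");
    return numTargetRows / (width * height);
}

int main()
{
    // 1200 = 20 * 20 * 3, so a [20 x 20 x ?] target infers 3 channels
    std::printf("channels = %zu\n", InferChannels(1200, 20, 20));
    return 0;
}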

View file

@@ -4473,6 +4473,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// Note: A minor risk is that with this, our own reduction function will rarely be used.
// That function was tested to give the same results with 'double', and nearly the same with 'float' (different summation order matters).
else if (op == ElementWiseOperator::opCopy && // we are just adding to target without any further operation
#ifdef _DEBUG
sizeof(ElemType) == sizeof(float) && // in debug don't shortcut 'double' so we have some test of our own codepath
#endif
regularOpDims.size() == 1 && regularStrides[0][0] == 1 && regularStrides[1][0] == 1 && // we are processing a column
reducingOpDims.size() == 1 && reducingStrides[0][0] >= (ptrdiff_t)regularOpDims[0]) // reducing across columns and no overlap
{
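
The hunk above gates the SGEMM shortcut on sizeof(ElemType) == sizeof(float) in Debug builds, so the hand-written reduction still runs for 'double'. A standalone sketch of that gating pattern (names are illustrative, not CNTK's):

#include <cstdio>

template <typename ElemType>
ElemType Sum(const ElemType* x, int n)
{
    bool useFastPath = true;
#ifdef _DEBUG
    useFastPath = sizeof(ElemType) == sizeof(float); // keep the 'double' fallback exercised in Debug
#endif
    ElemType sum = 0;
    if (useFastPath)
        for (int i = 0; i < n; i++) sum += x[i];  // stands in for the SGEMM-based shortcut
    else
        for (int i = n; i-- > 0; ) sum += x[i];   // stands in for the hand-written reduction
    return sum;
}

int main()
{
    const double x[4] = { 1, 2, 3, 4 };
    std::printf("%g\n", Sum(x, 4)); // 10
    return 0;
}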

View file

@@ -26,7 +26,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { namespace Test
{
try
{
return ConvFact::Create(0, ConvFact::EngineType::CuDnn) != nullptr;
return ConvFact::Create(0, ConvFact::EngineType::CuDnn, ImageLayoutKind::CHW) != nullptr;
}
catch (std::runtime_error)
{
@@ -55,7 +55,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { namespace Test
for (int deviceId : { 0 })
{
auto fact = ConvFact::Create(deviceId);
auto fact = ConvFact::Create(deviceId, ConvFact::EngineType::Auto);
auto tt = typeid(fact).name();
UNUSED(tt);
auto eng = fact->CreateConvEngine(deviceId, 0);
@@ -128,7 +128,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { namespace Test
for (int deviceId : { -1, 0 })
{
auto fact = ConvFact::Create(deviceId);
auto fact = ConvFact::Create(deviceId, ConvFact::EngineType::Auto);
auto eng = fact->CreateConvEngine(deviceId, 0);
auto inT = fact->CreateTensor(inW, inH, cmapIn, n);
auto filtT = fact->CreateFilter(kW, kH, cmapIn, cmapOut);
@@ -175,7 +175,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { namespace Test
for (int deviceId : { 0 })
{
auto fact = ConvFact::Create(deviceId);
auto fact = ConvFact::Create(deviceId, ConvFact::EngineType::Auto);
auto eng = fact->CreateConvEngine(deviceId, 0);
auto srcGradT = fact->CreateTensor(outW, outH, cmapOut, n);
auto filtT = fact->CreateFilter(kW, kH, cmapIn, cmapOut);
@@ -231,7 +231,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { namespace Test
for (int deviceId : { 0 })
{
auto fact = ConvFact::Create(deviceId);
auto fact = ConvFact::Create(deviceId, ConvFact::EngineType::Auto);
auto eng = fact->CreateConvEngine(deviceId, 0);
auto srcGradT = fact->CreateTensor(outW, outH, cmapOut, n);
auto filtT = fact->CreateFilter(kW, kH, cmapIn, cmapOut);
@@ -296,7 +296,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { namespace Test
for (int deviceId : { 0 })
{
auto fact = ConvFact::Create(deviceId);
auto fact = ConvFact::Create(deviceId, ConvFact::EngineType::Auto);
auto eng = fact->CreatePoolEngine(deviceId);
auto inT = fact->CreateTensor(inW, inH, cmap, n);
auto outT = fact->CreateTensor(outW, outH, cmap, n);
@@ -346,7 +346,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { namespace Test
for (int deviceId : { 0 })
{
auto fact = ConvFact::Create(deviceId);
auto fact = ConvFact::Create(deviceId, ConvFact::EngineType::Auto);
auto eng = fact->CreatePoolEngine(deviceId);
auto inT = fact->CreateTensor(inW, inH, cmap, n);
auto outT = fact->CreateTensor(outW, outH, cmap, n);
@@ -406,7 +406,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { namespace Test
for (int deviceId : { 0 })
{
auto fact = ConvFact::Create(deviceId);
auto fact = ConvFact::Create(deviceId, ConvFact::EngineType::Auto);
auto eng = fact->CreatePoolEngine(deviceId);
auto inT = fact->CreateTensor(inW, inH, cmap, n);
auto outT = fact->CreateTensor(outW, outH, cmap, n);
@@ -456,7 +456,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { namespace Test
for (int deviceId : { 0 })
{
auto fact = ConvFact::Create(deviceId);
auto fact = ConvFact::Create(deviceId, ConvFact::EngineType::Auto);
auto eng = fact->CreatePoolEngine(deviceId);
auto inT = fact->CreateTensor(inW, inH, cmap, n);
auto outT = fact->CreateTensor(outW, outH, cmap, n);