commit 6c2ee1aa51

Merge branch 'master' into qiwye/multiverso

Conflicts:
	Source/ComputationNetworkLib/ComputationNetworkLib.vcxproj
	Source/SGDLib/SGDLib.vcxproj
	Source/SGDLib/SGDLib.vcxproj.filters
Makefile
@@ -240,6 +240,7 @@ MATH_SRC =\
 ifdef CUDA_PATH
 MATH_SRC +=\
 	$(SOURCEDIR)/Math/GPUMatrix.cu \
+	$(SOURCEDIR)/Math/GPUTensor.cu \
 	$(SOURCEDIR)/Math/GPUSparseMatrix.cu \
 	$(SOURCEDIR)/Math/GPUWatcher.cu \
 	$(SOURCEDIR)/Math/MatrixQuantizerGPU.cu \
@@ -35,27 +35,32 @@ using namespace std;
 ;

 wstring computationNodes = // TODO: use actual TypeName() here? would first need to make it a wide string; we should also extract those two methods into the base macro
-L"LearnableParameter(rows, cols, needGradient = true, init = 'uniform'/*|fixedValue|gaussian|fromFile*/, initValueScale = 1, value = 0, initFromFilePath = '', initOnCPUOnly=true, randomSeed=-1, tag='') = new ComputationNode [ operation = 'LearnableParameter' /*plus the function args*/ ]\n"
+L"LearnableParameter(rows, cols, needGradient = true, init = 'uniform'/*|fixedValue|gaussian|fromFile*/, initValueScale = 1, value = 0, initFromFilePath = '', initOnCPUOnly=true, randomSeed=-1, tag='') = new ComputationNode [ operation = 'LearnableParameter' ; shape = new TensorShape [ dims = (rows : cols) ] /*plus the function args*/ ]\n"
 L"Parameter = LearnableParameter // deprecated \n"
+L"ParameterTensor(dims, needGradient = true, init = 'uniform'/*|fixedValue|gaussian|fromFile*/, initValueScale = 1, value = 0, initFromFilePath = '', initOnCPUOnly=true, randomSeed=-1, tag='') = new ComputationNode [ operation = 'LearnableParameter' ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ]\n"
+// ^^ already works; vv untested
-L"Input(rows, cols, tag='feature') = new ComputationNode [ operation = 'InputValue' ; isImage = false /*plus the function args*/ ]\n" // note: naming a little inconsistent // TODO: re-test after flag change
-L"SparseInput(rows, cols, tag='feature') = new ComputationNode [ operation = 'SparseInputValue' ; isImage = false /*plus the function args*/ ]\n"
-L"ImageInput(imageWidth, imageHeight, imageChannels, numImages, tag='feature') = new ComputationNode [ operation = 'InputValue' ; isImage = true /*plus the function args*/ ]\n"
-L"SparseImageInput(imageWidth, imageHeight, imageChannels, numImages, tag='feature') = new ComputationNode [ operation = 'SparseInputValue' ; isImage = true /*plus the function args*/ ]\n"
+L"Input(dims, tag='feature') = new ComputationNode [ operation = 'InputValue' ; shape = new TensorShape [ /*dims*/ ] ; isImage = false /*plus the function args*/ ]\n" // note: naming a little inconsistent // TODO: re-test after flag change
+L"SparseInput(dims, tag='feature') = new ComputationNode [ operation = 'SparseInputValue' ; shape = new TensorShape [ /*dims*/ ] ; isImage = false /*plus the function args*/ ]\n"
+L"ImageInput(imageWidth, imageHeight, imageChannels, imageLayout='CHW', tag='feature') = new ComputationNode [ operation = 'InputValue' ; isImage = true /*plus the function args*/ ]\n"
+L"SparseImageInput(imageWidth, imageHeight, imageChannels, imageLayout='CHW', tag='feature') = new ComputationNode [ operation = 'SparseInputValue' ; isImage = true /*plus the function args*/ ]\n"
 L"Constant(val, rows = 1, cols = 1, tag='') = Parameter(rows, cols, needGradient = false, init = 'fixedValue', value = val) \n"
-L"PastValue(rows, cols, input, timeStep = 1, defaultHiddenActivation = 0.1, tag='') = new ComputationNode [ operation = 'PastValue' ; inputs = input /*plus the function args*/ ]\n"
-L"FutureValue(rows, cols, input, timeStep = 1, defaultHiddenActivation = 0.1, tag='') = new ComputationNode [ operation = 'FutureValue' ; inputs = input /*plus the function args*/ ]\n"
+L"PastValue(dims, input, timeStep = 1, defaultHiddenActivation = 0.1, tag='') = new ComputationNode [ operation = 'PastValue' ; inputs = input ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ]\n"
+L"FutureValue(dims, input, timeStep = 1, defaultHiddenActivation = 0.1, tag='') = new ComputationNode [ operation = 'FutureValue' ; inputs = input ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ]\n"
+// TODO: ^^ DelayedValues no longer need to know their dimension. That is inferred in Validation.
 L"RowSlice(startIndex, numRows, input, needGradient = false, tag='') = new ComputationNode [ operation = 'RowSlice' ; inputs = input /*plus the function args*/ ]\n"
 L"RowRepeat(input, numRepeats, needGradient = false, tag='') = new ComputationNode [ operation = 'RowRepeat' ; inputs = input /*plus the function args*/ ]\n"
 L"RowStack(inputs, tag='') = new ComputationNode [ operation = 'RowStack' /*plus the function args*/ ]\n"
-L"Reshape(input, numRows, imageWidth = 0, imageHeight = 0, imageChannels = 0, tag='') = new ComputationNode [ operation = 'Reshape' ; inputs = input /*plus the function args*/ ]\n"
+L"Reshape(input, numRows, imageWidth = 0, imageHeight = 0, imageChannels = 0, tag='') = new ComputationNode [ operation = 'DeprecatedReshape' ; inputs = input /*plus the function args*/ ]\n"
+L"NewReshape(input, dims, beginDim=0, endDim=0, tag='') = new ComputationNode [ operation = 'Reshape' ; inputs = input ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ]\n"
+L"ReshapeDimension(x, dim, tensorShape) = NewReshape(x, tensorShape, beginDim=dim, endDim=dim + 1) \n"
+L"FlattenDimensions(x, dim, num) = NewReshape(x, 0, beginDim=dim, endDim=dim + num) \n"
+L"SplitDimension(x, dim, N) = ReshapeDimension(x, dim, 0:N) \n"
 L"Logistic(label, probability, tag='') = new ComputationNode [ operation = 'Logistic' ; inputs = (label : probability) /*plus the function args*/ ]\n"
 L"WeightedLogistic(label, probability, instanceWeight, tag='') = new ComputationNode [ operation = 'Logistic' ; inputs = (label : probability : instanceWeight) /*plus the function args*/ ]\n"
 L"ReconcileMBLayout(dataInput, layoutInput, tag='') = new ComputationNode [ operation = 'ReconcileMBLayout' ; inputs = (dataInput : layoutInput) /*plus the function args*/ ]\n"
-L"Convolution(weightNode, inputValueNode, kernelWidth, kernelHeight, outputChannels, horizontalSubsample, verticalSubsample, zeroPadding = false, maxTempMemSizeInSamples = 0, tag='') = new ComputationNode [ operation = 'Convolution' ; inputs = (weightNode : inputValueNode) /*plus the function args*/ ]\n"
-L"MaxPooling(input, windowWidth, windowHeight, horizontalSubsample, verticalSubsample, tag='') = new ComputationNode [ operation = 'MaxPooling' ; inputs = input /*plus the function args*/ ]\n"
-L"AveragePooling(input, windowWidth, windowHeight, horizontalSubsample, verticalSubsample, tag='') = new ComputationNode [ operation = 'AveragePoolingNode' ; inputs = input /*plus the function args*/ ]\n"
+L"Convolution(weightNode, inputValueNode, kernelWidth, kernelHeight, outputChannels, horizontalSubsample, verticalSubsample, zeroPadding = false, maxTempMemSizeInSamples = 0, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'Convolution' ; inputs = (weightNode : inputValueNode) /*plus the function args*/ ]\n"
+L"MaxPooling(input, windowWidth, windowHeight, horizontalSubsample, verticalSubsample, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'MaxPooling' ; inputs = input /*plus the function args*/ ]\n"
+L"AveragePooling(input, windowWidth, windowHeight, horizontalSubsample, verticalSubsample, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'AveragePooling' ; inputs = input /*plus the function args*/ ]\n"
 // TODO: define DelayedValue, with negative delay for future; cannot do this yet, need to be able to say something like delay = -(^.delay)
 // aliases
 L"ColumnwiseCrossProduct = KhatriRaoProduct // deprecated \n" // TODO: should it be deprecated? It is described as easier to understand in the CNTKBook.
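For illustration, a hedged BrainScript sketch of what the dims-based definitions above make expressible; the dimensions and variable names here are invented for the example, not taken from the commit:

    # sketch only: uses the functions declared in the hunk above
    W = ParameterTensor(128:256, init = 'gaussian')   # tensor-shaped learnable parameter
    x = Input(256)                                    # input declared by dims instead of rows/cols
    h = PastValue(256, x, timeStep = 1)               # delay node; shape now carried by TensorShape
    z = NewReshape(x, 16:16)                          # reshape to an arbitrary tensor shape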
@@ -903,12 +903,12 @@ void DoTrain(const ConfigRecordType & config)
 };
 }
 // legacy test mode for BrainScript. Will go away once we fully integrate with BS.
-else if (config.Exists(L"ExperimentalNetworkBuilder"))
+else if (config.Exists(L"BrainScriptNetworkBuilder") || config.Exists(L"ExperimentalNetworkBuilder"/*legacy*/))
 {
 // We interface with outer old CNTK config by taking the inner part, which we get as a string, as BrainScript.
 // We prepend a few standard definitions, and also definition of deviceId and precision, which all objects will pull out again when they are being constructed.
 // BUGBUG: We are not getting TextLocations right in this way! Do we need to inject location markers into the source? Moot once we fully switch to BS
-wstring sourceCode = config(L"ExperimentalNetworkBuilder");
+wstring sourceCode = config.Exists(L"BrainScriptNetworkBuilder") ? config(L"BrainScriptNetworkBuilder") : config(L"ExperimentalNetworkBuilder");
 let expr = BS::ParseConfigDictFromString(standardFunctions + computationNodes + commonMacros
 + msra::strfun::wstrprintf(L"deviceId = %d ; precision = '%ls' ; network = new ComputationNetwork ", (int)deviceId, ElemTypeName<ElemType>()) // TODO: check if typeid needs postprocessing
 + sourceCode, vector<wstring>()); // source code has the form [ ... ]
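For context, a hedged sketch of the outer CNTK config that this change now accepts; the key name comes from the hunk above, while the body is a placeholder rather than a working network:

    # sketch only: the new key is preferred; ExperimentalNetworkBuilder remains as a legacy alias
    BrainScriptNetworkBuilder = [
        # BrainScript source; DoTrain prepends deviceId, precision, and 'network = new ComputationNetwork'
        ...
    ]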
@@ -158,7 +158,7 @@
 <ClInclude Include="..\Common\Include\Basics.h" />
 <ClInclude Include="..\Common\Include\BestGpu.h" />
 <ClInclude Include="..\Common\Include\DataReader.h" />
-<ClInclude Include="..\Common\Include\DataTensor.h" />
+<ClInclude Include="..\Common\Include\TensorShape.h" />
 <ClInclude Include="..\Common\Include\DataWriter.h" />
 <ClInclude Include="..\Common\Include\File.h" />
 <ClInclude Include="..\Common\Include\fileutil.h" />
@@ -133,7 +133,7 @@
 <ClInclude Include="..\Common\Include\Sequences.h">
   <Filter>Common\Include</Filter>
 </ClInclude>
-<ClInclude Include="..\Common\Include\DataTensor.h">
+<ClInclude Include="..\Common\Include\TensorShape.h">
   <Filter>Common\Include</Filter>
 </ClInclude>
 <ClInclude Include="..\Common\Include\ProgressTracing.h">
@@ -154,6 +154,8 @@ bool CheckFunction(std::string& p_nodeType, bool* allowUndeterminedVariable)
     ret = true;
 else if (EqualInsensitive(nodeType, OperationNameOf(LearnableParameter), L"Parameter"))
     ret = true;
+else if (EqualInsensitive(nodeType, L"ImageParameter"))
+    ret = true;
 //else if (EqualInsensitive(nodeType, OperationNameOf(SparseLearnableParameter), L"SparseParameter"))
 //    ret = true;
 else if (EqualInsensitive(nodeType, L"Constant", L"Const"))
@@ -30,29 +30,29 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 case SIMPLENET:
     net = BuildSimpleDNN(); break;
 case SIMPLERNN:
-    net = BuildSimpleRNN(1); break;
+    net = BuildSimpleRNN(); break;
 case LSTM:
-    net = BuildLSTMNetworkFromDescription(1); break;
+    net = BuildLSTMNetworkFromDescription(); break;
 case CLASSLSTM:
-    net = BuildCLASSLSTMNetworkFromDescription(1); break;
+    net = BuildCLASSLSTMNetworkFromDescription(); break;
 case NCELSTM:
-    net = BuildNCELSTMNetworkFromDescription(1); break;
+    net = BuildNCELSTMNetworkFromDescription(); break;
 case CLASSLM:
-    net = BuildClassEntropyNetwork(1); break;
+    net = BuildClassEntropyNetwork(); break;
 case LBLM:
-    net = BuildLogBilinearNetworkFromDescription(1); break;
+    net = BuildLogBilinearNetworkFromDescription(); break;
 case NPLM:
-    net = BuildNeuralProbNetworkFromDescription(1); break;
+    net = BuildNeuralProbNetworkFromDescription(); break;
 case CLSTM:
-    net = BuildConditionalLSTMNetworkFromDescription(1); break;
+    net = BuildConditionalLSTMNetworkFromDescription(); break;
 case RCRF:
-    net = BuildSeqTrnLSTMNetworkFromDescription(1); break;
+    net = BuildSeqTrnLSTMNetworkFromDescription(); break;
 case LSTMENCODER:
-    net = BuildLSTMEncoderNetworkFromDescription(1); break;
+    net = BuildLSTMEncoderNetworkFromDescription(); break;
 case UNIDIRECTIONALLSTM:
-    net = BuildUnidirectionalLSTMNetworksFromDescription(1); break;
+    net = BuildUnidirectionalLSTMNetworksFromDescription(); break;
 case BIDIRECTIONALLSTM:
-    net = BuildBiDirectionalLSTMNetworksFromDescription(1); break;
+    net = BuildBiDirectionalLSTMNetworksFromDescription(); break;
 default:
     LogicError("BuildNetworkFromDescription: invalid m_rnnType %d", (int)m_rnnType);
 }
@@ -75,11 +75,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 switch (m_rnnType)
 {
 case ALIGNMENTSIMILARITYGENERATOR:
-    net = BuildAlignmentDecoderNetworkFromDescription(encoderNet, 1);
+    net = BuildAlignmentDecoderNetworkFromDescription(encoderNet);
     net->CompileNetwork();
     return net;
 case ALIGNMENTSIMILARITYGFORWARDDECODER:
-    net = BuildAlignmentForwardDecoderNetworkFromDescription(encoderNet, 1);
+    net = BuildAlignmentForwardDecoderNetworkFromDescription(encoderNet);
     net->CompileNetwork();
     return net;
 }
@@ -95,12 +95,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 {
 unsigned long randomSeed = 1;

-size_t mbSize = 3; //this is not the actual minibatch size. only used in the validataion process
-
 size_t numHiddenLayers = m_layerSizes.size() - 2;
 ComputationNodePtr input, w, b, output, label, prior, scaledLogLikelihood;

-input = builder.Input(m_layerSizes[0], mbSize, L"features");
+input = builder.CreateInputNode(L"features", m_layerSizes[0]);
 m_net->FeatureNodes().push_back(input);

 if (m_applyMeanVarNorm)
@@ -114,9 +112,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {

 if (numHiddenLayers > 0)
 {
-    w = builder.Parameter(m_layerSizes[1], m_layerSizes[0], L"W0");
+    w = builder.CreateLearnableParameter(L"W0", m_layerSizes[1], m_layerSizes[0]);
     m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
-    b = builder.Parameter(m_layerSizes[1], 1, L"B0");
+    b = builder.CreateLearnableParameter(L"B0", m_layerSizes[1], 1);
     output = ApplyNonlinearFunction(builder.Plus(builder.Times(w, input, L"W0*features"), b, L"W0*features+B0"), 0, L"H1");

     if (m_addDropoutNodes)
@@ -133,9 +131,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 wstring nameOfPlus = nameOfTimes + L"+" + nameOfB;
 wstring nameOfH = msra::strfun::wstrprintf(L"H%d", i + 1);

-w = builder.Parameter(m_layerSizes[i + 1], m_layerSizes[i], nameOfW);
+w = builder.CreateLearnableParameter(nameOfW, m_layerSizes[i + 1], m_layerSizes[i]);
 m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
-b = builder.Parameter(m_layerSizes[i + 1], 1, nameOfB);
+b = builder.CreateLearnableParameter(nameOfB, m_layerSizes[i + 1], 1);
 output = ApplyNonlinearFunction(builder.Plus(builder.Times(w, input, nameOfTimes), b, nameOfPlus), i, nameOfH);

 if (m_addDropoutNodes)
@@ -151,13 +149,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 wstring nameOfTimes = nameOfW + L"*" + nameOfPrevH;
 wstring nameOfPlus = nameOfTimes + L"+" + nameOfB;

-w = builder.Parameter(m_layerSizes[numHiddenLayers + 1], m_layerSizes[numHiddenLayers], nameOfW);
+w = builder.CreateLearnableParameter(nameOfW, m_layerSizes[numHiddenLayers + 1], m_layerSizes[numHiddenLayers]);
 m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
-b = builder.Parameter(m_layerSizes[numHiddenLayers + 1], 1, nameOfB);
+b = builder.CreateLearnableParameter(nameOfB, m_layerSizes[numHiddenLayers + 1], 1);
 output = builder.Plus(builder.Times(w, input, nameOfTimes), b, nameOfPlus);
 m_net->RenameNode(output, L"HLast");

-label = builder.Input(m_layerSizes[numHiddenLayers + 1], mbSize, L"labels");
+label = builder.CreateInputNode(L"labels", m_layerSizes[numHiddenLayers + 1]);

 AddTrainAndEvalCriterionNodes(output, label);

@@ -188,7 +186,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {

 // Note: while ComputationNode and CompuationNetwork are (supposed to be) independent of ElemType, it is OK to keep this class dependent.
 template<class ElemType>
-ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildSimpleRNN(size_t mbSize)
+ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildSimpleRNN()
 {
     ComputationNetworkBuilder<ElemType> builder(*m_net);
     if (m_net->GetTotalNumberOfNodes() < 1) //not built yet
@@ -201,7 +199,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {

 ComputationNodePtr input, w, b, u, pastValue, output, label, prior;

-input = builder.CreateSparseInputNode(L"features", m_layerSizes[0], mbSize);
+input = builder.CreateSparseInputNode(L"features", m_layerSizes[0]);
 m_net->FeatureNodes().push_back(input);

 if (m_applyMeanVarNorm)
@@ -225,7 +223,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 w = builder.CreateLearnableParameter(L"W0", m_layerSizes[1], m_layerSizes[1]);
 m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);

-pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[1], mbSize, 1);
+pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[1], 1);
 /// unless there is a good algorithm to detect loops, use this explicit setup
 output = ApplyNonlinearFunction(
     builder.Plus(
@@ -255,7 +253,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 w = builder.CreateLearnableParameter(msra::strfun::wstrprintf (L"W%d", i), m_layerSizes[i+1], m_layerSizes[i+1]);
 m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);

-pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, (size_t)m_layerSizes[i+1], mbSize, 1);
+pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, (size_t)m_layerSizes[i+1], 1);
 /// unless there is a good algorithm to detect loops, use this explicit setup
 output = ApplyNonlinearFunction(
     builder.Plus(
@@ -279,7 +277,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
 /*m_net->MatrixL2Reg(w , L"L1w");*/

-label = builder.CreateInputNode(L"labels", m_layerSizes[numHiddenLayers+1], mbSize);
+label = builder.CreateInputNode(L"labels", m_layerSizes[numHiddenLayers+1]);
 AddTrainAndEvalCriterionNodes(input, label, w, L"criterion", L"eval");

 output = builder.Times(w, input, L"outputs");
@@ -294,7 +292,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 }

 template<class ElemType>
-ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildClassEntropyNetwork(size_t mbSize)
+ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildClassEntropyNetwork()
 {
     ComputationNetworkBuilder<ElemType> builder(*m_net);

@@ -312,7 +310,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 if (m_vocabSize != m_layerSizes[numHiddenLayers + 1])
     RuntimeError("BuildClassEntropyNetwork : vocabulary size should be the same as the output layer size");

-input = builder.CreateSparseInputNode(L"features", m_layerSizes[0], mbSize);
+input = builder.CreateSparseInputNode(L"features", m_layerSizes[0]);
 m_net->FeatureNodes().push_back(input);

 if (m_applyMeanVarNorm)
@@ -335,7 +333,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 w = builder.CreateLearnableParameter(L"W0", m_layerSizes[1], m_layerSizes[1]);
 m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);

-pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[1], mbSize, 1);
+pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[1], 1);
 /// unless there is a good algorithm to detect loops, use this explicit setup
 output = ApplyNonlinearFunction(
     builder.Plus(
@@ -364,7 +362,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 w = builder.CreateLearnableParameter(msra::strfun::wstrprintf (L"W%d", i), m_layerSizes[i+1], m_layerSizes[i+1]);
 m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);

-pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, (size_t)m_layerSizes[i+1], mbSize, 1);
+pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, (size_t)m_layerSizes[i+1], 1);
 /// unless there is a good algorithm to detect loops, use this explicit setup
 output = ApplyNonlinearFunction(
     builder.Plus(
@@ -391,7 +389,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);

 /// the label is a dense matrix. each element is the word index
-label = builder.CreateInputNode(L"labels", 4, mbSize);
+label = builder.CreateInputNode(L"labels", 4);

 clsweight = builder.CreateLearnableParameter(L"WeightForClassPostProb", m_nbrCls, m_layerSizes[numHiddenLayers]);
 m_net->InitLearnableParameters(clsweight, m_uniformInit, randomSeed++, m_initValueScale);
@@ -412,7 +410,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 }

 template<class ElemType>
-ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildConditionalLSTMNetworkFromDescription(size_t mbSize)
+ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildConditionalLSTMNetworkFromDescription()
 {
     ComputationNetworkBuilder<ElemType> builder(*m_net);
     if (m_net->GetTotalNumberOfNodes() < 1) //not built yet
@@ -428,7 +426,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 ComputationNodePtr clslogpostprob;
 ComputationNodePtr clsweight;

-input = builder.CreateSparseInputNode(L"features", m_layerSizes[0], mbSize);
+input = builder.CreateSparseInputNode(L"features", m_layerSizes[0]);
 m_net->FeatureNodes().push_back(input);

 if (m_applyMeanVarNorm)
@@ -461,13 +459,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 if (numHiddenLayers > 0)
 {
     // output = (ComputationNodePtr)BuildLSTMNodeComponent(randomSeed, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
-    output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
+    output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
     /// previously used function. now uses LSTMNode which is correct and fast
     input = output;
     for (int i = 1 + offset; i < numHiddenLayers; i++)
     {
         // output = (ComputationNodePtr)BuildLSTMNodeComponent(randomSeed, i, m_layerSizes[i], m_layerSizes[i + 1], input);
-        output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, i, m_layerSizes[i], m_layerSizes[i + 1], input);
+        output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, i, m_layerSizes[i], m_layerSizes[i + 1], input);

         if (m_addDropoutNodes)
             input = builder.Dropout(output);
@@ -477,7 +475,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 }

 /// serve as a global bias term
-gt = builder.CreateInputNode(L"binaryFeature", m_auxFeatDim, 1);
+gt = builder.CreateInputNode(L"binaryFeature", m_auxFeatDim);
 m_net->FeatureNodes().push_back(gt);
 e = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"AuxTrans%d", 0),
     m_layerSizes[numHiddenLayers], m_auxFeatDim);
@@ -493,7 +491,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);

 /// the label is a dense matrix. each element is the word index
-label = builder.CreateInputNode(L"labels", 4, mbSize);
+label = builder.CreateInputNode(L"labels", 4);

 clsweight = builder.CreateLearnableParameter(L"WeightForClassPostProb", m_nbrCls, m_layerSizes[numHiddenLayers]);
 m_net->InitLearnableParameters(clsweight, m_uniformInit, randomSeed++, m_initValueScale);
@@ -518,7 +516,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 the aligment node takes a variable length input and relates each element to a variable length output
 */
 template<class ElemType>
-ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildAlignmentForwardDecoderNetworkFromDescription(ComputationNetwork* encoderNet, size_t mbSize)
+ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildAlignmentForwardDecoderNetworkFromDescription(ComputationNetwork* encoderNet)
 {
     ComputationNetworkBuilder<ElemType> builder(*m_net);
     if (m_net->GetTotalNumberOfNodes() < 1) //not built yet
@@ -535,7 +533,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 ComputationNodePtr clsweight;
 ComputationNodePtr columnStride, rowStride;

-input = builder.CreateSparseInputNode(L"features", m_layerSizes[0], mbSize);
+input = builder.CreateSparseInputNode(L"features", m_layerSizes[0]);
 m_net->FeatureNodes().push_back(input);

 if (m_lookupTableOrder > 0)
@@ -577,9 +575,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 w = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"W%d", i), m_layerSizes[i], m_layerSizes[i]);
 m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);

-pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, (size_t)m_layerSizes[i], mbSize, 1);
+pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, (size_t)m_layerSizes[i], 1);
 // output = (ComputationNodePtr)BuildLSTMNodeComponent(randomSeed, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
-// output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
+// output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);

 /// alignment node to get weights from source to target
 /// this aligment node computes weights of the current hidden state after special encoder ending symbol to all
@@ -607,7 +605,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 for (; i < numHiddenLayers; i++)
 {
     //output = (ComputationNodePtr)BuildLSTMNodeComponent(randomSeed, i, m_layerSizes[i], m_layerSizes[i + 1], input);
-    output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, i, m_layerSizes[i], m_layerSizes[i + 1], input);
+    output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, i, m_layerSizes[i], m_layerSizes[i + 1], input);

     if (m_addDropoutNodes)
         input = builder.Dropout(output);
@@ -625,7 +623,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);

 /// the label is a dense matrix. each element is the word index
-label = builder.CreateInputNode(L"labels", 4, mbSize);
+label = builder.CreateInputNode(L"labels", 4);

 clsweight = builder.CreateLearnableParameter(L"WeightForClassPostProb", m_nbrCls, m_layerSizes[numHiddenLayers]);
 m_net->InitLearnableParameters(clsweight, m_uniformInit, randomSeed++, m_initValueScale);
@@ -645,7 +643,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 }

 template<class ElemType>
-ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildAlignmentDecoderNetworkFromDescription(ComputationNetwork* encoderNet, size_t mbSize)
+ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildAlignmentDecoderNetworkFromDescription(ComputationNetwork* encoderNet)
 {
     ComputationNetworkBuilder<ElemType> builder(*m_net);
     if (m_net->GetTotalNumberOfNodes() < 1) //not built yet
@@ -662,7 +660,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 ComputationNodePtr clsweight;
 ComputationNodePtr columnStride, rowStride;

-input = builder.CreateSparseInputNode(L"features", m_layerSizes[0], mbSize);
+input = builder.CreateSparseInputNode(L"features", m_layerSizes[0]);
 m_net->FeatureNodes().push_back(input);

 if (m_lookupTableOrder > 0)
@@ -704,9 +702,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 w = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"W%d", i), m_layerSizes[i], m_layerSizes[i]);
 m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);

-pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, (size_t)m_layerSizes[i], mbSize, 1);
+pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, (size_t)m_layerSizes[i], 1);
 // output = (ComputationNodePtr)BuildLSTMNodeComponent(randomSeed, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
-// output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
+// output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);

 /// alignment node to get weights from source to target
 /// this aligment node computes weights of the current hidden state after special encoder ending symbol to all
@@ -734,7 +732,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 for (; i < numHiddenLayers; i++)
 {
     //output = (ComputationNodePtr)BuildLSTMNodeComponent(randomSeed, i, m_layerSizes[i], m_layerSizes[i + 1], input);
-    output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, i, m_layerSizes[i], m_layerSizes[i + 1], input);
+    output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, i, m_layerSizes[i], m_layerSizes[i + 1], input);

     if (m_addDropoutNodes)
         input = builder.Dropout(output);
@@ -752,7 +750,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);

 /// the label is a dense matrix. each element is the word index
-label = builder.CreateInputNode(L"labels", 4, mbSize);
+label = builder.CreateInputNode(L"labels", 4);

 clsweight = builder.CreateLearnableParameter(L"WeightForClassPostProb", m_nbrCls, m_layerSizes[numHiddenLayers]);
 m_net->InitLearnableParameters(clsweight, m_uniformInit, randomSeed++, m_initValueScale);
@@ -775,7 +773,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 }

 template<class ElemType>
-ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildLogBilinearNetworkFromDescription(size_t mbSize)
+ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildLogBilinearNetworkFromDescription()
 {
     ComputationNetworkBuilder<ElemType> builder(*m_net);
     if (m_net->GetTotalNumberOfNodes() < 1) //not built yet
@@ -793,8 +791,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 ComputationNodePtr ot=nullptr, it=nullptr, ft=nullptr, gt=nullptr, ct=nullptr, ht=nullptr;
 ComputationNodePtr pastValueXI, pastValueXII, pastValueXIII, pastValueXIV;

-// input = builder.CreateSparseInputNode(L"features", m_layerSizes[0], mbSize);
-input = builder.CreateInputNode(L"features", m_layerSizes[0], mbSize);
+// input = builder.CreateSparseInputNode(L"features", m_layerSizes[0]);
+input = builder.CreateInputNode(L"features", m_layerSizes[0]);
 featin = input;
 m_net->FeatureNodes().push_back(input);

@@ -827,7 +825,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 while (ik <= m_maOrder)
 {
     pastValueXI =
-        builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[0], mbSize, ik, msra::strfun::wstrprintf(L"pastValue%d", ik));
+        builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[0], ik, msra::strfun::wstrprintf(L"pastValue%d", ik));
     pastValueXI->SetParameterUpdateRequired(false);
     pastValueXI->AttachInputs(input);
     //TODO: to figure out sparse matrix size
@@ -855,7 +853,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 {
     w = builder.CreateLearnableParameter(msra::strfun::wstrprintf (L"R%d", i+1), m_layerSizes[i+1], m_layerSizes[i+1]);
     m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
-    pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[i+1], mbSize, 1);
+    pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[i+1], 1);
     output = builder.Plus(builder.Times(w, pastValue), input);

     pastValue->AttachInputs(output);
@@ -875,7 +873,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 w = builder.CreateLearnableParameter(msra::strfun::wstrprintf (L"W%d", numHiddenLayers), m_layerSizes[numHiddenLayers+1], m_layerSizes[numHiddenLayers]);
 m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);

-label = builder.CreateInputNode(L"labels", m_layerSizes[numHiddenLayers+1], mbSize);
+label = builder.CreateInputNode(L"labels", m_layerSizes[numHiddenLayers+1]);
 AddTrainAndEvalCriterionNodes(input, label, w);

 output = builder.Times(w, input, L"outputs");
@@ -892,7 +890,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 }

 template<class ElemType>
-ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildNeuralProbNetworkFromDescription(size_t mbSize)
+ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildNeuralProbNetworkFromDescription()
 {
     ComputationNetworkBuilder<ElemType> builder(*m_net);
     if (m_net->GetTotalNumberOfNodes() < 1) //not built yet
@@ -910,7 +908,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 ComputationNodePtr ot = nullptr, it = nullptr, ft = nullptr, gt = nullptr, ct = nullptr, ht = nullptr;
 ComputationNodePtr pastValueXI, pastValueXII, pastValueXIII, pastValueXIV;

-input = builder.CreateSparseInputNode(L"features", m_layerSizes[0], mbSize);
+input = builder.CreateSparseInputNode(L"features", m_layerSizes[0]);
 m_net->FeatureNodes().push_back(input);

 if (m_applyMeanVarNorm)
@@ -927,10 +925,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 {
     bi = builder.CreateLearnableParameter(L"bi0", m_layerSizes[1], 1);

-    pastValueXI = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[0], mbSize, 1);
-    pastValueXII = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[0], mbSize, 2);
-    pastValueXIII = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[0], mbSize, 3);
-    pastValueXIV = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[0], mbSize, 4);
+    pastValueXI = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[0], 1);
+    pastValueXII = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[0], 2);
+    pastValueXIII = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[0], 3);
+    pastValueXIV = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[0], 4);
     pastValueXI->AttachInputs(input);
     pastValueXII->AttachInputs(input);
     pastValueXIII->AttachInputs(input);
@@ -996,7 +994,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 w = builder.CreateLearnableParameter(msra::strfun::wstrprintf (L"W%d", i), m_layerSizes[i+1], m_layerSizes[i+1]);
 m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
 std::list<ComputationNodeBasePtr> recurrent_loop;
-pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[i+1], mbSize, 1);
+pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[i+1], 1);
 output = SimpleNetworkBuilder<ElemType>::ApplyNonlinearFunction(builder.Plus(builder.Times(u, input), builder.Times(w, pastValue)), i);
 pastValue->AttachInputs(output);
 recur_idx++;
@@ -1017,7 +1015,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 w = builder.CreateLearnableParameter(msra::strfun::wstrprintf (L"W%d", numHiddenLayers), m_layerSizes[numHiddenLayers+1], m_layerSizes[numHiddenLayers]);
 m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
 // b = builder.CreateLearnableParameter(msra::strfun::wstrprintf (L"B%d", numHiddenLayers), m_layerSizes[numHiddenLayers+1], 1);
-label = builder.CreateSparseInputNode(L"labels", m_layerSizes[numHiddenLayers+1], mbSize);
+label = builder.CreateSparseInputNode(L"labels", m_layerSizes[numHiddenLayers+1]);
 AddTrainAndEvalCriterionNodes(input, label, w);

 output = builder.Times(w, input);
@@ -1034,7 +1032,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 }

 template<class ElemType>
-shared_ptr<ComputationNode<ElemType>> /*ComputationNodePtr*/ SimpleNetworkBuilder<ElemType>::BuildDirectConnect(unsigned long &randomSeed, size_t /*mbSize*/, size_t iLayer, size_t inputDim, size_t outputDim, ComputationNodePtr input, ComputationNodePtr toNode)
+shared_ptr<ComputationNode<ElemType>> /*ComputationNodePtr*/ SimpleNetworkBuilder<ElemType>::BuildDirectConnect(unsigned long &randomSeed, size_t iLayer, size_t inputDim, size_t outputDim, ComputationNodePtr input, ComputationNodePtr toNode)
 {
     ComputationNetworkBuilder<ElemType> builder(*m_net);

@@ -1050,7 +1048,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {

 ComputationNodePtr scalar = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"SV%d", i), 1, 1);
 scalar->Value().SetValue((ElemType)0.01);
-#if 1// change once we no longer see a perf hit to #ifndef ENABLE_TENSORVIEW
+#ifndef ENABLE_BROADCASTING_ELEMENTTIMES
 ComputationNodePtr scaled = builder.Scale(scalar, directOutput, msra::strfun::wstrprintf(L"S%d", i));
 #else
 ComputationNodePtr scaled = builder.ElementTimes(scalar, directOutput, msra::strfun::wstrprintf(L"S%d", i));
@@ -1065,7 +1063,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {


 template<class ElemType>
-shared_ptr<ComputationNode<ElemType>> /*ComputationNodePtr*/ SimpleNetworkBuilder<ElemType>::BuildLSTMComponent(unsigned long &randomSeed, size_t mbSize, size_t iLayer, size_t inputDim, size_t outputDim, ComputationNodePtr inputObs)
+shared_ptr<ComputationNode<ElemType>> /*ComputationNodePtr*/ SimpleNetworkBuilder<ElemType>::BuildLSTMComponent(unsigned long &randomSeed, size_t iLayer, size_t inputDim, size_t outputDim, ComputationNodePtr inputObs)
 {
     ComputationNetworkBuilder<ElemType> builder(*m_net);

@@ -1121,17 +1119,17 @@ namespace Microsoft { namespace MSR { namespace CNTK {

 size_t layer1 = outputDim;

-pastValueHI = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, mbSize, 1);
-pastValueHF = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, mbSize, 1);
-pastValueHO = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, mbSize, 1);
-pastValueHC = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, mbSize, 1);
-pastValueCI = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, mbSize, 1);
-pastValueCF = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, mbSize, 1);
-pastValueCC = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, mbSize, 1);
+pastValueHI = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, 1);
+pastValueHF = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, 1);
+pastValueHO = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, 1);
+pastValueHC = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, 1);
+pastValueCI = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, 1);
+pastValueCF = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, 1);
+pastValueCC = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, 1);

 if(m_constInputGateValue)
 {
-    //it = builder.CreateLearnableParameter(msra::strfun::wstrprintf (L"CONSTIT%d", iLayer), outputDim, mbSize);
+    //it = builder.CreateLearnableParameter(msra::strfun::wstrprintf (L"CONSTIT%d", iLayer), outputDim);
     //it->SetParameterUpdateRequired(false);
     //it->Value().SetValue(m_constInputGateValue);
     it = nullptr;
@@ -1241,7 +1239,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 }

 template<class ElemType>
-ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildSeqTrnLSTMNetworkFromDescription(size_t mbSize)
+ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildSeqTrnLSTMNetworkFromDescription()
 {
     ComputationNetworkBuilder<ElemType> builder(*m_net);
     if (m_net->GetTotalNumberOfNodes() < 1) //not built yet
@@ -1261,7 +1259,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 ComputationNodePtr outputFromEachLayer[MAX_DEPTH] = { nullptr };
 ComputationNodePtr trans;

-input = builder.CreateInputNode(L"features", m_layerSizes[0], mbSize);
+input = builder.CreateInputNode(L"features", m_layerSizes[0]);
 m_net->FeatureNodes().push_back(input);

 if (m_applyMeanVarNorm)
@@ -1297,7 +1295,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 {
     if (m_recurrentLayers.size() > 0 && m_recurrentLayers[recur_idx] == i+1)
     {
-        output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, i, m_layerSizes[i] * (offset ? m_lookupTableOrder : 1), m_layerSizes[i + 1], input);
+        output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, i, m_layerSizes[i] * (offset ? m_lookupTableOrder : 1), m_layerSizes[i + 1], input);
         input = output;

         recur_idx++;
@@ -1326,7 +1324,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 trans->Value().SetValue((ElemType)1.0 / m_layerSizes[numHiddenLayers + 1]);
 // m_net->InitLearnableParameters(trans, m_uniformInit, randomSeed++, m_initValueScale);
 trans->SetParameterUpdateRequired(true);
-label = builder.CreateInputNode(L"labels", m_layerSizes[numHiddenLayers + 1], mbSize);
+label = builder.CreateInputNode(L"labels", m_layerSizes[numHiddenLayers + 1]);
 AddTrainAndEvalCriterionNodes(output, label, nullptr, L"CRFTrainCriterion", L"CRFEvalCriterion", nullptr, trans);

 input = output;
@@ -1340,7 +1338,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 }

 template<class ElemType>
-ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildCLASSLSTMNetworkFromDescription(size_t mbSize)
+ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildCLASSLSTMNetworkFromDescription()
 {
     ComputationNetworkBuilder<ElemType> builder(*m_net);
     if (m_net->GetTotalNumberOfNodes() < 1) //not built yet
@@ -1356,7 +1354,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 ComputationNodePtr clslogpostprob;
 ComputationNodePtr clsweight;

-input = builder.CreateSparseInputNode(L"features", m_layerSizes[0], mbSize);
+input = builder.CreateSparseInputNode(L"features", m_layerSizes[0]);
 m_net->FeatureNodes().push_back(input);

 if (m_applyMeanVarNorm)
@@ -1389,13 +1387,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 if (numHiddenLayers > 0)
 {
     // output = (ComputationNodePtr)BuildLSTMNodeComponent(randomSeed, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
-    output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
+    output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
     /// previously used function. now uses LSTMNode which is correct and fast
     input = output;
     for (int i = 1 + offset; i <numHiddenLayers; i++)
     {
         // output = (ComputationNodePtr)BuildLSTMNodeComponent(randomSeed, i, m_layerSizes[i], m_layerSizes[i + 1], input);
-        output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, i, m_layerSizes[i], m_layerSizes[i + 1], input);
+        output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, i, m_layerSizes[i], m_layerSizes[i + 1], input);

         if (m_addDropoutNodes)
             input = builder.Dropout(output);
@@ -1411,7 +1409,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);

 /// the label is a dense matrix. each element is the word index
-label = builder.CreateInputNode(L"labels", 4, mbSize);
+label = builder.CreateInputNode(L"labels", 4);

 clsweight = builder.CreateLearnableParameter(L"WeightForClassPostProb", m_nbrCls, m_layerSizes[numHiddenLayers]);
 m_net->InitLearnableParameters(clsweight, m_uniformInit, randomSeed++, m_initValueScale);
@@ -1482,7 +1480,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 #endif

 template<class ElemType>
-ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildLSTMNetworkFromDescription(size_t mbSize)
+ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildLSTMNetworkFromDescription()
 {
     ComputationNetworkBuilder<ElemType> builder(*m_net);
     if (m_net->GetTotalNumberOfNodes() < 1) //not built yet
@@ -1502,9 +1500,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 ComputationNodePtr outputFromEachLayer[MAX_DEPTH] = { nullptr };

 if (m_sparse_input)
-    input = builder.CreateSparseInputNode(L"features", m_layerSizes[0], mbSize);
+    input = builder.CreateSparseInputNode(L"features", m_layerSizes[0]);
 else
-    input = builder.CreateInputNode(L"features", m_layerSizes[0], mbSize);
+    input = builder.CreateInputNode(L"features", m_layerSizes[0]);

 m_net->FeatureNodes().push_back(input);

@@ -1542,7 +1540,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 {

     //output = (ComputationNodePtr)BuildLSTMNodeComponent(randomSeed, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
-    output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
+    output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
     /// previously used function. now uses LSTMNode which is correct and fast
     input = output;
     outputFromEachLayer[offset + 1] = input;
@@ -1553,7 +1551,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 {

     //output = (ComputationNodePtr)BuildLSTMNodeComponent(randomSeed, i, m_layerSizes[i], m_layerSizes[i + 1], input);
-    output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, i, m_layerSizes[i], m_layerSizes[i + 1], input);
+    output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, i, m_layerSizes[i], m_layerSizes[i + 1], input);
     // previously used function, now uses LSTMnode, which is fast and correct

     recur_idx++;
@@ -1580,7 +1578,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 #ifdef DEBUG_DECODER
 w->Value().SetValue((ElemType)0.01);
 #endif
-label = builder.CreateInputNode(L"labels", m_layerSizes[numHiddenLayers + 1], mbSize);
+label = builder.CreateInputNode(L"labels", m_layerSizes[numHiddenLayers + 1]);
 AddTrainAndEvalCriterionNodes(input, label, w);

 output = builder.Times(w, input, L"outputs");
@@ -1615,7 +1613,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 K. Yao, G. Zweig, "Sequence-to-sequence neural net models for grapheme-to-phoneme conversion, submitted to Interspeech 2015
 */
 template<class ElemType>
-ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildLSTMEncoderNetworkFromDescription(size_t mbSize)
+ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildLSTMEncoderNetworkFromDescription()
 {

     ComputationNetworkBuilder<ElemType> builder(*m_net);
@@ -1631,9 +1629,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 ComputationNodePtr input, w, b, u, e, pastValue, output, label, prior;

 if (m_sparse_input)
-    input = builder.CreateSparseInputNode(L"features", m_layerSizes[0], mbSize);
+    input = builder.CreateSparseInputNode(L"features", m_layerSizes[0]);
 else
-    input = builder.CreateInputNode(L"features", m_layerSizes[0], mbSize);
+    input = builder.CreateInputNode(L"features", m_layerSizes[0]);

 m_net->FeatureNodes().push_back(input);

@@ -1669,14 +1667,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 if (numHiddenLayers > 0)
 {
     //output = (ComputationNodePtr)BuildLSTMNodeComponent(randomSeed, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
-    output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
+    output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
     input = output;
     i++;

     for (; i<numHiddenLayers; i++)
     {
         //output = (ComputationNodePtr)BuildLSTMNodeComponent(randomSeed, i, m_layerSizes[i], m_layerSizes[i + 1], input);
-        output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, i, m_layerSizes[i], m_layerSizes[i + 1], input);
+        output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, i, m_layerSizes[i], m_layerSizes[i + 1], input);

         if (m_addDropoutNodes)
             input = builder.Dropout(output);
@@ -1705,7 +1703,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 K. Yao, G. Zweig, "Sequence-to-sequence neural net models for grapheme-to-phoneme conversion" submitted to Interspeech 2015
 */
 template<class ElemType>
-ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildUnidirectionalLSTMNetworksFromDescription(size_t mbSize)
+ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildUnidirectionalLSTMNetworksFromDescription()
 {
     ComputationNetworkBuilder<ElemType> builder(*m_net);
     if (m_net->GetTotalNumberOfNodes() < 1) //not built yet
@@ -1726,11 +1724,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 map<wstring, size_t> featDim;

 assert(m_streamSizes.size() > 0);
-inputbackward = builder.CreateInputNode(L"featurepastValueedTarget", m_streamSizes[0], mbSize);
+inputbackward = builder.CreateInputNode(L"featurepastValueedTarget", m_streamSizes[0]);
 m_net->FeatureNodes().push_back(inputbackward);
 featDim[L"featurepastValueedTarget"] = m_streamSizes[0];

-inputletter = builder.CreateInputNode(L"ltrForward", m_streamSizes[1], mbSize);
+inputletter = builder.CreateInputNode(L"ltrForward", m_streamSizes[1]);
 m_net->FeatureNodes().push_back(inputletter);
 featDim[L"ltrForward"] = m_streamSizes[1];

@@ -1777,7 +1775,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 switch (m_rnnType){
 case UNIDIRECTIONALLSTM:
     //output = (ComputationNodePtr)BuildLSTMNodeComponent(randomSeed, layerIdx, dims, m_layerSizes[layerIdx + 1], input);
-    output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, layerIdx, dims, m_layerSizes[layerIdx + 1], input);
+    output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, layerIdx, dims, m_layerSizes[layerIdx + 1], input);
     break;
 default:
     LogicError("This is for unidorectional LSTM model. Check rnntype to see whether it is UNIDIRECTIONALLSTMWITHPASTPREDICTION or TRANSDUCER");
@@ -1797,7 +1795,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 input = output;

 /// here uses "labels", so only one label from multiple stream inputs are used.
-label = builder.CreateInputNode(L"labels", m_layerSizes[numHiddenLayers + 1], mbSize);
+label = builder.CreateInputNode(L"labels", m_layerSizes[numHiddenLayers + 1]);

 AddTrainAndEvalCriterionNodes(input, label, w);

@@ -1819,7 +1817,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 }

 template<class ElemType>
-shared_ptr<ComputationNode<ElemType>> /*ComputationNodePtr*/ SimpleNetworkBuilder<ElemType>::BuildLSTMComponentWithMultiInputs(ULONG &randomSeed, size_t mbSize, size_t iLayer, const vector<size_t>& inputDim, size_t outputDim, const vector<ComputationNodePtr>& inputObs, bool inputWeightSparse)
+shared_ptr<ComputationNode<ElemType>> /*ComputationNodePtr*/ SimpleNetworkBuilder<ElemType>::BuildLSTMComponentWithMultiInputs(ULONG &randomSeed, size_t iLayer, const vector<size_t>& inputDim, size_t outputDim, const vector<ComputationNodePtr>& inputObs, bool inputWeightSparse)
 {
     ComputationNetworkBuilder<ElemType> builder(*m_net);

@@ -1896,17 +1894,17 @@ namespace Microsoft { namespace MSR { namespace CNTK {

 size_t layer1 = outputDim;

-pastValueHI = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, mbSize, 1);
-pastValueHF = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, mbSize, 1);
-pastValueHO = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, mbSize, 1);
-pastValueHC = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, mbSize, 1);
-pastValueCI = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, mbSize, 1);
-pastValueCF = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, mbSize, 1);
-pastValueCC = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, mbSize, 1);
+pastValueHI = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, 1);
+pastValueHF = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, 1);
+pastValueHO = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, 1);
+pastValueHC = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, 1);
+pastValueCI = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, 1);
+pastValueCF = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, 1);
+pastValueCC = builder.PastValue(NULL, m_defaultHiddenActivity, layer1, 1);

 if (m_constInputGateValue)
 {
-    //it = builder.CreateLearnableParameter(msra::strfun::wstrprintf (L"CONSTIT%d", iLayer), outputDim, mbSize);
+    //it = builder.CreateLearnableParameter(msra::strfun::wstrprintf (L"CONSTIT%d", iLayer), outputDim);
     //it->SetParameterUpdateRequired(false);
     //it->Value().SetValue(m_constInputGateValue);
     it = nullptr;
@@ -2026,7 +2024,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
K. Yao, G. Zweig, "Sequence-to-sequence neural net models for grapheme-to-phoneme conversion", submitted to Interspeech 2015
*/
template<class ElemType>
ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildBiDirectionalLSTMNetworksFromDescription(size_t mbSize)
ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildBiDirectionalLSTMNetworksFromDescription()
{
ComputationNetworkBuilder<ElemType> builder(*m_net);
if (m_net->GetTotalNumberOfNodes() < 1) //not built yet
@@ -2049,10 +2047,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {

size_t ltrSrcIdx = 1;
/// create projections to use pastValue predictions
inputprediction = builder.CreateInputNode(L"featurepastValueedTarget", m_streamSizes[0], mbSize);
inputprediction = builder.CreateInputNode(L"featurepastValueedTarget", m_streamSizes[0]);
m_net->FeatureNodes().push_back(inputprediction);

inputletter = builder.CreateInputNode(L"ltrForward", m_streamSizes[1], mbSize);
inputletter = builder.CreateInputNode(L"ltrForward", m_streamSizes[1]);
m_net->FeatureNodes().push_back(inputletter);
featDim[L"ltrForward"] = m_streamSizes[1];
@@ -2100,12 +2098,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
/// forward direction
//forwardOutput = (ComputationNodePtr)BuildLSTMNodeComponent(randomSeed, layerIdx + 100, streamdims[0] + streamdims[1], m_layerSizes[layerIdx + 1], forwardInput);
forwardOutput = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, layerIdx + 100, streamdims[0] + streamdims[1], m_layerSizes[layerIdx + 1], forwardInput);
forwardOutput = (ComputationNodePtr)BuildLSTMComponent(randomSeed, layerIdx + 100, streamdims[0] + streamdims[1], m_layerSizes[layerIdx + 1], forwardInput);
forwardInput = forwardOutput;

backwardInput = (ComputationNodePtr)builder.TimeReverse(ltrSource);
//backwardOutput = (ComputationNodePtr)BuildLSTMNodeComponent(randomSeed, layerIdx + 200, ltrDim, m_layerSizes[layerIdx + 1], backwardInput);
backwardOutput = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, layerIdx + 200, ltrDim, m_layerSizes[layerIdx + 1], backwardInput);
backwardOutput = (ComputationNodePtr)BuildLSTMComponent(randomSeed, layerIdx + 200, ltrDim, m_layerSizes[layerIdx + 1], backwardInput);
backwardInput = backwardOutput;

layerIdx++;
@@ -2113,11 +2111,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
while (layerIdx < numHiddenLayers - 1)
{
//forwardOutput = (ComputationNodePtr)BuildLSTMNodeComponent(randomSeed, layerIdx + 100, m_layerSizes[layerIdx], m_layerSizes[layerIdx + 1], forwardInput);
forwardOutput = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, layerIdx + 100, m_layerSizes[layerIdx], m_layerSizes[layerIdx + 1], forwardInput);
forwardOutput = (ComputationNodePtr)BuildLSTMComponent(randomSeed, layerIdx + 100, m_layerSizes[layerIdx], m_layerSizes[layerIdx + 1], forwardInput);
forwardInput = forwardOutput;

//backwardOutput = (ComputationNodePtr)BuildLSTMNodeComponent(randomSeed, layerIdx + 200, m_layerSizes[layerIdx], m_layerSizes[layerIdx + 1], backwardInput);
backwardOutput = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, layerIdx + 200, m_layerSizes[layerIdx], m_layerSizes[layerIdx + 1], backwardInput);
backwardOutput = (ComputationNodePtr)BuildLSTMComponent(randomSeed, layerIdx + 200, m_layerSizes[layerIdx], m_layerSizes[layerIdx + 1], backwardInput);
backwardInput = backwardOutput;

layerIdx++;
@@ -2137,7 +2135,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
forwardInput = (ComputationNodePtr)builder.Parallel(streams[0], streams[1], L"Parallel1");

// output = (ComputationNodePtr)BuildLSTMNodeComponent(randomSeed, layerIdx, streamdims[0] + streamdims[1], m_layerSizes[layerIdx + 1], forwardInput);
output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, layerIdx, streamdims[0] + streamdims[1], m_layerSizes[layerIdx + 1], forwardInput);
output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, layerIdx, streamdims[0] + streamdims[1], m_layerSizes[layerIdx + 1], forwardInput);

input = output;
layerIdx++;
@@ -2150,7 +2148,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
input = output;

/// here uses "labels", so only one label from the multiple stream inputs is used.
label = builder.CreateInputNode(L"labels", m_layerSizes[numHiddenLayers + 1], mbSize);
label = builder.CreateInputNode(L"labels", m_layerSizes[numHiddenLayers + 1]);

AddTrainAndEvalCriterionNodes(input, label);
@@ -2174,7 +2172,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}

template<class ElemType>
ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildNCELSTMNetworkFromDescription(size_t mbSize)
ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildNCELSTMNetworkFromDescription()
{
ComputationNetworkBuilder<ElemType> builder(*m_net);
if (m_net->GetTotalNumberOfNodes() < 1) //not built yet
@@ -2190,7 +2188,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
ComputationNodePtr bias;
ComputationNodePtr outputFromEachLayer[MAX_DEPTH] = { nullptr };

input = builder.CreateSparseInputNode(L"features", m_layerSizes[0], mbSize);
input = builder.CreateSparseInputNode(L"features", m_layerSizes[0]);
m_net->FeatureNodes().push_back(input);

if (m_applyMeanVarNorm)
@@ -2222,7 +2220,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
int offset = m_lookupTableOrder > 0 ? 1 : 0;
if (numHiddenLayers > 0)
{
output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, 0, m_layerSizes[offset] * (offset ? m_lookupTableOrder : 1), m_layerSizes[offset + 1], input);
input = output;
outputFromEachLayer[offset + 1] = input;
@@ -2230,7 +2228,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
if (m_recurrentLayers.size() > 0 && m_recurrentLayers[recur_idx] == i)
{
output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, mbSize, i, m_layerSizes[i], m_layerSizes[i + 1], input);
output = (ComputationNodePtr)BuildLSTMComponent(randomSeed, i, m_layerSizes[i], m_layerSizes[i + 1], input);

recur_idx++;
}
@@ -2254,7 +2252,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
for (size_t i = offset; i < m_layerSizes.size(); i++)
{
/// add a direct connection from each layer's output to the layer before the output layer
output = BuildDirectConnect(randomSeed, mbSize, i, (i > 1) ? m_layerSizes[i] : ((offset == 0) ? m_layerSizes[i] : m_layerSizes[i] * m_lookupTableOrder), m_layerSizes[numHiddenLayers], outputFromEachLayer[i], input);
output = BuildDirectConnect(randomSeed, i, (i > 1) ? m_layerSizes[i] : ((offset == 0) ? m_layerSizes[i] : m_layerSizes[i] * m_lookupTableOrder), m_layerSizes[numHiddenLayers], outputFromEachLayer[i], input);
if (output != nullptr)
input = output;
}
@@ -2266,7 +2264,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);

/// the label is a dense matrix. each element is the word index
label = builder.CreateInputNode(L"labels", 2 * (this->nce_noises + 1), mbSize);
label = builder.CreateInputNode(L"labels", 2 * (this->nce_noises + 1));

bias = builder.CreateLearnableParameter(L"BiasVector", 1, m_layerSizes[m_layerSizes.size() - 1]);
bias->Value().SetValue((ElemType)-std::log(m_layerSizes[m_layerSizes.size() - 1]));
@@ -2301,7 +2299,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {

ComputationNodePtr input, w, b, output, label, prior, scaledLogLikelihood;
shared_ptr<PreComputedNode<ElemType>> pcNodePtr;
size_t mbSize = 3; //this is not the actual minibatch size. only used in the validation process

File fstream(dbnModelFileName, FileOptions::fileOptionsBinary | FileOptions::fileOptionsRead);
@@ -2336,7 +2333,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Matrix<ElemType> A = ReadMatrixFromDbnFile(fstream, std::string("b"));
if (i == 0)
{
input = builder.Input(wts.GetNumCols(), mbSize, L"features");
input = builder.CreateInputNode(L"features", wts.GetNumCols());
m_net->FeatureNodes().push_back(input);

size_t frameDim = globalMean.GetNumRows();
@@ -2381,10 +2378,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
wstring nameOfPlus = nameOfTimes + L"+" + nameOfB;
wstring nameOfH = msra::strfun::wstrprintf(L"H%d", i + 1);

w = builder.Parameter(wts.GetNumRows(), wts.GetNumCols(), nameOfW);
w = builder.CreateLearnableParameter(nameOfW, wts.GetNumRows(), wts.GetNumCols());
w->Value().SetValue(wts);

b = builder.Parameter(bias.GetNumRows(), 1, nameOfB);
b = builder.CreateLearnableParameter(nameOfB, bias.GetNumRows(), 1);
b->Value().SetValue(bias);

if (layerType == "perceptron")
@@ -2412,7 +2409,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
RuntimeError("Error reading DBN file - did not find expected tag ENET\n");
//size_t outputLayerSize = m_layerSizes[m_layerSizes.size()-1];

label = builder.Input(m_outputLayerSize, mbSize, L"labels");
label = builder.CreateInputNode(L"labels", m_outputLayerSize);

if (layerType == "perceptron") // complete network
{
@@ -2446,9 +2443,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
wstring nameOfPlus = nameOfTimes + L"+" + nameOfB;
wstring nameOfH = msra::strfun::wstrprintf(L"H%d", i + 1);

w = builder.Parameter(outputLayerSize, penultimateSize, nameOfW);
w = builder.CreateLearnableParameter(nameOfW, outputLayerSize, penultimateSize);
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
b = builder.Parameter(outputLayerSize, 1, nameOfB);
b = builder.CreateLearnableParameter(nameOfB, outputLayerSize, 1);
output = builder.Plus(builder.Times(w, input, nameOfTimes), b, nameOfPlus);
m_net->RenameNode(output, L"HLast");
@@ -256,41 +256,41 @@ namespace Microsoft { namespace MSR { namespace CNTK {

ComputationNetworkPtr BuildSimpleDNN();

ComputationNetworkPtr BuildSimpleRNN(size_t mbSize = 1);
ComputationNetworkPtr BuildSimpleRNN();

ComputationNetworkPtr BuildClassEntropyNetwork(size_t mbSize = 1);
ComputationNetworkPtr BuildClassEntropyNetwork();

ComputationNodePtr BuildLSTMComponent(unsigned long &randomSeed, size_t mbSize, size_t iLayer, size_t inputDim, size_t outputDim, ComputationNodePtr input);
ComputationNodePtr BuildLSTMComponent(unsigned long &randomSeed, size_t iLayer, size_t inputDim, size_t outputDim, ComputationNodePtr input);

ComputationNodePtr BuildLSTMNodeComponent(ULONG &randomSeed, size_t iLayer, size_t inputDim, size_t outputDim, ComputationNodePtr input);

ComputationNodePtr BuildLSTMComponentWithMultiInputs(ULONG &randomSeed, size_t mbSize, size_t iLayer, const vector<size_t>& inputDim, size_t outputDim, const vector<ComputationNodePtr>& inputObs, bool inputWeightSparse = false);
ComputationNodePtr BuildLSTMComponentWithMultiInputs(ULONG &randomSeed, size_t iLayer, const vector<size_t>& inputDim, size_t outputDim, const vector<ComputationNodePtr>& inputObs, bool inputWeightSparse = false);

ComputationNodePtr BuildDirectConnect(unsigned long &randomSeed, size_t mbSize, size_t iLayer, size_t inputDim, size_t outputDim, ComputationNodePtr input, ComputationNodePtr toNode);
ComputationNodePtr BuildDirectConnect(unsigned long &randomSeed, size_t iLayer, size_t inputDim, size_t outputDim, ComputationNodePtr input, ComputationNodePtr toNode);

ComputationNetworkPtr BuildLogBilinearNetworkFromDescription(size_t mbSize = 1);
ComputationNetworkPtr BuildLogBilinearNetworkFromDescription();

ComputationNetworkPtr BuildNeuralProbNetworkFromDescription(size_t mbSize = 1);
ComputationNetworkPtr BuildNeuralProbNetworkFromDescription();

ComputationNetworkPtr BuildLSTMNetworkFromDescription(size_t mbSize = 1);
ComputationNetworkPtr BuildLSTMNetworkFromDescription();

ComputationNetworkPtr BuildSeqTrnLSTMNetworkFromDescription(size_t mbSize = 1);
ComputationNetworkPtr BuildSeqTrnLSTMNetworkFromDescription();

ComputationNetworkPtr BuildLSTMEncoderNetworkFromDescription(size_t mbSize = 1);
ComputationNetworkPtr BuildLSTMEncoderNetworkFromDescription();

ComputationNetworkPtr BuildUnidirectionalLSTMNetworksFromDescription(size_t mbSize = 1);
ComputationNetworkPtr BuildUnidirectionalLSTMNetworksFromDescription();

ComputationNetworkPtr BuildBiDirectionalLSTMNetworksFromDescription(size_t mbSize = 1);
ComputationNetworkPtr BuildBiDirectionalLSTMNetworksFromDescription();

ComputationNetworkPtr BuildCLASSLSTMNetworkFromDescription(size_t mbSize = 1);
ComputationNetworkPtr BuildCLASSLSTMNetworkFromDescription();

ComputationNetworkPtr BuildConditionalLSTMNetworkFromDescription(size_t mbSize = 1);
ComputationNetworkPtr BuildConditionalLSTMNetworkFromDescription();

ComputationNetworkPtr BuildNCELSTMNetworkFromDescription(size_t mbSize = 1);
ComputationNetworkPtr BuildNCELSTMNetworkFromDescription();

ComputationNetworkPtr BuildAlignmentForwardDecoderNetworkFromDescription(ComputationNetwork* encoderNet, size_t mbSize = 1);
ComputationNetworkPtr BuildAlignmentForwardDecoderNetworkFromDescription(ComputationNetwork* encoderNet);

ComputationNetworkPtr BuildAlignmentDecoderNetworkFromDescription(ComputationNetwork* encoderNet, size_t mbSize = 1);
ComputationNetworkPtr BuildAlignmentDecoderNetworkFromDescription(ComputationNetwork* encoderNet);

//layer is 0 based
ComputationNodePtr ApplyNonlinearFunction(ComputationNodePtr input, const size_t layer, const std::wstring nodeName = L"");
@@ -15,9 +15,12 @@
#include "ConvolutionalNodes.h"
#include "NonlinearityNodes.h"
#include "ReshapingNodes.h"
#include "TensorShape.h"

namespace Microsoft { namespace MSR { namespace CNTK {

using namespace std;

template<class ElemType>
void SynchronousNodeEvaluator<ElemType>::Evaluate(NDLNode<ElemType>* node, const wstring& baseName, const NDLPass pass)
{
@@ -58,48 +61,34 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
}

if (OperationNameOf(InputValue) == cnNodeType)
if (OperationNameOf(InputValue) == cnNodeType || OperationNameOf(SparseInputValue) == cnNodeType)
{
if (parameter.size() < 1 || parameter.size() > 2)
RuntimeError("%ls should have 1 or 2 parameters[rows, [cols=1]].", cnNodeType.c_str());
bool isSparse = (OperationNameOf(SparseInputValue) == cnNodeType);
if (parameter.size() < 1)
RuntimeError("%ls should have 1 or more parameters (tensor dimensions, e.g. [vecdim] or [rows, cols]).", cnNodeType.c_str());

if (pass == ndlPassInitial)
{
// evaluate only scalar parameters
vector<void*> params = EvaluateParameters(node, baseName, 0, parameter.size(), pass);
size_t rows = ((NDLNode<ElemType>*)params[0])->GetScalar();
size_t cols = params.size() > 1 ? ((NDLNode<ElemType>*)params[1])->GetScalar() : 1;
size_t i = 0;
auto tensorShape = ProcessTensorShapeParameters(node, params, i, /*isImage=*/false, cnNodeType);

// first look for this node already existing in the network
// BUGBUG: How does this set the dimensions then?
if (m_net->NodeNameExists(name))
nodePtr = dynamic_pointer_cast<ComputationNode<ElemType>>(m_net->GetNodeFromName(name));
else if (isSparse)
nodePtr = builder.CreateSparseInputNode(name, tensorShape);
else
nodePtr = builder.CreateInputNode(name, rows, cols);
nodePtr = builder.CreateInputNode (name, tensorShape);
}
}
else if (OperationNameOf(SparseInputValue) == cnNodeType)
else if (cnNodeType == L"ImageInput" || cnNodeType == L"SparseImageInput")
{
if (parameter.size() < 1 || parameter.size() > 2)
RuntimeError("%ls should have 1 or 2 parameters[rows, [cols=1]].", cnNodeType.c_str());

if (pass == ndlPassInitial)
{
// evaluate only scalar parameters
vector<void*> params = EvaluateParameters(node, baseName, 0, parameter.size(), pass);
size_t rows = ((NDLNode<ElemType>*)params[0])->GetScalar();
size_t cols = params.size() > 1 ? ((NDLNode<ElemType>*)params[1])->GetScalar() : 1;

// first look for this node already existing in the network
if (m_net->NodeNameExists(name))
nodePtr = dynamic_pointer_cast<ComputationNode<ElemType>>(m_net->GetNodeFromName(name));
else
nodePtr = builder.CreateSparseInputNode(name, rows, cols);
}
}
else if (cnNodeType == L"ImageInput")
{
if (parameter.size() < 3 || parameter.size() > 4)
RuntimeError("%ls should have 3 or 4 parameters[imageWidth, imageHeight, imageChannels, [numImages=1]].", cnNodeType.c_str());
bool isSparse = (cnNodeType == L"SparseImageInput");
if (parameter.size() < 3 || parameter.size() > 4) // we allow 4 for legacy (numImages, was ignored)
RuntimeError("%ls should have 3 parameters[imageWidth, imageHeight, imageChannels].", cnNodeType.c_str());

if (pass == ndlPassInitial)
{
@@ -108,44 +97,39 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t imageWidth = ((NDLNode<ElemType>*)params[0])->GetScalar();
size_t imageHeight = ((NDLNode<ElemType>*)params[1])->GetScalar();
size_t imageChannels = ((NDLNode<ElemType>*)params[2])->GetScalar();
size_t numImages = parameter.size() > 3 ? ((NDLNode<ElemType>*)params[3])->GetScalar() : 1;
ImageLayoutKind imageLayoutKind = ImageLayoutKindFrom(node->GetOptionalParameter("imageLayout", "HWC"));

nodePtr = builder.CreateInputNode(name, ImageLayoutWHC(imageWidth, imageHeight, imageChannels), numImages);
if (isSparse)
nodePtr = builder.CreateSparseInputNode(name, ImageDimensions::AsTensorShape(imageWidth, imageHeight, imageChannels, imageLayoutKind));
else
nodePtr = builder.CreateInputNode (name, ImageDimensions::AsTensorShape(imageWidth, imageHeight, imageChannels, imageLayoutKind));
}
}
else if (cnNodeType == L"SparseImageInput")
else if (OperationNameOf(LearnableParameter) == cnNodeType || cnNodeType == L"ImageParameter")
{
if (parameter.size() < 3 || parameter.size() > 4)
RuntimeError("%ls should have 3 or 4 parameters[imageWidth, imageHeight, imageChannels, [numImages=1]].", cnNodeType.c_str());
bool isImage = (cnNodeType == L"ImageParameter");
if (!isImage)
{
if (parameter.size() < 1)
RuntimeError("%ls should have 1 or more parameters (tensor dimensions, e.g. [vecdim] or [rows, cols]) plus other optional parameters (needGradient=[true|false], init=[uniform|gaussian|fixedvalue], initValueScale=[1|float], value=[0|float]).", cnNodeType.c_str());
}
else
{
if (parameter.size() < 3)
RuntimeError("%ls should have 3 parameters [imageWidth, imageHeight, imageChannels] plus other optional parameters (needGradient=[true|false], init=[uniform|gaussian|fixedvalue], initValueScale=[1|float], value=[0|float]).", cnNodeType.c_str());
}

if (pass == ndlPassInitial)
{
// evaluate only scalar parameters
vector<void*> params = EvaluateParameters(node, baseName, 0, parameter.size(), pass);
size_t imageWidth = ((NDLNode<ElemType>*)params[0])->GetScalar();
size_t imageHeight = ((NDLNode<ElemType>*)params[1])->GetScalar();
size_t imageChannels = ((NDLNode<ElemType>*)params[2])->GetScalar();
size_t numImages = parameter.size() > 3 ? ((NDLNode<ElemType>*)params[3])->GetScalar() : 1;

nodePtr = builder.CreateSparseInputNode(name, ImageLayoutWHC(imageWidth, imageHeight, imageChannels), numImages);
}
}
else if (OperationNameOf(LearnableParameter) == cnNodeType)
{
if (parameter.size() < 1 || parameter.size() > 2)
RuntimeError("%ls should have 1 or 2 parameters[rows, [cols=1]] plus other optional parameters (needGradient=[true|false], init=[uniform|gaussian|fixedvalue], initValueScale=[1|float], value=[0|float]).", cnNodeType.c_str());

if (pass == ndlPassInitial)
{
// evaluate only scalar parameters
vector<void*> params = EvaluateParameters(node, baseName, 0, parameter.size(), pass);
size_t rows = ((NDLNode<ElemType>*)params[0])->GetScalar();
size_t cols = params.size() > 1 ? ((NDLNode<ElemType>*)params[1])->GetScalar() : 1;

size_t i = 0;
auto tensorShape = ProcessTensorShapeParameters(node, params, i, isImage, cnNodeType);
if (isImage)
tensorShape.AppendInPlace(3, 1); // this goes into the column dimension
bool needGradient = node->GetOptionalParameter("needGradient", "true");

nodePtr = builder.CreateLearnableParameter(name, rows, cols);

nodePtr = builder.CreateLearnableParameter(name, tensorShape);
nodePtr->SetParameterUpdateRequired(needGradient);
}
else if (pass == ndlPassFinal)
@@ -305,7 +289,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
nodePtr->SetParameterUpdateRequired(needGradient);
}
}
else if (cnNodeType == OperationNameOf(ReshapeNode))
else if (cnNodeType == L"Reshape"/*OperationNameOf(ReshapeNode)*/)
{
if (parameter.size() < 2 || parameter.size() > 5)
RuntimeError("Reshape should have two to five parameters. Usage: Reshape(origNodeName, numRows, [imageWidth=], [imageHeight=], [imageChannels=]).");
@@ -323,18 +307,19 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t img_channels = node->GetOptionalParameter("imageChannels", "0");

bool needGradient = node->GetOptionalParameter("needGradient", "false");
nodePtr = builder.Reshape(NULL, num_rows, ImageLayoutWHC(img_width, img_height, img_channels), name);
nodePtr = builder.DeprecatedReshape(NULL, num_rows, ImageDimensions::AsTensorShape(img_width, img_height, img_channels, ImageLayoutKind::HWC/*legacy*/), name); // BUGBUG: use a tensor descriptor instead
nodePtr->SetParameterUpdateRequired(needGradient);
}
}
else if (cnNodeType == OperationNameOf(PastValueNode) ||
cnNodeType == OperationNameOf(FutureValueNode))
{
if (parameter.size() <2 || parameter.size() >3)
RuntimeError("PastValue or FutureValue should have two to three fixed parameters. Usage: PastValue(rows, [cols], m, [timeStep=1, defaultPastValue=0.1]).");
if (parameter.size() < 2 || parameter.size() > 3) // we allow 3 for legacy (cols parameter which is now unused)
RuntimeError("PastValue or FutureValue should have two to three fixed parameters. Usage: PastValue(rows, input, [timeStep=1, defaultPastValue=0.1]).");
// TODO: allow a tensor descriptor. Or allow 0 (inference). Maybe already supported--check this.

nodeParamCount = 1;
nodeParamStart = parameter.size() > 2?2:1;
nodeParamCount = 1; // number of inputs
nodeParamStart = parameter.size() > 2?2:1; // index of input

if (pass == ndlPassInitial)
{
@@ -342,24 +327,22 @@ namespace Microsoft { namespace MSR { namespace CNTK {
vector<void*> params = EvaluateParameters(node, baseName, 0, parameter.size(), pass);
size_t rows = ((NDLNode<ElemType>*)params[0])->GetScalar();
// if we have three parameters the second is columns
size_t cols = parameter.size() > 2 ? ((NDLNode<ElemType>*)params[1])->GetScalar() : 1;
// ignore legacy size_t cols = parameter.size() > 2 ? ((NDLNode<ElemType>*)params[1])->GetScalar() : 1;

bool needGradient = node->GetOptionalParameter("needGradient", "false");
//bool needGradient = node->GetOptionalParameter("needGradient", "false"); // TODO: what's this for?
float defaultHiddenActivity = node->GetOptionalParameter("defaultHiddenActivity", "0.1"); // TODO: parameter should be called 'defaultHiddenActivation'

//for backward compatibility we check timeStep first
// for backward compatibility we check 'timeStep' first
size_t timeStep = node->GetOptionalParameter("timeStep", "1");
if (timeStep == 1)
{
timeStep = node->GetOptionalParameter("delayTime", "1");
}

if (cnNodeType == OperationNameOf(PastValueNode))
nodePtr = builder.PastValue(NULL, defaultHiddenActivity, rows, cols, timeStep, name);
nodePtr = builder.PastValue(NULL, defaultHiddenActivity, rows, timeStep, name);
else
nodePtr = builder.FutureValue(NULL, defaultHiddenActivity, rows, cols, timeStep, name);
nodePtr = builder.FutureValue(NULL, defaultHiddenActivity, rows, timeStep, name);

nodePtr->SetParameterUpdateRequired(needGradient); // TODO: what's this for?
//nodePtr->SetParameterUpdateRequired(needGradient); // TODO: what's this for?
}
}
else if (cnNodeType == OperationNameOf(ConvolutionNode))
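
[Editor's note: the hunk above keeps honoring the legacy 'delayTime' parameter when 'timeStep' is left at its default. A minimal standalone sketch of that fallback follows; the map-based lookup is illustrative only, as the real code uses NDLNode::GetOptionalParameter.]

#include <cstddef>
#include <map>
#include <string>

// Hypothetical stand-in for NDLNode::GetOptionalParameter: returns the default
// when the option is absent.
static size_t GetOpt(const std::map<std::string, size_t>& opts, const char* name, size_t dflt)
{
    auto it = opts.find(name);
    return it == opts.end() ? dflt : it->second;
}

static size_t ResolveTimeStep(const std::map<std::string, size_t>& opts)
{
    size_t timeStep = GetOpt(opts, "timeStep", 1); // preferred spelling
    if (timeStep == 1)                             // 1 is also the default, so fall back to the legacy name
        timeStep = GetOpt(opts, "delayTime", 1);
    return timeStep;
}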
@@ -383,16 +366,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t outputChannels = ((NDLNode<ElemType>*)params[id++])->GetScalar();
size_t horizontalSubsample = ((NDLNode<ElemType>*)params[id++])->GetScalar();
size_t verticalSubsample = ((NDLNode<ElemType>*)params[id++])->GetScalar();

assert (id == 5);

//optional
// optional
ImageLayoutKind imageLayoutKind = ImageLayoutKindFrom(node->GetOptionalParameter("imageLayout", "HWC"));
bool zeroPadding = node->GetOptionalParameter("zeroPadding", "false");
size_t maxTempMemSizeInSamples = node->GetOptionalParameter("maxTempMemSizeInSamples", "0");

nodePtr = builder.Convolution(NULL, NULL, kernelWidth, kernelHeight, outputChannels,
horizontalSubsample, verticalSubsample, zeroPadding, name, maxTempMemSizeInSamples);
horizontalSubsample, verticalSubsample, imageLayoutKind, zeroPadding, maxTempMemSizeInSamples, name);
}
}
else if (cnNodeType == OperationNameOf(MaxPoolingNode))
@@ -415,11 +397,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t windowHeight = ((NDLNode<ElemType>*)params[id++])->GetScalar();
size_t horizontalSubsample = ((NDLNode<ElemType>*)params[id++])->GetScalar();
size_t verticalSubsample = ((NDLNode<ElemType>*)params[id++])->GetScalar();

assert (id == 4);

ImageLayoutKind imageLayoutKind = ImageLayoutKindFrom(node->GetOptionalParameter("imageLayout", "HWC"));

nodePtr = builder.MaxPooling(NULL, /*inputWidth,inputHeight, channels,*/windowWidth, windowHeight,
horizontalSubsample, verticalSubsample, name);
horizontalSubsample, verticalSubsample, imageLayoutKind, name);
}
}
else if (cnNodeType == OperationNameOf(AveragePoolingNode))
@@ -442,11 +425,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t windowHeight = ((NDLNode<ElemType>*)params[id++])->GetScalar();
size_t horizontalSubsample = ((NDLNode<ElemType>*)params[id++])->GetScalar();
size_t verticalSubsample = ((NDLNode<ElemType>*)params[id++])->GetScalar();
assert(id == 4);

assert (id == 4);
ImageLayoutKind imageLayoutKind = ImageLayoutKindFrom(node->GetOptionalParameter("imageLayout", "HWC"));

nodePtr = builder.AveragePooling(NULL, /*inputWidth,inputHeight, channels,*/windowWidth, windowHeight,
horizontalSubsample, verticalSubsample, name);
horizontalSubsample, verticalSubsample, imageLayoutKind, name);
}
}
else if (cnNodeType == OperationNameOf(BatchNormalizationNode))
@@ -543,6 +527,32 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
}

// ProcessTensorShapeParameters - assume positional parameters starting from position i are tensor dimensions--parse those.
// If isImage, then this must be a 3D tensor, which is interpreted as (W,H,C); the optional parameter 'imageLayout' says how.
template<class ElemType>
TensorShape SynchronousNodeEvaluator<ElemType>::ProcessTensorShapeParameters(const NDLNode<ElemType>* node, const vector<void*> & params, size_t & i, bool isImage, const wstring & cnNodeType/*for error messages only*/)
{
// gather dims
vector<size_t> dims;
dims.push_back(((NDLNode<ElemType>*)params[i])->GetScalar()); // first is mandatory
for (i++; i < params.size(); i++)
dims.push_back(((NDLNode<ElemType>*)params[i])->GetScalar());

// turn into tensor
TensorShape tensorShape(dims);

// if image then interpret as W, H, C with layout according to optional imageLayout parameter
if (isImage)
{
if (dims.size() != 3)
RuntimeError("%ls should have 3 parameters [width, height, numChannels].", cnNodeType.c_str());
ImageLayoutKind imageLayoutKind = ImageLayoutKindFrom(node->GetOptionalParameter("imageLayout", "HWC"));
tensorShape = ImageDimensions::AsTensorShape(tensorShape[0], tensorShape[1], tensorShape[2], imageLayoutKind);
}

return tensorShape;
}

template class SynchronousExecutionEngine<float>;
template class SynchronousExecutionEngine<double>;
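
[Editor's note: a small sketch (not part of the commit) of the dimension gathering that ProcessTensorShapeParameters performs, with plain size_t values standing in for the evaluated NDL scalar parameters.]

#include <cstddef>
#include <vector>

// Collects all positional parameters from index i onward as tensor dimensions,
// mirroring the loop above; e.g. yields {13, 42} for ParameterTensor(13, 42).
static std::vector<size_t> GatherTensorDims(const std::vector<size_t>& positionals, size_t& i)
{
    std::vector<size_t> dims;
    dims.push_back(positionals[i]);          // the first dimension is mandatory
    for (i++; i < positionals.size(); i++)   // every further positional adds one more axis
        dims.push_back(positionals[i]);
    return dims;
}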
@@ -290,7 +290,7 @@ public:
{
fprintf(stderr, "'multiSeq' tag is defunct.\n");
}
else if (!_strnicmp(value.c_str(), "eval", 4)) // only compare the first 4 characters
else if (!_strnicmp(value.c_str(), "eval", 4)) // only compare the first 4 characters. Yikes!!
{
SetOutputNode(m_net->EvaluationNodes(), compNode);
}
@@ -326,9 +326,10 @@ public:
return nullptr;
}

virtual ~SynchronousNodeEvaluator()
{
}
virtual ~SynchronousNodeEvaluator() { }

protected:
TensorShape ProcessTensorShapeParameters(const NDLNode<ElemType>* node, const vector<void*> & params, size_t & i, bool isImage, const wstring & cnNodeType/*for error messages only*/);

private:
ComputationNetworkPtr m_net;
@@ -489,7 +489,7 @@ namespace Microsoft { namespace MSR { namespace ScriptableObjects {
std::vector<C> res;
res.reserve(GetSize(Fail));
for (const auto & val : values)
res.push_back(val);
res.push_back(val.ResolveValue()); // resolve upon access
return res;
}
};
@@ -196,7 +196,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
LogicError("AddSequence: Sequence added to an MBLayout must overlap with minibatch.");

// remember it
#ifdef _DEBUG
#if 0//def _DEBUG
auto cap = m_sequences.capacity(); // Some sanity check for debugging a speed regression. This should only show up during the first minibatches, and growing only.
m_sequences.push_back(seqDesc);
if (cap != m_sequences.capacity())
@@ -1,6 +1,6 @@
// DataTensor.h -- tensor descriptor that describes the inner structure of data vectors
// TensorShape.h -- tensor descriptor that describes the inner structure of data vectors
//
// <copyright file="Sequences.h" company="Microsoft">
// <copyright file="TensorShape.h" company="Microsoft">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//
@@ -90,6 +90,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
T m_data[12];
size_t m_size;
#ifdef _DEBUG
void DebugWipe() { memset(m_data, 0, sizeof(m_data)); } // initialize to 0 to make it look prettier in the debugger
#else
void DebugWipe() { }
#endif
public:
size_t capacity() const { return _countof(m_data); }
size_t size() const { return m_size; }

@@ -103,12 +108,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template<class ITER>
void assign(ITER beg, const ITER & end) { clear(); append(beg,end); }
void operator=(const SmallVector & other) { m_size = other.m_size; memcpy(m_data, other.m_data, other.m_size * sizeof(T)); }
SmallVector(const SmallVector & other) { *this = other; }
SmallVector(size_t sz, const T & val) { assign(sz, val); }
SmallVector(const SmallVector & other) { DebugWipe(); *this = other; }
SmallVector(size_t sz, const T & val) { DebugWipe(); assign(sz, val); }
SmallVector(size_t sz) : SmallVector(sz, 0) { }
SmallVector() : SmallVector(0) { }
SmallVector(const std::vector<T> & v) { assign(v.begin(), v.end()); }
SmallVector(const std::initializer_list<T> & l) { assign(l.begin(), l.end()); }
SmallVector(const std::vector<T> & v) { DebugWipe(); assign(v.begin(), v.end()); }
SmallVector(const std::initializer_list<T> & l) { DebugWipe(); assign(l.begin(), l.end()); }
bool operator==(const SmallVector & other) const { return size() == other.size() && !memcmp(data(), other.data(), other.m_size * sizeof(T)); }
bool operator!=(const SmallVector & other) const { return !operator==(other); } // duh
T operator[](size_t i) const { if (i >= size()) LogicError("SmallVector: index overflow"); return m_data[i]; }
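
[Editor's note: an illustrative snippet, assuming the SmallVector class shown above, of its fixed-capacity, memcpy-copied design.]

#include <cassert>
#include <vector>

void SmallVectorExample()
{
    SmallVector<size_t> v(std::vector<size_t>{ 28, 28, 3 });
    assert(v.size() == 3 && v.capacity() == 12); // storage is an in-place 12-element array
    SmallVector<size_t> w(v);                    // copy runs through memcpy of m_data; DebugWipe() keeps the debugger view clean
    assert(w == v);
}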
@@ -203,28 +208,28 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
}

void Load(File& fstream)
void Load(File& fstream, bool acceptLegacyFormat = false)
{
// format: uint32_t n, dim[0], dim[1], ..., dim[n-1]
// We are also able to read (but not write) an older format, which stores 3-dimensional tensors as size_t W, H, C
uint32_t n, dim;
fstream >> n >> dim;
if (dim) // heuristic to detect the old format. Old format stores a size_t, i.e. the second uint32_t is 0 (no dimensions are > 4G)
uint32_t rank, dim0;
fstream >> rank >> dim0;
if (!acceptLegacyFormat || dim0 != 0) // heuristic to detect the old format. Old format stores a size_t, i.e. the second uint32_t is 0 (no dimensions are > 4G)
{
m_dims.resize(n);
m_dims[0] = dim;
for (size_t i = 1; i < n; i++)
m_dims.resize(rank);
m_dims[0] = dim0;
for (size_t i = 1; i < rank; i++)
{
fstream >> dim;
m_dims[i] = dim;
fstream >> dim0;
m_dims[i] = dim0;
}
assert(n == m_dims.size());
assert(rank == m_dims.size());
}
else // detected the old size_t W, H, C format
{
m_dims.resize(3); // current format is hard-coded for 3, for back compat
m_dims[1] = n;
fstream >> m_dims[2] >> m_dims[0]; // currently stored in order W, H, C. TODO: general tensor format will be different
m_dims.resize(3);
m_dims[1] = rank;
fstream >> m_dims[2] >> m_dims[0]; // stored in order C, W, H
}
InitAsNoSlice();
}
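
[Editor's note: a sketch of why the 'dim0 != 0' test above distinguishes the two on-disk layouts. It assumes a little-endian build and that File streams raw binary values; the helper functions are hypothetical.]

#include <cstdint>
#include <cstdio>

// New format: uint32_t rank, then rank uint32_t dims. dims[0] is never 0,
// so the second uint32_t read back is nonzero.
static void WriteNewFormat(FILE* f, const uint32_t dims[], uint32_t rank)
{
    fwrite(&rank, sizeof(rank), 1, f);
    fwrite(dims, sizeof(uint32_t), rank, f);
}

// Legacy format: three size_t values. For any dimension below 4G the high
// 32 bits are 0, so the second uint32_t read back is 0 -- the heuristic's cue.
static void WriteLegacyFormat(FILE* f, size_t v1, size_t v2, size_t v3)
{
    fwrite(&v1, sizeof(v1), 1, f);
    fwrite(&v2, sizeof(v2), 1, f);
    fwrite(&v3, sizeof(v3), 1, f);
}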
@@ -243,13 +248,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
const SmallVector<size_t> & GetDims() const { return m_dims; } // get all, e.g. for logging or for constructing derived tensors with edited dimensions
const SmallVector<ptrdiff_t> & GetStrides() const { return m_strides; }

// interpretation as an image tensor
size_t GetNumChannels() const { if (m_dims.empty()) return 0; else return m_dims.size() > 0 ? m_dims[0] : 1; }
size_t GetWidth() const { if (m_dims.empty()) return 0; else return m_dims.size() > 1 ? m_dims[1] : 1; }
size_t GetHeight() const { if (m_dims.empty()) return 0; else return m_dims.size() > 2 ? m_dims[2] : 1; }
// heuristics used for pretty-printing
// TODO: This will go away.
bool IsInputAnImage() const { return GetRank() == 3 && (GetWidth() != 1 || GetNumChannels() != 1); }
// legacy helper function for RowSliceNode. Will go away.
bool IsVectorStoredAsImage() const { return GetRank() == 3 && m_dims[0] == 1 && m_dims[1] == 1; }

// indexing
@@ -316,8 +315,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// m_dims = I 1 J K
// m_strides = 1 I I I*J
// dropping the second dimension
// m_dims = I % J K
// m_strides = 1 % I I*J
// m_dims = I J K
// m_strides = 1 I I*J
m_dims[j] = m_dims[k];
m_strides[j] = m_strides[k];
j++;
@@ -442,15 +441,61 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// TODO: Does the same trick work for 2D images?
};

// When constructing an image tensor with the usual W, H, C format, use the following function instead.
// This will sort the three parameters into the correct order.
// BUGBUG: at several places, a comment says "after multiplication the structure is lost" and the vector dimension
// is set as the image height. However, the image height is actually the wrong dimension since images are assumed transposed.
// This will get fixed once we get more complete arbitrary tensor support throughout, including better-defined inference rules.
static inline TensorShape ImageLayoutWHC(size_t width, size_t height, size_t channels)
// image layouts used in CNTK
// Nodes that do semantic interpretation of width, height, channel information must know which index they are in.
// Eventually this can go away once we switch completely to cudnn layout.
// The cudnn layout is actually our layout in order W,H,C.
enum ImageLayoutKind
{
return TensorShape(channels, width, height);
HWC, // legacy; default for NDL
CHW // cudnn; default for BrainScript
};
static inline std::string ToString(ImageLayoutKind imageLayoutKind)
{
if (imageLayoutKind == ImageLayoutKind::CHW) return "CHW";
else if (imageLayoutKind == ImageLayoutKind::HWC) return "HWC";
else LogicError("ImageLayout: Invalid ImageLayoutKind");
}
// TODO: we need a constructor from config; that will allow us to generalize
static inline ImageLayoutKind ImageLayoutKindFrom(const wstring & s)
{
if (s == L"CHW" || s == L"cudnn") return ImageLayoutKind::CHW;
else if (s == L"HWC" || s == L"legacy") return ImageLayoutKind::HWC;
else InvalidArgument("ImageLayoutKindFrom: Unknown ImageLayoutKind '%ls', must be 'CHW' (cudnn) or 'HWC' (CNTK legacy)", s.c_str());
}
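
[Editor's note: illustrative calls showing the spellings the parser above accepts.]

ImageLayoutKind a = ImageLayoutKindFrom(L"CHW");    // also accepts L"cudnn"
ImageLayoutKind b = ImageLayoutKindFrom(L"legacy"); // alias for L"HWC"
// any other string raises InvalidArgument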
// interpret TensorShape as an image descriptor
// considering that we support two ways of storing images
struct ImageDimensions
{
size_t m_width, m_height, m_numChannels;
// interpret TensorShape as image
ImageDimensions(const TensorShape & shape, ImageLayoutKind imageLayoutKind)
{
if (shape.GetRank() != 3)
InvalidArgument("Convolution operation currently only supports 1D or 2D convolution on 3D tensors.");
if (imageLayoutKind == ImageLayoutKind::CHW)
{
m_width = shape[0];
m_height = shape[1];
m_numChannels = shape[2];
}
else if (imageLayoutKind == ImageLayoutKind::HWC)
{
m_width = shape[1];
m_height = shape[2];
m_numChannels = shape[0];
}
else LogicError("WHC: Invalid ImageLayoutKind");
}
ImageDimensions(size_t width, size_t height, size_t numChannels) : m_width(width), m_height(height), m_numChannels(numChannels) {}
// interpret image as TensorShape
static TensorShape AsTensorShape(size_t width, size_t height, size_t numChannels, ImageLayoutKind imageLayoutKind/* = ImageLayoutKind::HWC*/)
{
if (imageLayoutKind == ImageLayoutKind::CHW) return TensorShape(width, height, numChannels);
else if (imageLayoutKind == ImageLayoutKind::HWC) return TensorShape(numChannels, width, height);
else LogicError("ImageLayout: Invalid ImageLayoutKind");
}
TensorShape AsTensorShape(ImageLayoutKind imageLayoutKind) { return AsTensorShape(m_width, m_height, m_numChannels, imageLayoutKind); }
};

}}}
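
[Editor's note: a round-trip example of the struct above under the legacy HWC layout; a (W,H,C) = (32,32,3) image maps to tensor dims (3,32,32) and back.]

TensorShape t = ImageDimensions::AsTensorShape(32, 32, 3, ImageLayoutKind::HWC); // dims (3, 32, 32): channels first
ImageDimensions d(t, ImageLayoutKind::HWC);
// d.m_width == 32, d.m_height == 32, d.m_numChannels == 3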
@@ -251,7 +251,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Base::Load(fstream, modelVersion);
fstream >> m_hasComputed;
LoadValue(fstream);
}
// Note: This loses the sample layout, but that is recovered by Validate().
}

virtual void DumpNodeInfo(const bool printValues, File& fstream) const override
{
@@ -654,8 +654,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
for (auto nodeIter = convolutionNodes.begin(); nodeIter != convolutionNodes.end(); nodeIter++)
{
auto node = dynamic_pointer_cast<ConvolutionNode<float>>(*nodeIter);
node->SetmMaxTempMemSizeInSamples(maxTempMemSizeInSamples);
auto nodef = dynamic_pointer_cast<ConvolutionNode<float>>(*nodeIter);
if (nodef)
nodef->SetmMaxTempMemSizeInSamples(maxTempMemSizeInSamples);
auto noded = dynamic_pointer_cast<ConvolutionNode<double>>(*nodeIter);
if (noded)
noded->SetmMaxTempMemSizeInSamples(maxTempMemSizeInSamples);
}
}
}
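
[Editor's note: the hunk above fixes an unconditional dereference of the float-typed cast, which would fail on a double-precision network. A self-contained sketch of the corrected pattern, with made-up node types.]

#include <memory>

struct Base { virtual ~Base() = default; };
struct FloatNode  : Base { void Configure() { } };
struct DoubleNode : Base { void Configure() { } };

// dynamic_pointer_cast yields an empty shared_ptr on a type mismatch, so each
// precision is probed and only the matching node is configured.
static void ConfigureNode(const std::shared_ptr<Base>& node)
{
    if (auto f = std::dynamic_pointer_cast<FloatNode>(node))  f->Configure();
    if (auto d = std::dynamic_pointer_cast<DoubleNode>(node)) d->Configure();
}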
@@ -35,7 +35,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// please keep this table sorted
if (nodeType == OperationNameOf(CRFNode)) return New<CRFNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(ClassBasedCrossEntropyWithSoftmaxNode))return New<ClassBasedCrossEntropyWithSoftmaxNode<ElemType>>(forward<_Types>(_Args)...);
#if 0// change once we no longer see a perf hit to #ifdef ENABLE_TENSORVIEW
#ifdef ENABLE_BROADCASTING_ELEMENTTIMES
else if (nodeType == L"ColumnElementTimes") return New<ElementTimesNode<ElemType>>(forward<_Types>(_Args)...);
#else
else if (nodeType == OperationNameOf(ColumnElementTimesNode)) return New<ColumnElementTimesNode<ElemType>>(forward<_Types>(_Args)...);

@@ -76,7 +76,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
else if (nodeType == OperationNameOf(ReconcileMBLayoutNode)) return New<ReconcileMBLayoutNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(RectifiedLinearNode)) return New<RectifiedLinearNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(ReshapeNode)) return New<ReshapeNode<ElemType>>(forward<_Types>(_Args)...);
#if 0// change once we no longer see a perf hit to #ifdef ENABLE_TENSORVIEW
#ifdef ENABLE_BROADCASTING_ELEMENTTIMES
else if (nodeType == L"RowElementTimes") return New<ElementTimesNode<ElemType>>(forward<_Types>(_Args)...);
#else
else if (nodeType == OperationNameOf(RowElementTimesNode)) return New<RowElementTimesNode<ElemType>>(forward<_Types>(_Args)...);
@@ -85,7 +85,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
else if (nodeType == OperationNameOf(DiagonalNode)) return New<DiagonalNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(RowSliceNode)) return New<RowSliceNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(RowStackNode)) return New<RowStackNode<ElemType>>(forward<_Types>(_Args)...);
#if 0// change once we no longer see a perf hit to #ifdef ENABLE_TENSORVIEW
#ifdef ENABLE_BROADCASTING_ELEMENTTIMES
else if (nodeType == L"Scale") return New<ElementTimesNode<ElemType>>(forward<_Types>(_Args)...);
#else
else if (nodeType == OperationNameOf(ScaleNode)) return New<ScaleNode<ElemType>>(forward<_Types>(_Args)...);

@@ -107,6 +107,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
else if (nodeType == L"Delay") return New<PastValueNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == L"PerDimMeanVarNormalizationNode") return New<PerDimMeanVarNormalizationNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == L"PerDimMeanVarNormalizationNode") return New<PerDimMeanVarNormalizationNode<ElemType>>(forward<_Types>(_Args)...);
#if 1
else if (nodeType == OperationNameOf(DeprecatedReshapeNode)) return New<DeprecatedReshapeNode<ElemType>>(forward<_Types>(_Args)...);
#endif
else InvalidArgument("Attempted to instantiate undefined operation %ls.", nodeType.c_str());
}
@@ -116,14 +119,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
static shared_ptr<ComputationNode<ElemType>> CreateNode(const std::wstring & nodeType, _Types&&... _Args)
{
// check more types
if (nodeType == OperationNameOf(AveragePoolingNode)) return New<AveragePoolingNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(ConvolutionNode)) return New<ConvolutionNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(SparseInputValue)) return New<SparseInputValue<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(InputValue)) return New<InputValue<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(LearnableParameter)) return New<LearnableParameter<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(MaxPoolingNode)) return New<MaxPoolingNode<ElemType>>(forward<_Types>(_Args)...);
if (nodeType == OperationNameOf(AveragePoolingNode)) return New<AveragePoolingNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(BatchNormalizationNode)) return New<BatchNormalizationNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(ConvolutionNode)) return New<ConvolutionNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(SparseInputValue)) return New<SparseInputValue<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(InputValue)) return New<InputValue<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(LearnableParameter)) return New<LearnableParameter<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(MaxPoolingNode)) return New<MaxPoolingNode<ElemType>>(forward<_Types>(_Args)...);
//else if (nodeType == OperationNameOf(SparseLearnableParameter)) return New<SparseLearnableParameter<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(BatchNormalizationNode)) return New<BatchNormalizationNode<ElemType>>(forward<_Types>(_Args)...);
else return CreateStandardNode<ElemType>(nodeType, forward<_Types>(_Args)...);
}
@@ -175,6 +178,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return net.AddNodeToNetWithElemType(New<LearnableParameter<ElemType>>(net.GetDeviceId(), paramName, rows, cols));
}

template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateLearnableParameter(const std::wstring & paramName, const TensorShape & tensorShape)
{
return net.AddNodeToNetWithElemType(New<LearnableParameter<ElemType>>(net.GetDeviceId(), paramName, tensorShape));
}

#if 0 // not functional at present
//sparse matrix size is optionally specified
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateSparseLearnableParameter(const std::wstring & paramName, const size_t rows, const size_t cols, const size_t size)
@@ -183,28 +191,24 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
#endif

template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateInputNode(const std::wstring & inputName, const size_t rows, const size_t cols)
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateInputNode(const std::wstring & inputName, const size_t rows)
{
return net.AddNodeToNetWithElemType(New<InputValue<ElemType>>(net.GetDeviceId(), inputName, rows, cols));
return net.AddNodeToNetWithElemType(New<InputValue<ElemType>>(net.GetDeviceId(), inputName, rows));
}

template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateSparseInputNode(const std::wstring & inputName, const size_t rows, const size_t cols)
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateSparseInputNode(const std::wstring & inputName, const size_t rows)
{
return net.AddNodeToNetWithElemType(New<SparseInputValue<ElemType>>(net.GetDeviceId(), inputName, rows, cols));
return net.AddNodeToNetWithElemType(New<SparseInputValue<ElemType>>(net.GetDeviceId(), inputName, rows));
}

template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateInputNode(const std::wstring & inputName,
const TensorShape & imageLayout,
const size_t numImages)
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateInputNode(const std::wstring & inputName, const TensorShape & sampleLayout)
{
return net.AddNodeToNetWithElemType(New<InputValue<ElemType>>(net.GetDeviceId(), inputName, imageLayout, numImages));
return net.AddNodeToNetWithElemType(New<InputValue<ElemType>>(net.GetDeviceId(), inputName, sampleLayout));
}

template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateSparseInputNode(const std::wstring & inputName,
const TensorShape & imageLayout,
const size_t numImages)
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateSparseInputNode(const std::wstring & inputName, const TensorShape & imageLayout)
{
return net.AddNodeToNetWithElemType(New<SparseInputValue<ElemType>>(net.GetDeviceId(), inputName, imageLayout, numImages));
return net.AddNodeToNetWithElemType(New<SparseInputValue<ElemType>>(net.GetDeviceId(), inputName, imageLayout));
}

template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreatePairNetworkNode(const std::wstring & inputName, const size_t rows, const size_t cols)
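
[Editor's note: illustrative use of the reworked factory methods above; this assumes TensorShape can be built from a dimension vector, as ProcessTensorShapeParameters does, and 'net' is an existing network.]

ComputationNetworkBuilder<float> builder(*net);
auto features = builder.CreateInputNode(L"features", TensorShape(std::vector<size_t>{ 28, 28, 3 })); // full per-sample layout
auto labels   = builder.CreateInputNode(L"labels", 10);                                              // rank-1 convenience overload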
@@ -215,37 +219,26 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateConvolutionNode(const std::wstring & nodeName,
const size_t kernelWidth, const size_t kernelHeight, const size_t outputChannels,
const size_t horizontalSubsample, const size_t verticalSubsample,
const bool zeroPadding,
ImageLayoutKind imageLayoutKind, const bool zeroPadding,
const size_t maxTempMemSizeInSamples)
{
return net.AddNodeToNetWithElemType(New<ConvolutionNode<ElemType>>(net.GetDeviceId(), nodeName,
kernelWidth, kernelHeight,
outputChannels,
horizontalSubsample,
verticalSubsample, zeroPadding,
maxTempMemSizeInSamples));
kernelWidth, kernelHeight, outputChannels,
horizontalSubsample, verticalSubsample, imageLayoutKind,
zeroPadding,
maxTempMemSizeInSamples));
}

template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateMaxPoolingNode(const std::wstring & nodeName,
const size_t windowWidth,
const size_t windowHeight,
const size_t horizontalSubsample,
const size_t verticalSubsample)
const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind)
{
return net.AddNodeToNetWithElemType(New<MaxPoolingNode<ElemType>>(net.GetDeviceId(), nodeName,
windowWidth, windowHeight,
horizontalSubsample,
verticalSubsample));
return net.AddNodeToNetWithElemType(New<MaxPoolingNode<ElemType>>(net.GetDeviceId(), nodeName, windowWidth, windowHeight, horizontalSubsample, verticalSubsample, imageLayoutKind));
}

template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateAveragePoolingNode(const std::wstring & nodeName, const size_t windowWidth,
const size_t windowHeight, const size_t horizontalSubsample,
const size_t verticalSubsample)
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateAveragePoolingNode(const std::wstring & nodeName,
const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind)
{
return net.AddNodeToNetWithElemType(New<AveragePoolingNode<ElemType>>(net.GetDeviceId(), nodeName,
windowWidth, windowHeight,
horizontalSubsample,
verticalSubsample));
return net.AddNodeToNetWithElemType(New<AveragePoolingNode<ElemType>>(net.GetDeviceId(), nodeName, windowWidth, windowHeight, horizontalSubsample, verticalSubsample, imageLayoutKind));
}

// this is the catch-all for all cases not covered as special cases above
@@ -274,49 +267,30 @@ namespace Microsoft { namespace MSR { namespace CNTK {

template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Convolution(const ComputationNodePtr weight,
const ComputationNodePtr inputValues,
const size_t kernelWidth,
const size_t kernelHeight,
const size_t outputChannels,
const size_t horizontalSubsample,
const size_t verticalSubsample,
const bool zeroPadding,
const std::wstring nodeName,
const size_t maxTempMemSizeInSamples)
const size_t kernelWidth, const size_t kernelHeight, const size_t outputChannels, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind, const bool zeroPadding, const size_t maxTempMemSizeInSamples,
const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<ConvolutionNode<ElemType>>(net.GetDeviceId(), nodeName,
kernelWidth, kernelHeight,
outputChannels,
horizontalSubsample,
verticalSubsample, zeroPadding,
kernelWidth, kernelHeight, outputChannels, horizontalSubsample, verticalSubsample, imageLayoutKind, zeroPadding,
maxTempMemSizeInSamples),
weight, inputValues);
}

template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::MaxPooling(const ComputationNodePtr inputValues,
const size_t windowWidth,
const size_t windowHeight,
const size_t horizontalSubsample,
const size_t verticalSubsample,
const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind,
const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<MaxPoolingNode<ElemType>>(net.GetDeviceId(), nodeName,
windowWidth, windowHeight,
horizontalSubsample,
verticalSubsample),
windowWidth, windowHeight, horizontalSubsample, verticalSubsample, imageLayoutKind),
inputValues);
}

template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::AveragePooling(const ComputationNodePtr inputValues,
const size_t windowWidth,
const size_t windowHeight,
const size_t horizontalSubsample,
const size_t verticalSubsample,
const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind,
const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<AveragePoolingNode<ElemType>>(net.GetDeviceId(), nodeName,
windowWidth, windowHeight,
horizontalSubsample,
verticalSubsample),
windowWidth, windowHeight, horizontalSubsample, verticalSubsample, imageLayoutKind),
inputValues);
}
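
An illustrative call of the updated builder API (not code from this commit; 'b', 'W', and 'x' are hypothetical builder, weight-node, and input-node variables, and ImageLayoutKind::CHW is assumed to mirror the 'CHW' BrainScript value): a 5x5 convolution with 32 output channels and stride 1, using zero-padding, would now be written as

auto c = b.Convolution(W, x, /*kernelWidth=*/5, /*kernelHeight=*/5, /*outputChannels=*/32,
                       /*horizontalSubsample=*/1, /*verticalSubsample=*/1, ImageLayoutKind::CHW,
                       /*zeroPadding=*/true);

Note that imageLayoutKind is threaded through every convolution and pooling entry point above, since the engines must know how to interpret the sample dimensions.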

@@ -486,7 +460,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return net.AddNodeToNetAndAttachInputs(New<SumElementsNode<ElemType>>(net.GetDeviceId(), nodeName), a);
}

#if 1// change once we no longer see a perf hit to #ifndef ENABLE_TENSORVIEW
#ifndef ENABLE_BROADCASTING_ELEMENTTIMES
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Scale(const ComputationNodePtr scalar, const ComputationNodePtr matrix, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<ScaleNode<ElemType>>(net.GetDeviceId(), nodeName), scalar, matrix);

@@ -513,7 +487,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return net.AddNodeToNetAndAttachInputs(New<ElementTimesNode<ElemType>>(net.GetDeviceId(), nodeName), a, b);
}

#if 1// change once we no longer see a perf hit to #ifndef ENABLE_TENSORVIEW
#ifndef ENABLE_BROADCASTING_ELEMENTTIMES
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::RowElementTimes(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<RowElementTimesNode<ElemType>>(net.GetDeviceId(), nodeName), a, b);

@@ -561,12 +535,20 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}

template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Reshape(const ComputationNodePtr a,
const size_t numRows,
const TensorShape & imageLayout,
const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<ReshapeNode<ElemType>>(net.GetDeviceId(), nodeName, numRows, imageLayout), a);
return net.AddNodeToNetAndAttachInputs(New<ReshapeNode<ElemType>>(net.GetDeviceId(), nodeName, imageLayout), a);
}
#if 1
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::DeprecatedReshape(const ComputationNodePtr a,
const size_t numRows,
const TensorShape & imageLayout,
const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<DeprecatedReshapeNode<ElemType>>(net.GetDeviceId(), nodeName, numRows, imageLayout), a);
}
#endif

template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::RowRepeat(const ComputationNodePtr a, const size_t num_repeat, const std::wstring nodeName)
{

@@ -578,14 +560,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return net.AddNodeToNetAndAttachInputs(New<DiagonalNode<ElemType>>(net.GetDeviceId(), nodeName), a);
}

template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::PastValue(const ComputationNodePtr a, const float initHiddenActivity, const size_t row_size, const size_t col_size, size_t timeStep, const std::wstring nodeName)
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::PastValue(const ComputationNodePtr a, const float initHiddenActivity, const size_t row_size, size_t timeStep, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<PastValueNode<ElemType>>(net.GetDeviceId(), nodeName, initHiddenActivity, row_size, col_size, timeStep), a);
return net.AddNodeToNetAndAttachInputs(New<PastValueNode<ElemType>>(net.GetDeviceId(), nodeName, initHiddenActivity, row_size, timeStep), a);
}

template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::FutureValue(const ComputationNodePtr a, const float initHiddenActivity, const size_t row_size, const size_t col_size, size_t timeStep, const std::wstring nodeName)
template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::FutureValue(const ComputationNodePtr a, const float initHiddenActivity, const size_t row_size, size_t timeStep, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<FutureValueNode<ElemType>>(net.GetDeviceId(), nodeName, initHiddenActivity, row_size, col_size, timeStep), a);
return net.AddNodeToNetAndAttachInputs(New<FutureValueNode<ElemType>>(net.GetDeviceId(), nodeName, initHiddenActivity, row_size, timeStep), a);
}
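
For reference, the simplified delay-node signatures can be exercised like this (an illustrative call, not code from the commit; 'b' is a builder and 'h' a hypothetical hidden-state node of dimension hiddenDim):

auto hPast = b.PastValue(h, (float)DEFAULT_HIDDEN_ACTIVATION, hiddenDim, 1 /*timeStep*/);

The column dimension that col_size used to pin down is gone from both PastValue and FutureValue, presumably because the number of columns now comes from the minibatch layout at run time.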

template<class ElemType> shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Parallel(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName)
@@ -9,6 +9,7 @@
#include "ComputationNetwork.h"
#include "TrainingCriterionNodes.h" // for NCEEvalMode
#include "ScriptableObjects.h"
#include "TensorShape.h"
#include <string>

namespace Microsoft { namespace MSR { namespace CNTK {

@@ -39,47 +40,34 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// TODO: separate into nodes that have inputs and those that duplicate functions with input adding except just not adding inputs. Clear?

ComputationNodePtr CreateLearnableParameter(const std::wstring & paramName, const size_t rows, const size_t cols);
ComputationNodePtr CreateLearnableParameter(const std::wstring & paramName, const TensorShape & tensorShape);
//sparse matrix size is optionally specified
//ComputationNodePtr CreateSparseLearnableParameter(const std::wstring & paramName, const size_t rows, const size_t cols, const size_t size = 0);
ComputationNodePtr CreateInputNode(const std::wstring & inputName, const size_t rows, const size_t cols);
ComputationNodePtr CreateSparseInputNode(const std::wstring & inputName, const size_t rows, const size_t cols);
ComputationNodePtr CreateInputNode(const std::wstring & inputName, const TensorShape & imageLayout, const size_t numImages);
ComputationNodePtr CreateSparseInputNode(const std::wstring & inputName, const TensorShape & imageLayout, const size_t numImages);
ComputationNodePtr CreateInputNode(const std::wstring & inputName, const size_t rows);
ComputationNodePtr CreateSparseInputNode(const std::wstring & inputName, const size_t rows);
ComputationNodePtr CreateInputNode(const std::wstring & inputName, const TensorShape & sampleLayout);
ComputationNodePtr CreateSparseInputNode(const std::wstring & inputName, const TensorShape & sampleLayout);
ComputationNodePtr CreatePairNetworkNode(const std::wstring & inputName, const size_t rows, const size_t cols);
ComputationNodePtr CreateConvolutionNode(const std::wstring & nodeName, const size_t kernelWidth, const size_t kernelHeight, const size_t outputChannels, const size_t horizontalSubsample, const size_t verticalSubsample, const bool zeroPadding = false, const size_t maxTempMemSizeInSamples = 0);
ComputationNodePtr CreateMaxPoolingNode(const std::wstring & nodeName, const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample);
ComputationNodePtr CreateAveragePoolingNode(const std::wstring & nodeName, const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample);
ComputationNodePtr CreateConvolutionNode(const std::wstring & nodeName, const size_t kernelWidth, const size_t kernelHeight, const size_t outputChannels, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind, const bool zeroPadding = false, const size_t maxTempMemSizeInSamples = 0);
ComputationNodePtr CreateMaxPoolingNode(const std::wstring & nodeName, const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind);
ComputationNodePtr CreateAveragePoolingNode(const std::wstring & nodeName, const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind);
// this is the catch-all for all cases not covered as special cases above
// Unlike the specialized ones above, this one creates nodes by type given as a string.
ComputationNodePtr CreateComputationNode(const std::wstring & nodeType, const std::wstring & nodeName);
// TODO: These next three functions are wrappers around CreateXXXNode(). Remove these.
ComputationNodePtr Parameter(const size_t rows, size_t cols, const std::wstring nodeName = L"") { return CreateLearnableParameter(nodeName, rows, cols); } // TODO: remove
ComputationNodePtr Input(const size_t rows, const size_t cols, const std::wstring nodeName = L"") { return CreateInputNode(nodeName, rows, cols); } // TODO: remove
ComputationNodePtr Input(const TensorShape & imageLayout, const size_t numImages, const std::wstring nodeName = L"") { return CreateInputNode(nodeName, imageLayout, numImages); } // TODO: remove
// The following functions create nodes and link them to the network and their inputs.
// TODO: Do we need both this set and the one above that does not add inputs? Can they share more code?
ComputationNodePtr PairNetwork(const ComputationNodePtr & a, const std::wstring nodeName = L"");
ComputationNodePtr Convolution(const ComputationNodePtr weight,
const ComputationNodePtr inputValues,
const size_t kernelWidth,
const size_t kernelHeight,
const size_t outputChannels,
const size_t horizontalSubsample,
const size_t verticalSubsample,
const bool zeroPadding = false,
const std::wstring nodeName = L"",
const size_t maxTempMemSizeInSamples = 0);
const size_t kernelWidth, const size_t kernelHeight, const size_t outputChannels,
const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind,
const bool zeroPadding = false, const size_t maxTempMemSizeInSamples = 0,
const std::wstring nodeName = L"");
ComputationNodePtr MaxPooling(const ComputationNodePtr inputValues,
const size_t windowWidth,
const size_t windowHeight,
const size_t horizontalSubsample,
const size_t verticalSubsample,
const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind,
const std::wstring nodeName = L"");
ComputationNodePtr AveragePooling(const ComputationNodePtr inputValues,
const size_t windowWidth,
const size_t windowHeight,
const size_t horizontalSubsample,
const size_t verticalSubsample,
const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind,
const std::wstring nodeName = L"");
ComputationNodePtr ErrorPrediction(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");
ComputationNodePtr PerDimMeanVarNormalization(const ComputationNodePtr feature, const ComputationNodePtr mean, const ComputationNodePtr InvStdDev, const std::wstring nodeName = L"");

@@ -111,14 +99,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
ComputationNodePtr Hardmax(const ComputationNodePtr a, const std::wstring nodeName = L"");
ComputationNodePtr LogSoftmax(const ComputationNodePtr a, const std::wstring nodeName = L"");
ComputationNodePtr Sum(const ComputationNodePtr a, const std::wstring nodeName = L"");
#if 1// change once we no longer see a perf hit to #ifndef ENABLE_TENSORVIEW
#ifndef ENABLE_BROADCASTING_ELEMENTTIMES
ComputationNodePtr Scale(const ComputationNodePtr scalar, const ComputationNodePtr matrix, const std::wstring nodeName = L"");
#endif
ComputationNodePtr Transpose(const ComputationNodePtr matrix, const std::wstring nodeName = L"");
ComputationNodePtr Times(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");
ComputationNodePtr TransposeTimes(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");
ComputationNodePtr ElementTimes(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");
#if 1// change once we no longer see a perf hit to #ifndef ENABLE_TENSORVIEW
#ifndef ENABLE_BROADCASTING_ELEMENTTIMES
ComputationNodePtr RowElementTimes(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");
ComputationNodePtr ColumnElementTimes(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");
#endif

@@ -129,11 +117,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
ComputationNodePtr Plus(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");
ComputationNodePtr Minus(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");
ComputationNodePtr Dropout(const ComputationNodePtr a, const std::wstring nodeName = L"");
ComputationNodePtr Reshape(const ComputationNodePtr a, const size_t num_rows, const TensorShape & imageLayout, const std::wstring nodeName = L"");
ComputationNodePtr Reshape(const ComputationNodePtr a, const TensorShape & imageLayout, const std::wstring nodeName = L"");
#if 1 // legacy
ComputationNodePtr DeprecatedReshape(const ComputationNodePtr a, const size_t num_rows, const TensorShape & imageLayout, const std::wstring nodeName = L"");
#endif
ComputationNodePtr RowRepeat(const ComputationNodePtr a, const size_t num_repeat, const std::wstring nodeName = L"");
ComputationNodePtr Diagonal(const ComputationNodePtr a, const std::wstring nodeName = L"");
ComputationNodePtr PastValue(const ComputationNodePtr a, const float initHiddenActivity, const size_t row_size, const size_t col_size, size_t timeStep, const std::wstring nodeName = L"");
ComputationNodePtr FutureValue(const ComputationNodePtr a, const float initHiddenActivity, const size_t row_size, const size_t col_size, size_t timeStep, const std::wstring nodeName = L"");
ComputationNodePtr PastValue(const ComputationNodePtr a, const float initHiddenActivity, const size_t row_size, size_t timeStep, const std::wstring nodeName = L"");
ComputationNodePtr FutureValue(const ComputationNodePtr a, const float initHiddenActivity, const size_t row_size, size_t timeStep, const std::wstring nodeName = L"");
ComputationNodePtr Parallel(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");
ComputationNodePtr RowSlice(const ComputationNodePtr a, const size_t start_index, const size_t num_rows, const std::wstring nodeName = L"");
ComputationNodePtr RowStack(const std::vector<ComputationNodePtr> pinputs, const std::wstring nodeName = L"");
@@ -649,7 +649,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// We do call validate(final) as many times as needed, since stuff may have changed underneath.
node->PrintSelfBeforeValidation();
node->Validate(isFinalValidationPass/*final*/); // all nodes have been visited: do verification instead of just inference
fprintf(stderr, " -> [%lu, %s%lu]", node->GetNumRows(), node->HasMBLayout() ? "MBSize " : "", node->GetNumCols());
fprintf(stderr, " -> [%lu [%s], %s%lu]", node->GetNumRows(), string(node->GetSampleLayout()).c_str(), node->HasMBLayout() ? "MBSize " : "", node->GetNumCols());
node->m_visited = true;
// also take the opportunity to propagate m_needsGradient
auto needsGradient = node->m_needsGradient;
@@ -155,7 +155,7 @@
<ClInclude Include="..\Common\Include\Basics.h" />
<ClInclude Include="..\Common\Include\BestGpu.h" />
<ClInclude Include="..\Common\Include\Config.h" />
<ClInclude Include="..\Common\Include\DataTensor.h" />
<ClInclude Include="..\Common\Include\TensorShape.h" />
<ClInclude Include="..\Common\Include\File.h" />
<ClInclude Include="..\Common\Include\fileutil.h" />
<ClInclude Include="..\Common\Include\Platform.h" />

@@ -117,7 +117,7 @@
<ClInclude Include="EsotericNodes.h">
<Filter>Nodes</Filter>
</ClInclude>
<ClInclude Include="..\Common\Include\DataTensor.h">
<ClInclude Include="..\Common\Include\TensorShape.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="..\Common\Include\Config.h">
@@ -9,7 +9,7 @@
#include "ComputationNode.h"
#include "InputAndParamNodes.h"
#include "ComputationNetworkBuilder.h" // TODO: We should only pull in NewComputationNodeFromConfig(). Nodes should not know about network at large.
#include "DataTensor.h"
#include "TensorShape.h"

#ifndef let
#define let const auto

@@ -72,6 +72,7 @@ namespace Microsoft {
size_t rows0 = Input(0)->GetNumRows(), cols0 = Input(0)->GetNumCols();
size_t rows1 = Input(1)->GetNumRows(), cols1 = Input(1)->GetNumCols();

#if 1//ndef ENABLE_TENSORVIEW
// TODO: This test will go away once we switch to full tensor lib.
if (isFinalValidationPass && !(
(rows0 == rows1 && (Input(0)->GetMBLayout() == Input(1)->GetMBLayout() || cols0 == cols1)) || // matching size (obvious case)

@@ -81,6 +82,9 @@ namespace Microsoft {
{
LogicError("The Matrix dimensions in the %ls %ls operation do not match.", NodeName().c_str(), OperationName().c_str());
}
#else
rows0; rows1;
#endif

// result has tensor shape with dimensions being the max over both
let shape0 = GetInputSampleLayout(0);

@@ -98,7 +102,7 @@ namespace Microsoft {
dims[k] = dim1; // then use dimension we broadcast to
else if (dim1 == 1) // if [1] is broadcasting
; // dims is already correct
else if (dim1 != dims[k]) // no broadcasting: they must match
else if (isFinalValidationPass && dim1 != dims[k]) // no broadcasting: they must match
InvalidArgument("%ls %ls operation: Input dimensions [%s] and [%s] are not compatible.",
NodeName().c_str(), OperationName().c_str(), string(shape0).c_str(), string(shape1).c_str());
}
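
The hunk above implements elementwise broadcasting over tensor dimensions. A standalone sketch of the rule (illustrative only, not CNTK code; plain dimension vectors stand in for TensorShape): each result dimension is the larger of the two input dimensions, a dimension of 1 broadcasts, and anything else must match exactly.

#include <algorithm>
#include <stdexcept>
#include <vector>

std::vector<size_t> BroadcastDims(std::vector<size_t> dims0, const std::vector<size_t> & dims1)
{
    dims0.resize(std::max(dims0.size(), dims1.size()), 1); // missing trailing dims act as broadcasting 1s
    for (size_t k = 0; k < dims1.size(); k++)
    {
        if (dims0[k] == 1)             dims0[k] = dims1[k];  // [1] broadcasts to the other input
        else if (dims1[k] == 1)        ;                     // dims0[k] is already correct
        else if (dims1[k] != dims0[k]) throw std::invalid_argument("input dimensions are not compatible");
    }
    return dims0; // e.g. {13, 1} and {1, 42} -> {13, 42}
}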

@@ -181,9 +185,6 @@ namespace Microsoft {
if (m_sampleLayout.GetDim(k) == 0 || m_sampleLayout.GetDim(k) == SIZE_MAX)
layoutPlausible = false;
}
// some code initializes it to (1,1,rowDim)
if (m_sampleLayout.GetRank() == 3 && m_sampleLayout.GetDim(0) == 1 && m_sampleLayout.GetDim(1) == 1)
layoutPlausible = false;
// check dimension
if (GetNumRows() != m_sampleLayout.GetNumElements())
layoutPlausible = false;

@@ -204,6 +205,8 @@ namespace Microsoft {
for (size_t i = 0; i < GetNumInputs(); i++)
{
size_t rank = Input(i)->GetAndValidateSampleLayout().GetRank();
if (!HasMBLayout()) // no MBLayout: last dim is column dimension
rank++;
if (maxRank < rank)
maxRank = rank;
}

@@ -215,8 +218,9 @@ namespace Microsoft {
TensorShape ComputationNodeBase::GetTensorShape(size_t rank, const FrameRange & fr) const
{
//GetAndValidateSampleLayout(); // no need to validate because rank comes from DetermineElementwiseTensorRank() which validates all
if (!HasMBLayout()) // no MBLayout: just return sample layout (if other participants have layout, tensor lib will broadcast)
return GetSampleLayout(); // .Pad(rank); // no need for padding
if (!HasMBLayout())
return GetSampleLayout().Append(GetSampleLayout().GetRank(), GetNumCols()); // last dim is column dimension
// TODO: This is not nice! Instead, if there is no MBLayout, the sample layout should explain the whole matrix.
else if (fr.IsAllFrames())
{
// we have an MBLayout, and fr refers to the entire MB

@@ -301,6 +305,7 @@ namespace Microsoft { namespace MSR { namespace ScriptableObjects {
static TensorShape TensorShapeFromConfig(const IConfigRecord & config)
{
const auto & valp = config[L"dims"];
// TODO: Add code that if input is already a tensor shape it is also OK.
if (valp.Is<ConfigArray>())
return TensorShape(valp.AsRef<ConfigArray>().AsVector<size_t>([&](const wstring & msg){ valp.Fail(msg); }));
else
@@ -10,7 +10,7 @@
#include "TensorView.h"
#include "ScriptableObjects.h"
#include "Sequences.h"
#include "DataTensor.h"
#include "TensorShape.h"
#include "MatrixPool.h"

#include <unordered_set>

@@ -26,7 +26,9 @@
#include <sstream>
#include <iostream>

// #define ENABLE_TENSORVIEW // flip this switch once the tensor lib is confirmed to be working
// remove the following two #defines once the tensor lib works
#define ENABLE_TENSORVIEW // if set then tensor lib is used instead of old Matrix implementations, wherever such an implementation exists
#define ENABLE_BROADCASTING_ELEMENTTIMES // if set then ScaleNode and Row/ColumnElementTimes are redirected to ElementTimes

#define DEFAULT_HIDDEN_ACTIVATION 0.1
@@ -307,6 +309,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// - PairNetworkNode
// - LSTMNode
// set our dimensions (rows, cols, sample layout)
// TODO: Separate SetDims() into versions with and without MBLayout.
void SetDims(const TensorShape & sampleLayout, size_t cols)
{
m_sampleLayout = sampleLayout;

@@ -501,9 +504,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}

const char * mbSizeMark = child->m_pMBLayout ? "MBSize " : "";
if (child->m_sampleLayout.GetRank() == 3 && (child->m_sampleLayout.GetWidth() != 1 || child->m_sampleLayout.GetNumChannels() != 1)) // looks like an image: use WHC notation
fprintf(stderr, "%ls[%lu {W=%lu, H=%lu, C=%lu}, %s%lu]", child->NodeName().c_str(), child->GetNumRows(),
child->m_sampleLayout.GetWidth(), child->m_sampleLayout.GetHeight(), child->m_sampleLayout.GetNumChannels(), mbSizeMark, child->GetNumCols());
if (child->m_sampleLayout.GetRank() == 3 && (child->m_sampleLayout[1] != 1 || child->m_sampleLayout[0] != 1)) // looks like an image: use WHC notation
fprintf(stderr, "%ls[%lu [%s] {W=%lu, H=%lu, C=%lu}, %s%lu]", child->NodeName().c_str(), child->GetNumRows(), string(child->m_sampleLayout).c_str(),
child->m_sampleLayout[1], child->m_sampleLayout[2], child->m_sampleLayout[0], mbSizeMark, child->GetNumCols());
//BUGBUG: This ^^ will print based on the old legacy layout, and we have no way of knowing here whether that is correct.
else if (child->m_sampleLayout.GetRank() > 1) // tensor: output the tensor dimensions --TODO: there will be no numRows in the future, only the tensor
fprintf(stderr, "%ls[%lu [%s], %s%lu]", child->NodeName().c_str(), child->GetNumRows(), string(child->m_sampleLayout).c_str(), mbSizeMark, child->GetNumCols());
else

@@ -538,14 +542,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return !g_shareNodeValueMatrices || m_outputNeededDuringBackprop;
}

// TODO: Remove this.
// used from:
// - Plus/Minus/ElementTimesNode --> replace by max dim over inputs. Make this standard behavior for all binary element-wise ops.
bool IsInputAnImage(const size_t index) const
{
return m_inputs[index]->m_sampleLayout.IsInputAnImage();
}

const size_t GetNumInputs() const { return m_inputs.size(); }

virtual void SetInput(const size_t childIndex, const ComputationNodeBasePtr& node) = 0;

@@ -825,7 +821,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
fstream >> Value();
// above reads dimensions, so we must update our own m_numRows/m_numCols
SetDims(TensorShape(Value().GetNumRows()), Value().GetNumCols());
// BUGBUG: This loses the sample layout (tensor shape). It should be serialized as well.
// BUGBUG: This loses the sample layout (tensor shape). The caller must know this and fix it up if needed (currently needed for LearnableParameterNode).
}

// reader updated m_functionValue--update our internal state, i.e. m_numCols

@@ -1403,7 +1399,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template<class C, class... _Types> inline shared_ptr<C> New(_Types&&... _Args)
{
return make_shared<C>(forward<_Types>(_Args)...);
//return ComputationNode<typename C::OurElemType>::template New<C>(forward<_Types>(_Args)...);
}

// =======================================================================

@@ -1526,7 +1521,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
#define UsingComputationNodeMembers /*without OperationName; needed to support inconsistent pattern of InputValue--TODO: This comment is out of date. */ \
protected: \
typedef shared_ptr<ComputationNode<ElemType>> ComputationNodePtr; \
using Base::m_deviceId; using Base::SetDims; using Base::SetDims1; using Base::SetNumCols; using Base::GetNumRows; using Base::GetNumCols; using Base::UpdateFunctionValuesSize; using Base::LoadValue; \
using Base::m_deviceId; using Base::GetDeviceId; using Base::SetDims; using Base::SetDims1; using Base::SetNumCols; using Base::GetNumRows; using Base::GetNumCols; using Base::UpdateFunctionValuesSize; using Base::LoadValue; \
using Base::m_pMBLayout; using Base::GetNumTimeSteps; using Base::GetNumParallelSequences; \
using Base::MaskMissingColumnsToZero; using Base::MaskMissingValueColumnsToZero; using Base::MaskMissingGradientColumnsToZero; using Base::InvalidateMissingValueColumns; using Base::InvalidateMissingGradientColumns; \
using Base::DataFor; using Base::ValueFor; using Base::Gradient; using Base::GradientFor; \

@@ -1540,12 +1535,12 @@ protected: \
using Base::GetNumInputs; using Base::ZeroGradientsOfInputs; using Base::VerifyDims; \
using Base::ConstOnes; \
using Base::DetermineElementwiseTensorRank; \
using Base::GetInputSampleLayout; using Base::InferMBLayoutFromInputsForStandardCase; \
using Base::GetSampleLayout; using Base::GetInputSampleLayout; using Base::InferMBLayoutFromInputsForStandardCase; \
using Base::CopyTo; using Base::CreateUniqNodeName; using Base::DetachInputs; using Base::GetInputsFromConfig; \
using Base::DumpNodeInfo; using Base::EnumerateNodes; \
using Base::HasMBLayout; using Base::GetMBLayout; using Base::LinkToMBLayout; \
using Base::Input; using Base::SetInput; \
using Base::IsInputAnImage; using Base::IsEqualTo; using Base::IsOutputOlderThanInputs; using Base::IsLeaf; using Base::SetParameterUpdateRequired; \
using Base::IsEqualTo; using Base::IsOutputOlderThanInputs; using Base::IsLeaf; using Base::SetParameterUpdateRequired; \
using Base::Load; \
using Base::PrintNodeValuesToFile; using Base::PrintSelfBeforeValidation; \
using Base::Save; using Base::UpdateFunctionMBSize; \

@@ -1570,6 +1565,31 @@ protected: /* some boilerplate goes here */ \
// a few standard base classes for N-nary operations
// =======================================================================

// -----------------------------------------------------------------------
// UnaryElementWiseNode (operand)
//
// unary elementwise operations that are implemented with the tensor lib
//
// Derived classes only need to override ForwardProp() and BackpropTo().
// -----------------------------------------------------------------------

template<class ElemType>
class UnaryElementWiseNode : public ComputationNode<ElemType>, public NumInputs<1>
{
typedef ComputationNode<ElemType> Base; UsingComputationNodeMembers;
public:
UnaryElementWiseNode(DEVICEID_TYPE deviceId, const wstring & name) :
Base(deviceId, name)
{ }

virtual void /*ComputationNodeBase::*/Validate(bool isFinalValidationPass) override
{
ValidateUnaryMap(isFinalValidationPass);
}
};
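
To make the intended use concrete, here is a hypothetical derived node (a sketch only; the member boilerplate and the exact ForwardProp()/BackpropTo() signatures are abbreviated and not taken from this commit):

template<class ElemType>
class AbsNode : public UnaryElementWiseNode<ElemType>
{
    typedef UnaryElementWiseNode<ElemType> Base; UsingUnaryElementwiseNodeBaseMembers;
    static const std::wstring TypeName() { return L"Abs"; }
public:
    AbsNode(DEVICEID_TYPE deviceId, const wstring & name) : Base(deviceId, name) { }
    virtual void ForwardProp(const FrameRange & fr) override
    { /* compute |x| elementwise via the tensor lib */ }
    virtual void BackpropTo(const size_t inputIndex, const FrameRange & fr) override
    { /* propagate gradient * sign(x) to the single input */ }
};

Validation is inherited: ValidateUnaryMap() gives the node its input's dimensions and MBLayout, so the derived class carries no shape logic of its own.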

#define UsingUnaryElementwiseNodeBaseMembers UsingComputationNodeMembersBoilerplate;

// -----------------------------------------------------------------------
// BinaryElementWiseNode (operand1, operand2)
//

@@ -1598,13 +1618,9 @@ protected: /* some boilerplate goes here */ \
#endif
}

virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override
{
// By default, the BinaryElementWiseNode does not require any of its inputs' values for computing
// the gradients of its input nodes
UNREFERENCED_PARAMETER(childIndex);
return false;
}
// By default, the BinaryElementWiseNode does not require any of its inputs' values for computing
// the gradients of its input nodes
virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; }

virtual void /*IComputationNode::*/BeginForwardProp() override // called before first iteration step of ForwardProp()
{
@@ -30,9 +30,32 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// ConvolutionNode (convolutionWeights, inputFeature)
// -----------------------------------------------------------------------

// convolutional network
// This follows "high performance convolutional neural networks for document processing" by Kumar Chellapilla, Sidd Puri, and Patrice Simard.
// Each sample is stored as a column-major matrix (height, width) of float[numChannels] (r00, g00, b00, r10, g10, b10, r01, g01, b01, r11, g11, b11).
// Convolutions (incl. pooling) support two different storage formats:
// BUGBUG: These are currently hard-selected depending on circumstances, without being reflected in TensorShape.
//
// * legacy mode (CPU and GPU without cudnn): Channels are tuples of scalars
//
// This follows "high performance convolutional neural networks for document processing" by Kumar Chellapilla, Sidd Puri, and Patrice Simard.
// Each sample is stored as a column-major matrix (height, width) of float[numChannels] (r00, g00, b00, r10, g10, b10, r01, g01, b01, r11, g11, b11).
//
// - input : [C x W x H x T] or ARRAY[1..T] OF ARRAY[1..H] OF ARRAY[1..W] OF ARRAY[1..C]
// - output : [C' x W' x H' x T] or ARRAY[1..T] OF ARRAY[1..H'] OF ARRAY[1..W'] OF ARRAY[1..C']
// - filter : [C' x W" x H" x C ] or ARRAY[1..C] OF ARRAY[1..H"] OF ARRAY[1..W"] OF ARRAY[1..C']
//
// * GPU with cudnn: Channels are planes
//
// - input : [W x H x C x T] or ARRAY[1..T] OF ARRAY[1..C] OF ARRAY[1..H] OF ARRAY[1..W]
// - output : [W' x H' x C' x T] or ARRAY[1..T] OF ARRAY[1..C'] OF ARRAY[1..H'] OF ARRAY[1..W']
// - filter : [W" x H" x C x C' ] or ARRAY[1..C'] OF ARRAY[1..C] OF ARRAY[1..H] OF ARRAY[1..W]
//
// where:
// - using ' for output and " for filter
// - T = samples (NVidia calls this N)
// - W, H = width, height (W', H' for output, W", H" for kernel)
// - C = input channels
// - 3 for color images, 1 for B&W images
// - for hidden layer: dimension of activation vector for each pixel
// - C' = output channels = dimension of activation vector for each pixel (also called N by NVidia, inconsistently)
template<class ElemType>
class ConvolutionNode : public ComputationNode<ElemType>, public NumInputs<2>
{
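
As an aside, the two layouts documented above differ only in which dimension varies fastest. A self-contained sketch (illustrative, not code from this commit) of how (w, h, c) maps to a flat index within one sample under each format:

// legacy HWC: channels are tuples, so c varies fastest, then w, then h
size_t IndexHWC(size_t w, size_t h, size_t c, size_t W, size_t H, size_t C)
{
    return c + C * (w + W * h); // matches [C x W x H]; h strides by C * W
}
// cudnn CHW: channels are planes, so w varies fastest, then h, then c
size_t IndexCHW(size_t w, size_t h, size_t c, size_t W, size_t H, size_t C)
{
    return w + W * (h + H * c); // matches [W x H x C]; c strides by W * H
}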

@@ -44,22 +67,26 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_kernelWidth(SIZE_MAX), m_kernelHeight(SIZE_MAX),
// initialize to dummy values so we catch missing initialization
m_horizontalSubsample(SIZE_MAX), m_verticalSubsample(SIZE_MAX),
m_zeroPadding(false), m_maxTempMemSizeInSamples(SIZE_MAX)
m_zeroPadding(false), m_maxTempMemSizeInSamples(SIZE_MAX),
m_imageLayoutKind(ImageLayoutKind::HWC)
{
SetDims(ImageLayoutWHC(1, 1, 0), 0); // TODO: what is this magic #channels == 0? Can this even be initialized at this time, or only inferred?
SetDims(ImageDimensions::AsTensorShape(1, 1, 0, m_imageLayoutKind), 0);
}
ConvolutionNode(DEVICEID_TYPE deviceId, const wstring & name, const size_t kernelWidth, const size_t kernelHeight, const size_t outputChannels, const size_t horizontalSubsample, const size_t verticalSubsample, const bool zeroPadding = false, const size_t maxTempMemSizeInSamples = 0) :
ConvolutionNode(DEVICEID_TYPE deviceId, const wstring & name, const size_t kernelWidth, const size_t kernelHeight, const size_t outputChannels, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind,
const bool zeroPadding = false, const size_t maxTempMemSizeInSamples = 0) :
Base(deviceId, name),
m_outputChannels(outputChannels),
m_kernelWidth(kernelWidth), m_kernelHeight(kernelHeight),
m_horizontalSubsample(horizontalSubsample), m_verticalSubsample(verticalSubsample),
m_zeroPadding(zeroPadding), m_maxTempMemSizeInSamples(maxTempMemSizeInSamples)
m_zeroPadding(zeroPadding), m_maxTempMemSizeInSamples(maxTempMemSizeInSamples),
m_imageLayoutKind(imageLayoutKind)
{
SetDims(ImageLayoutWHC(1, 1, outputChannels), 0);
m_factory = ConvolutionEngineFactory<ElemType>::Create(deviceId);
SetDims(ImageDimensions::AsTensorShape(1, 1, m_outputChannels, m_imageLayoutKind), 0); // TODO: necessary?
m_factory = ConvolutionEngineFactory<ElemType>::Create(GetDeviceId(), ConvolutionEngineFactory<ElemType>::EngineType::Auto, m_imageLayoutKind);
}
ConvolutionNode(const ScriptableObjects::IConfigRecordPtr configp) :
ConvolutionNode(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"kernelWidth"), configp->Get(L"kernelHeight"), configp->Get(L"outputChannels"),
configp->Get(L"horizontalSubsample"), configp->Get(L"verticalSubsample"),
configp->Get(L"horizontalSubsample"), configp->Get(L"verticalSubsample"), ImageLayoutKindFrom(configp->Get(L"imageLayout")),
configp->Get(L"zeroPadding"), configp->Get(L"maxTempMemSizeInSamples"))
{
// weightNodeName, inputValueNodeName, kernelWidth, kernelHeight, outputChannels, horizontalSubsample, verticalSubsample, zeroPadding = false, maxTempMemSizeInSamples = 0

@@ -70,18 +97,23 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
Base::Save(fstream);
fstream << m_kernelWidth << m_kernelHeight << m_horizontalSubsample << m_verticalSubsample;
fstream << m_sampleLayout.GetNumChannels();
uint32_t imageLayoutKind = (uint32_t)m_imageLayoutKind;
uint32_t outputChannels = (uint32_t)m_outputChannels;
fstream << outputChannels << imageLayoutKind;
fstream << m_zeroPadding << m_maxTempMemSizeInSamples;
}

void Load(File& fstream, size_t modelVersion) override
{
Base::Load(fstream, modelVersion);
fstream >> m_kernelWidth >> m_kernelHeight >> m_horizontalSubsample >> m_verticalSubsample;
size_t outputChannels;
fstream >> outputChannels;
SetDims(ImageLayoutWHC(1, 1, outputChannels), 0);
fstream >> m_kernelWidth >> m_kernelHeight >> m_horizontalSubsample >> m_verticalSubsample;
uint32_t imageLayoutKind, outputChannels;
fstream >> outputChannels >> imageLayoutKind;
m_imageLayoutKind = (ImageLayoutKind) imageLayoutKind;
m_outputChannels = outputChannels;
SetDims(ImageDimensions::AsTensorShape(1, 1, m_outputChannels, m_imageLayoutKind), 0); // TODO: needed?
fstream >> m_zeroPadding >> m_maxTempMemSizeInSamples;
m_factory = ConvolutionEngineFactory<ElemType>::Create(GetDeviceId(), ConvolutionEngineFactory<ElemType>::EngineType::Auto, m_imageLayoutKind);
}
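
A brief note on the serialization change above: outputChannels and imageLayoutKind are deliberately narrowed to explicit uint32_t values before being streamed, which pins the on-disk width of these fields regardless of the platform's size_t; PoolingNodeBase below applies the same pattern to windowWidth and imageLayoutKind.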

void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override

@@ -100,6 +132,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {

node->m_maxTempMemSizeInSamples = m_maxTempMemSizeInSamples;

node->m_imageLayoutKind = m_imageLayoutKind;

*node->m_tempMatrix = *m_tempMatrix;
}
}

@@ -139,7 +173,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Matrix<ElemType> sliceInput1Value = Input(1)->ValueFor(fr);
Matrix<ElemType> sliceOutputValue = ValueFor(fr);

// REVIEW alexeyk: setting batch size, can it be done elsewhere in a single place? TODO: Yes, in BeginForwardProp().
// update the tensor dimension w.r.t. number of samples
size_t batchSize = sliceInput1Value.GetNumCols();
m_inT->setN(batchSize);
m_outT->setN(batchSize);

@@ -154,7 +188,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
#endif
}

// BUGBUG: Should not be here. Use PlusNode and m_sampleLayout.
// BUGBUG: Should not be here. Use PlusNode and m_sampleLayout. TODO: Bad naming: 'output' is actually an 'input'
void AddBias(const Matrix<ElemType>& output, const Matrix<ElemType>& bias, Matrix<ElemType>& dst)
{
assert(m_convEng != nullptr);

@@ -173,83 +207,80 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Base::Validate(isFinalValidationPass);
InferMBLayoutFromInputsForStandardCase();

// get input tensor shape
auto inputSampleLayout = GetInputSampleLayout(1);
// get input and output tensor shape and interpret as image dimensions
auto inDims = ImageDimensions(GetInputSampleLayout(1), m_imageLayoutKind);

if (inputSampleLayout.GetWidth() < m_kernelWidth || inputSampleLayout.GetHeight() < m_kernelHeight)
InvalidArgument("inputWidth must >= kernelWidth and inputHeight must >= kernelHeight.");
if (isFinalValidationPass && (inDims.m_width < m_kernelWidth || inDims.m_height < m_kernelHeight))
InvalidArgument("%ls %ls operation requires that input width be >= kernelWidth and input height >= kernelHeight.", NodeName().c_str(), OperationName().c_str());

// determine output tensor shape
// WATCH OUT: Number of channels is tucked away in m_sampleLayout and must be propagated.
TensorShape outputSampleLayout;
if (m_zeroPadding)
{
const int kernelWidthCenter = m_kernelWidth % 2;
const int kernelHeightCenter = m_kernelHeight % 2;
outputSampleLayout = ImageLayoutWHC(
(inputSampleLayout.GetWidth() - kernelWidthCenter) / m_horizontalSubsample + 1,
(inputSampleLayout.GetHeight() - kernelHeightCenter) / m_verticalSubsample + 1,
m_sampleLayout.GetNumChannels());
}
else
{
outputSampleLayout = ImageLayoutWHC(
(inputSampleLayout.GetWidth() - m_kernelWidth) / m_horizontalSubsample + 1,
(inputSampleLayout.GetHeight() - m_kernelHeight) / m_verticalSubsample + 1,
m_sampleLayout.GetNumChannels());
}
const int kernelWidthCenter = m_zeroPadding ? m_kernelWidth % 2 : m_kernelWidth;
const int kernelHeightCenter = m_zeroPadding ? m_kernelHeight % 2 : m_kernelHeight;
auto outDims = ImageDimensions(
(inDims.m_width - kernelWidthCenter) / m_horizontalSubsample + 1,
(inDims.m_height - kernelHeightCenter) / m_verticalSubsample + 1,
m_outputChannels);

size_t weightCols = m_kernelWidth * m_kernelHeight * inputSampleLayout.GetNumChannels();
size_t weightCols = m_kernelWidth * m_kernelHeight * inDims.m_numChannels;

// check/infer input [0] (weights)
if (Input(0)->Value().HasNoElements())
ValidateInferInputDims(0, outputSampleLayout.GetNumChannels(), weightCols);
ValidateInferInputDims(0, m_outputChannels, weightCols);

if (isFinalValidationPass && (Input(0)->GetNumCols() != weightCols || Input(0)->GetNumRows() != outputSampleLayout.GetNumChannels()))
LogicError("convolutionWeight matrix %ls should have dimension [%d, %d] which is [outputChannels, kernelWidth * kernelHeight * inputChannels]", Input(0)->NodeName().c_str(), (int)outputSampleLayout.GetNumChannels(), (int)weightCols);
if (isFinalValidationPass && (Input(0)->GetNumCols() != weightCols || Input(0)->GetNumRows() != m_outputChannels))
LogicError("convolutionWeight matrix %ls should have dimension [%d, %d] which is [outputChannels, kernelWidth * kernelHeight * inputChannels]", Input(0)->NodeName().c_str(), (int)m_outputChannels, (int)weightCols);

size_t inputDim = inputSampleLayout.GetWidth() * inputSampleLayout.GetHeight() * inputSampleLayout.GetNumChannels();
// check/infer input [1] (data)
size_t inputDim = inDims.m_width * inDims.m_height * inDims.m_numChannels;
if (Input(1)->GetNumRows() == 0)
ValidateInferInputDims(1, inputDim, Input(1)->GetNumCols());

if (isFinalValidationPass && Input(1)->GetNumRows() != inputDim)
LogicError("Each column of input to the convolution node %ls is a sample and should have dimension %d, which is inputWidth * inputHeight * inputChannels.", NodeName().c_str(), (int)inputDim);
LogicError("Each column of input to the convolution node %ls is a sample and should have dimension %d, which is inputWidth * inputHeight * inputChannels.", NodeName().c_str(), (int)inputDim);

// that's our dimension
SetDims(outputSampleLayout, Input(1)->GetNumCols());
SetDims(outDims.AsTensorShape(m_imageLayoutKind), Input(1)->GetNumCols());

// set up the various engines and descriptor objects
// REVIEW alexeyk: is there a better place to create engines?
if (m_factory == nullptr)
m_factory = ConvolutionEngineFactory<ElemType>::Create(m_deviceId);
if (m_convEng == nullptr)
m_convEng = m_factory->CreateConvEngine(m_deviceId, m_maxTempMemSizeInSamples);
if (m_inT == nullptr)
m_inT = m_factory->CreateTensor(inputSampleLayout.GetWidth(), inputSampleLayout.GetHeight(), inputSampleLayout.GetNumChannels(), 1);
if (m_filterT == nullptr)
m_filterT = m_factory->CreateFilter(m_kernelWidth, m_kernelHeight, inputSampleLayout.GetNumChannels(), m_sampleLayout.GetNumChannels());
if (m_outT == nullptr)
m_outT = m_factory->CreateTensor(m_sampleLayout.GetWidth(), m_sampleLayout.GetHeight(), m_sampleLayout.GetNumChannels(), 1);
if (m_convDesc == nullptr)
m_convDesc = m_factory->CreateConvDescriptor(*m_inT, *m_filterT, m_horizontalSubsample, m_verticalSubsample, m_zeroPadding);
// REVIEW alexeyk: create per-channel (shared) bias. Consider adding other types of biases.
if (m_biasT == nullptr)
m_biasT = m_factory->CreateTensor(1, 1, m_sampleLayout.GetNumChannels(), 1);
if (isFinalValidationPass)
{
// set up the various engines and descriptor objects
// REVIEW alexeyk: is there a better place to create engines?
assert(m_factory);
//if (m_factory == nullptr)
//    m_factory = ConvolutionEngineFactory<ElemType>::Create(m_deviceId, ConvolutionEngineFactory<ElemType>::EngineType::Auto, m_imageLayoutKind);
// TODO: This seems to expose too much internal knowledge of the engine to the ConvolutionNode().
// Why not just pass everything to the engine creator, and get one object that holds everything.
if (m_convEng == nullptr)
m_convEng = m_factory->CreateConvEngine(m_deviceId, m_maxTempMemSizeInSamples);
if (m_inT == nullptr)
m_inT = m_factory->CreateTensor(inDims.m_width, inDims.m_height, inDims.m_numChannels, 1);
if (m_filterT == nullptr)
m_filterT = m_factory->CreateFilter(m_kernelWidth, m_kernelHeight, inDims.m_numChannels, m_outputChannels);
if (m_outT == nullptr)
m_outT = m_factory->CreateTensor(outDims.m_width, outDims.m_height, outDims.m_numChannels, 1);
if (m_convDesc == nullptr)
m_convDesc = m_factory->CreateConvDescriptor(*m_inT, *m_filterT, m_horizontalSubsample, m_verticalSubsample, m_zeroPadding);
// REVIEW alexeyk: create per-channel bias (shared across all pixels). Consider adding other types of biases.
if (m_biasT == nullptr)
m_biasT = m_factory->CreateTensor(1, 1, outDims.m_numChannels, 1);
}
}
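
A quick worked check of the output-size arithmetic above: for a 28 x 28 input with a 5 x 5 kernel and subsampling (stride) of 2, kernelWidthCenter is 5 without zero-padding, so the output width is (28 - 5) / 2 + 1 = 12; with zero-padding it becomes 5 % 2 = 1, giving (28 - 1) / 2 + 1 = 14.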

void DumpNodeInfo(const bool printValues, File& fstream) const override
{
Base::DumpNodeInfo(printValues, fstream);

auto inputSampleLayout = GetInputSampleLayout(1);
auto inDims = ImageDimensions(GetInputSampleLayout(1), m_imageLayoutKind);
auto outDims = ImageDimensions(m_sampleLayout, m_imageLayoutKind);

char str[4096];
sprintf(str, "Input[Width:%lu, Height:%lu, Channels:%lu] \n", inputSampleLayout.GetWidth(), inputSampleLayout.GetHeight(), inputSampleLayout.GetNumChannels());
sprintf(str, "Input[Width:%lu, Height:%lu, Channels:%lu] \n", inDims.m_width, inDims.m_height, inDims.m_numChannels);
fstream << string(str);
sprintf(str, "Kernel[Width:%lu, Height:%lu] SubSample[Horizontal:%lu, Vertical:%lu]\n", m_kernelWidth, m_kernelHeight, m_horizontalSubsample, m_verticalSubsample);
fstream << string(str);
sprintf(str, "Output[Width:%lu, Height:%lu, Channels:%lu] \n", m_sampleLayout.GetWidth(), m_sampleLayout.GetHeight(), m_sampleLayout.GetNumChannels());
sprintf(str, "Output[Width:%lu, Height:%lu, Channels:%lu] \n", outDims.m_width, outDims.m_height, outDims.m_numChannels);
fstream << string(str);
sprintf(str, "ZeroPadding=%ls maxTempMemSizeInSamples=%lu\n", m_zeroPadding? L"true" : L"false", m_maxTempMemSizeInSamples);
sprintf(str, "zeroPadding=%ls maxTempMemSizeInSamples=%lu\n", m_zeroPadding? L"true" : L"false", m_maxTempMemSizeInSamples);
fstream << string(str);
}

@@ -273,6 +304,17 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}

private:
size_t m_outputChannels;
size_t m_kernelWidth, m_kernelHeight;
size_t m_horizontalSubsample, m_verticalSubsample;
bool m_zeroPadding;
bool m_1DConvolutionOnGPUSparse;

shared_ptr<Matrix<ElemType>> m_tempMatrix;
size_t m_maxTempMemSizeInSamples; // can change during runtime

ImageLayoutKind m_imageLayoutKind; // how to interpret the tensor (which dimensions are X/Y and C)

std::unique_ptr<ConvolutionEngineFactory<ElemType>> m_factory;
std::unique_ptr<ConvolutionEngine<ElemType>> m_convEng;

@@ -281,14 +323,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
std::unique_ptr<ConvolutionTensor4D> m_outT;
std::unique_ptr<ConvolutionDescriptor> m_convDesc;
std::unique_ptr<ConvolutionTensor4D> m_biasT;

size_t m_kernelWidth, m_kernelHeight;
size_t m_horizontalSubsample, m_verticalSubsample;
bool m_zeroPadding;
bool m_1DConvolutionOnGPUSparse;

shared_ptr<Matrix<ElemType>> m_tempMatrix;
size_t m_maxTempMemSizeInSamples; // can change during runtime
};

template class ConvolutionNode<float>;

@@ -298,8 +332,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// PoolingNodeBase (input)
// -----------------------------------------------------------------------

// Max/Average Pooling: support multi channel
// Each sample is stored as a column-major matrix (height, width) of float[numChannels] (r00, g00, b00, r10, g10, b10, r01, g01, b01, r11, g11, b11).
template<class ElemType>
class PoolingNodeBase : public ComputationNode<ElemType>, public NumInputs<1>
{
@@ -308,17 +340,19 @@ namespace Microsoft { namespace MSR { namespace CNTK {
PoolingNodeBase(DEVICEID_TYPE deviceId, const wstring & name) :
Base(deviceId, name),
m_windowWidth(SIZE_MAX), m_windowHeight(SIZE_MAX),
m_horizontalSubsample(SIZE_MAX), m_verticalSubsample(SIZE_MAX)
m_horizontalSubsample(SIZE_MAX), m_verticalSubsample(SIZE_MAX),
m_imageLayoutKind(ImageLayoutKind::HWC)
{ }
PoolingNodeBase(DEVICEID_TYPE deviceId, const wstring & name, const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample) :
PoolingNodeBase(DEVICEID_TYPE deviceId, const wstring & name, const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind) :
Base(deviceId, name),
m_windowWidth(windowWidth), m_windowHeight(windowHeight),
m_horizontalSubsample(horizontalSubsample), m_verticalSubsample(verticalSubsample)
m_horizontalSubsample(horizontalSubsample), m_verticalSubsample(verticalSubsample),
m_imageLayoutKind(imageLayoutKind)
{
m_factory = ConvolutionEngineFactory<ElemType>::Create(deviceId);
m_factory = ConvolutionEngineFactory<ElemType>::Create(GetDeviceId(), ConvolutionEngineFactory<ElemType>::EngineType::Auto, m_imageLayoutKind);
}
PoolingNodeBase(const ScriptableObjects::IConfigRecordPtr configp) :
PoolingNodeBase(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"windowWidth"), configp->Get(L"windowHeight"), configp->Get(L"horizontalSubsample"), configp->Get(L"verticalSubsample"))
PoolingNodeBase(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"windowWidth"), configp->Get(L"windowHeight"), configp->Get(L"horizontalSubsample"), configp->Get(L"verticalSubsample"), ImageLayoutKindFrom(configp->Get(L"imageLayout")))
{
// input, windowWidth, windowHeight, horizontalSubsample, verticalSubsample
AttachInputs(configp, this->GetExpectedNumInputs());

@@ -327,13 +361,19 @@ namespace Microsoft { namespace MSR { namespace CNTK {
void Save(File& fstream) const override
{
Base::Save(fstream);
fstream << m_windowWidth << m_windowHeight << m_horizontalSubsample << m_verticalSubsample;
uint32_t imageLayoutKind = (uint32_t)m_imageLayoutKind;
uint32_t windowWidth = (uint32_t)m_windowWidth;
fstream << windowWidth << imageLayoutKind << m_windowHeight << m_horizontalSubsample << m_verticalSubsample;
}

void Load(File& fstream, size_t modelVersion) override
{
Base::Load(fstream, modelVersion);
fstream >> m_windowWidth >> m_windowHeight >> m_horizontalSubsample >> m_verticalSubsample;
uint32_t imageLayoutKind, windowWidth;
fstream >> windowWidth >> imageLayoutKind >> m_windowHeight >> m_horizontalSubsample >> m_verticalSubsample;
m_windowWidth = windowWidth;
m_imageLayoutKind = (ImageLayoutKind)imageLayoutKind;
m_factory = ConvolutionEngineFactory<ElemType>::Create(GetDeviceId(), ConvolutionEngineFactory<ElemType>::EngineType::Auto, m_imageLayoutKind);
}
|
||||
|
||||
void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override
|
||||
|
@ -351,6 +391,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
|
||||
node->m_inputSizePerSample = m_inputSizePerSample;
|
||||
node->m_outputSizePerSample = m_outputSizePerSample;
|
||||
|
||||
node->m_imageLayoutKind = m_imageLayoutKind;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -388,20 +430,19 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
Base::Validate(isFinalValidationPass);
|
||||
InferMBLayoutFromInputsForStandardCase();
|
||||
|
||||
// get input tensor shape
|
||||
auto inputSampleLayout = GetInputSampleLayout(0);
|
||||
// get input tensor shape and interpret as image dimensions
|
||||
auto inDims = ImageDimensions(GetInputSampleLayout(0), m_imageLayoutKind);
|
||||
|
||||
if (inputSampleLayout.GetWidth() < m_windowWidth || inputSampleLayout.GetHeight() < m_windowHeight)
|
||||
if (isFinalValidationPass && (inDims.m_width < m_windowWidth || inDims.m_height < m_windowHeight))
|
||||
InvalidArgument("PoolingNodeBase: inputWidth must >= windowWidth and inputHeight must >= windowHeight.");
|
||||
|
||||
// determine output tensor shape
|
||||
auto outputSampleLayout = ImageLayoutWHC(
|
||||
(inputSampleLayout.GetWidth() - m_windowWidth) / m_horizontalSubsample + 1,
|
||||
(inputSampleLayout.GetHeight() - m_windowHeight) / m_verticalSubsample + 1,
|
||||
inputSampleLayout.GetNumChannels());
|
||||
auto outDims = ImageDimensions(
|
||||
(inDims.m_width - m_windowWidth) / m_horizontalSubsample + 1,
|
||||
(inDims.m_height - m_windowHeight) / m_verticalSubsample + 1,
|
||||
inDims.m_numChannels);
|
||||
|
||||
m_inputSizePerSample = inputSampleLayout.GetWidth() * inputSampleLayout.GetHeight() * inputSampleLayout.GetNumChannels();
|
||||
//m_outputSizePerSample = outputSampleLayout.GetWidth() * outputSampleLayout.GetHeight() * outputSampleLayout.GetNumChannels();
|
||||
m_inputSizePerSample = inDims.m_width * inDims.m_height * inDims.m_numChannels;
|
||||
|
||||
if (Input(0)->GetNumRows() == 0)
|
||||
ValidateInferInputDims(0, m_inputSizePerSample, Input(0)->GetNumCols()); // TODO: We should infer a tensor dimension for the input instead.
|
||||
|
@ -409,18 +450,22 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
if (isFinalValidationPass && Input(0)->GetNumRows() != m_inputSizePerSample) // TODO: Can be removed once tensor shape and numRows are perfectly in sync.
|
||||
LogicError("each column of input to the MaxPooling node %ls is a sample and should have dimension %d, which is inputWidth * inputHeight * inputChannels", NodeName().c_str(), (int)m_inputSizePerSample);
|
||||
|
||||
SetDims(outputSampleLayout, Input(0)->GetNumCols());
|
||||
SetDims(outDims.AsTensorShape(m_imageLayoutKind), Input(0)->GetNumCols());
|
||||
|
||||
// set up various engines and descriptor objects
|
||||
// REVIEW alexeyk: is there a better place to create engines?
|
||||
if (m_factory == nullptr)
|
||||
m_factory = ConvolutionEngineFactory<ElemType>::Create(m_deviceId);
|
||||
if (m_poolEng == nullptr)
|
||||
m_poolEng = m_factory->CreatePoolEngine(m_deviceId);
|
||||
if (m_inT == nullptr)
|
||||
m_inT = m_factory->CreateTensor(inputSampleLayout.GetWidth(), inputSampleLayout.GetHeight(), inputSampleLayout.GetNumChannels(), 1);
|
||||
if (m_outT == nullptr)
|
||||
m_outT = m_factory->CreateTensor(m_sampleLayout.GetWidth(), m_sampleLayout.GetHeight(), m_sampleLayout.GetNumChannels(), 1);
|
||||
if (isFinalValidationPass)
|
||||
{
|
||||
// set up various engines and descriptor objects
|
||||
// REVIEW alexeyk: is there a better place to create engines?
|
||||
assert(m_factory);
|
||||
//if (m_factory == nullptr)
|
||||
// m_factory = ConvolutionEngineFactory<ElemType>::Create(m_deviceId, ConvolutionEngineFactory<ElemType>::EngineType::Auto, m_imageLayoutKind);
|
||||
if (m_poolEng == nullptr)
|
||||
m_poolEng = m_factory->CreatePoolEngine(m_deviceId);
|
||||
if (m_inT == nullptr)
|
||||
m_inT = m_factory->CreateTensor(inDims.m_width, inDims.m_height, inDims.m_numChannels, 1);
|
||||
if (m_outT == nullptr)
|
||||
m_outT = m_factory->CreateTensor(outDims.m_width, outDims.m_height, outDims.m_numChannels, 1);
|
||||
}
|
||||
}
|
||||
|
||||
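For reference, the output-shape arithmetic above is plain valid (unpadded) pooling: each spatial output extent is (input - window) / stride + 1, and channels pass through untouched. A minimal hedged sketch of just that computation, using hypothetical names rather than the ImageDimensions type in this change:

#include <cstddef>

struct Dims { std::size_t width, height, channels; };

// Valid (unpadded) pooling: the window must fit entirely inside the input,
// so each spatial extent shrinks by (window - 1) before striding.
Dims PoolOutputDims(Dims in, std::size_t winW, std::size_t winH, std::size_t strideW, std::size_t strideH)
{
    return Dims{ (in.width - winW) / strideW + 1,
                 (in.height - winH) / strideH + 1,
                 in.channels }; // pooling never mixes channels
}
// e.g. a 28x28x1 input with a 2x2 window and stride 2 yields 14x14x1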
void DumpNodeInfo(const bool printValues, File& fstream) const override

@@ -430,27 +475,29 @@ namespace Microsoft { namespace MSR { namespace CNTK {
auto inputSampleLayout = GetInputSampleLayout(0);

char str[4096];
sprintf(str, "Input[Width:%lu, Height:%lu, Channels:%lu] \n", inputSampleLayout.GetWidth(), inputSampleLayout.GetHeight(), inputSampleLayout.GetNumChannels());
sprintf(str, "Input[Width:%lu, Height:%lu, Channels:%lu] \n", inputSampleLayout[1], inputSampleLayout[2], inputSampleLayout[0]);
fstream << string(str);
sprintf(str, "PoolingWindow[Width:%lu, Height:%lu] SubSampling[Horizontal:%lu, Vertical:%lu]\n", m_windowWidth, m_windowHeight, m_horizontalSubsample, m_verticalSubsample);
fstream << string(str);
sprintf(str, "Output[Width:%lu, Height:%lu, Channels:%lu] \n", m_sampleLayout.GetWidth(), m_sampleLayout.GetHeight(), m_sampleLayout.GetNumChannels());
sprintf(str, "Output[Width:%lu, Height:%lu, Channels:%lu] \n", m_sampleLayout[1], m_sampleLayout[2], m_sampleLayout[0]);
fstream << string(str);
sprintf(str, "TotalSizePerSample[Input:%lu, Output:%lu] \n", m_inputSizePerSample, m_outputSizePerSample);
fstream << string(str);
}

protected:
size_t m_windowWidth, m_windowHeight;
size_t m_horizontalSubsample, m_verticalSubsample;
size_t m_inputSizePerSample, m_outputSizePerSample;

ImageLayoutKind m_imageLayoutKind; // how to interpret the tensor (which dimensions are X/Y and C)

std::unique_ptr<ConvolutionEngineFactory<ElemType>> m_factory;
std::unique_ptr<PoolingEngine<ElemType>> m_poolEng;

std::unique_ptr<ConvolutionTensor4D> m_inT;
std::unique_ptr<ConvolutionTensor4D> m_outT;
std::unique_ptr<PoolingDescriptor> m_poolDesc;

size_t m_windowWidth, m_windowHeight;
size_t m_horizontalSubsample, m_verticalSubsample;
size_t m_inputSizePerSample, m_outputSizePerSample;
};

// add this at the start of each derived class, to get access to the members of ComputationNode

@@ -471,8 +518,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
static const std::wstring TypeName() { return L"MaxPooling"; }
public:
MaxPoolingNode(DEVICEID_TYPE deviceId, const wstring & name) : Base(deviceId, name) { }
MaxPoolingNode(DEVICEID_TYPE deviceId, const wstring & name, const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample) :
    Base(deviceId, name, windowWidth, windowHeight, horizontalSubsample, verticalSubsample)
MaxPoolingNode(DEVICEID_TYPE deviceId, const wstring & name, const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind) :
    Base(deviceId, name, windowWidth, windowHeight, horizontalSubsample, verticalSubsample, imageLayoutKind)
{ }
MaxPoolingNode(const ScriptableObjects::IConfigRecordPtr configp) :
    Base(configp)

@@ -481,7 +528,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
void Validate(bool isFinalValidationPass) override
{
    Base::Validate(isFinalValidationPass);
    if (m_poolDesc == nullptr)
    if (isFinalValidationPass && m_poolDesc == nullptr)
        m_poolDesc = m_factory->CreatePoolDescriptor(PoolingDescriptor::PoolKind::Max, m_windowWidth, m_windowHeight, m_horizontalSubsample, m_verticalSubsample, 0, 0);
}
};

@@ -500,8 +547,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
static const std::wstring TypeName() { return L"AveragePooling"; }
public:
AveragePoolingNode(DEVICEID_TYPE deviceId, const wstring & name) : Base(deviceId, name) { }
AveragePoolingNode(DEVICEID_TYPE deviceId, const wstring & name, const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample) :
    Base(deviceId, name, windowWidth, windowHeight, horizontalSubsample, verticalSubsample)
AveragePoolingNode(DEVICEID_TYPE deviceId, const wstring & name, const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind) :
    Base(deviceId, name, windowWidth, windowHeight, horizontalSubsample, verticalSubsample, imageLayoutKind)
{ }
AveragePoolingNode(const ScriptableObjects::IConfigRecordPtr configp) :
    Base(configp)

@@ -525,7 +572,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
void Validate(bool isFinalValidationPass) override
{
    Base::Validate(isFinalValidationPass);
    if (m_poolDesc == nullptr)
    if (isFinalValidationPass && m_poolDesc == nullptr)
        m_poolDesc = m_factory->CreatePoolDescriptor(PoolingDescriptor::PoolKind::Average, m_windowWidth, m_windowHeight, m_horizontalSubsample, m_verticalSubsample, 0, 0);
}
};

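Both Validate overrides follow the same pattern: validation may run several passes while shapes are still being inferred, and only the final pass is guaranteed to see settled dimensions, so costly objects are created exactly once, and only then. A hedged generic sketch of that gating (hypothetical helper, not part of this change):

#include <memory>

// Create obj at most once, and only when dimensions are final.
// make must return a std::unique_ptr<T>.
template <typename T, typename Factory>
void EnsureCreated(std::unique_ptr<T>& obj, bool isFinalValidationPass, Factory make)
{
    if (isFinalValidationPass && obj == nullptr)
        obj = make();
}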
@@ -573,6 +620,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {

// Read and check version.
// REVIEW alexeyk: extract version checking so it can be re-used in other places.
// BUGBUG: We must serialize m_inputLayout.
int32_t verWritten;
int32_t verReadable;
fstream >> verWritten >> verReadable;

@@ -683,18 +731,24 @@ namespace Microsoft { namespace MSR { namespace CNTK {

SetDims(Input(0));

if (m_factory == nullptr)
    m_factory = ConvolutionEngineFactory<ElemType>::Create(m_deviceId);
if (m_convEng == nullptr)
    m_convEng = m_factory->CreateConvEngine(m_deviceId, 0);
if (m_inT == nullptr)
    m_inT = m_factory->CreateTensor(m_sampleLayout.GetWidth(), m_sampleLayout.GetHeight(), m_sampleLayout.GetNumChannels(), 1);
if (m_scaleBiasT == nullptr)
if (isFinalValidationPass)
{
    if (m_spatial)
        m_scaleBiasT = m_factory->CreateTensor(1, 1, m_sampleLayout.GetNumChannels(), 1);
    else
        m_scaleBiasT = m_factory->CreateTensor(m_sampleLayout.GetWidth(), m_sampleLayout.GetHeight(), m_sampleLayout.GetNumChannels(), 1);
    const auto m_imageLayoutKind = ImageLayoutKind::CHW; // BUGBUG: Finish this. Must be serialized.
    auto dims = ImageDimensions(GetSampleLayout(), m_imageLayoutKind);

    if (m_factory == nullptr)
        m_factory = ConvolutionEngineFactory<ElemType>::Create(m_deviceId, ConvolutionEngineFactory<ElemType>::EngineType::Auto, m_imageLayoutKind);
    if (m_convEng == nullptr)
        m_convEng = m_factory->CreateConvEngine(m_deviceId, 0);
    if (m_inT == nullptr)
        m_inT = m_factory->CreateTensor(dims.m_width, dims.m_height, dims.m_numChannels, 1);
    if (m_scaleBiasT == nullptr)
    {
        if (m_spatial)
            m_scaleBiasT = m_factory->CreateTensor(1, 1, dims.m_numChannels, 1);
        else
            m_scaleBiasT = m_factory->CreateTensor(dims.m_width, dims.m_height, dims.m_numChannels, 1);
    }
}
}

@@ -740,11 +794,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
VersionInfo m_version;

private:
std::unique_ptr<ConvolutionEngineFactory<ElemType>> m_factory;
std::unique_ptr<ConvolutionEngine<ElemType>> m_convEng;
std::unique_ptr<ConvolutionTensor4D> m_inT;
std::unique_ptr<ConvolutionTensor4D> m_scaleBiasT;

// Determines whether to use training or inference(evaluation) mode.
bool m_eval;
// Determines whether to use per-activation (used after non-convolutional layers like fully connected)

@@ -760,6 +809,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
shared_ptr<Matrix<ElemType>> m_dScale;
// Stores bias derivatives.
shared_ptr<Matrix<ElemType>> m_dBias;

std::unique_ptr<ConvolutionEngineFactory<ElemType>> m_factory;
std::unique_ptr<ConvolutionEngine<ElemType>> m_convEng;
std::unique_ptr<ConvolutionTensor4D> m_inT;
std::unique_ptr<ConvolutionTensor4D> m_scaleBiasT;
};

template class BatchNormalizationNode<float>;

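The m_spatial branch above is the crux of batch normalization's two modes: spatial BN (after convolutions) learns one scale/bias pair per channel, so its tensor is 1 x 1 x C, while per-activation BN (after fully-connected layers) learns one pair per element, W x H x C. A hedged sketch of just the parameter-count arithmetic (hypothetical struct, not the CNTK types):

#include <cstddef>

struct BnDims { std::size_t w, h, c; };

// Spatial BN shares statistics across all spatial positions of a channel;
// per-activation BN treats every (x, y, channel) element independently.
std::size_t ScaleBiasParamCount(const BnDims& d, bool spatial)
{
    return spatial ? d.c             // one (scale, bias) per channel: 1 x 1 x C
                   : d.w * d.h * d.c; // one per activation: W x H x C
}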
@@ -18,6 +18,635 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// This header collects special-purpose nodes.
// It is likely that these are no longer functional.

#ifndef ENABLE_BROADCASTING_ELEMENTTIMES
// -----------------------------------------------------------------------
// ScaleNode (scalar scaling factor, matrix)
//
// Identical to ElementTimesNode with tensor lib (broadcasting). Can be removed.
// -----------------------------------------------------------------------

template<class ElemType>
class ScaleNode : public ComputationNode<ElemType>, public NumInputs<2>
{
    typedef ComputationNode<ElemType> Base; UsingComputationNodeMembersBoilerplate;
    static const std::wstring TypeName() { return L"Scale"; }
public:
    DeclareConstructorFromConfigWithNumInputs(ScaleNode);
    ScaleNode(DEVICEID_TYPE deviceId, const wstring & name) :
        Base(deviceId, name)
    { }

    virtual void /*ComputationNode::*/BackpropTo(const size_t inputIndex, const FrameRange & fr) override
    {
#ifdef ENABLE_TENSORVIEW // This takes a big perf hit since our reduction uses only a single thread in this case. Needs to be fixed.
        size_t rank = DetermineElementwiseTensorRank();
        auto gradient = GradientTensorFor(rank, fr);
        auto inputGradient = Input(inputIndex)->GradientTensorFor(rank, fr.AllowBroadcast());
        auto otherInputValue = Input(1 - inputIndex)->ValueTensorFor(rank, fr.AllowBroadcast());

        // if reduction then mask the respective input(s) (zero out the gaps)
        if (Input(inputIndex)->GetNumCols() < GetNumCols())
            MaskMissingGradientColumnsToZero(fr);
        if (Input(inputIndex)->GetNumCols() < Input(1 - inputIndex)->GetNumCols())
            Input(1 - inputIndex)->MaskMissingValueColumnsToZero(fr);

        inputGradient.AddElementwiseProductOf(gradient, otherInputValue);
#else
        if (inputIndex == 0) // left derivative
        {
            // this is a reduction over frames, so we must mask gaps to zero
            Input(0)->Gradient() += Matrix<ElemType>::InnerProductOfMatrices(MaskedGradientFor(fr), Input(1)->MaskedValueFor(fr)); // element-wise product summed up over all
        }
        else if (inputIndex == 1) // right derivative
        {
            Matrix<ElemType> sliceInput1Grad = Input(1)->GradientFor(fr);
            Matrix<ElemType>::Multiply1x1AndWeightedAdd(+1.0f, Input(0)->Value()/*1x1*/, GradientFor(fr), 1.0f, sliceInput1Grad);
        }
#endif
    }

    virtual bool OutputUsedInComputingInputNodesGradients() const override
    {
        // The ScaleNode does not require its output value for computing
        // the gradients of its input nodes
        return false;
    }

    virtual void /*ComputationNode::*/ForwardProp(const FrameRange & fr) override
    {
#ifdef ENABLE_TENSORVIEW
        static int c = 0; if (c++ == 0) { fprintf(stderr, "#SCALE#\n"); }
        size_t rank = DetermineElementwiseTensorRank();
        auto result = ValueTensorFor(rank, fr);
        auto input0 = Input(0)->ValueTensorFor(rank, fr.AllowBroadcast());
        auto input1 = Input(1)->ValueTensorFor(rank, fr.AllowBroadcast());
        result.AssignElementwiseProductOf(input0, input1);
#else
        ValueFor(fr).Assign1x1ProductOf(Input(0)->Value()/*1x1*/, Input(1)->ValueFor(fr));
#endif
    }

    virtual void /*ComputationNodeBase::*/Validate(bool isFinalValidationPass) override
    {
        Base::Validate(isFinalValidationPass);
        InferMBLayoutFromInputsForStandardCase();

        // left node must be a scalar
        if (isFinalValidationPass && (Input(0)->GetNumRows() != 1 || Input(0)->GetNumCols() != 1))
            RuntimeError("The left value of ScaleNode must be a scalar value.");

        SetDims(Input(1));
    }
};

template class ScaleNode<float>;
template class ScaleNode<double>;

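The two branches of BackpropTo above encode the two partials of y = s * X: the matrix side just receives s times the incoming gradient, while the scalar side reduces over all elements. A hedged standalone sketch of that reduction, with plain loops instead of the matrix library (hypothetical helper):

#include <cstddef>
#include <vector>

// For y[i] = s * x[i]:  dL/dx[i] = s * g[i]  and  dL/ds = sum_i g[i] * x[i],
// where g = dL/dy. The scalar's partial is a full reduction -- the
// InnerProductOfMatrices call in the node above.
double ScalarGradient(const std::vector<double>& g, const std::vector<double>& x)
{
    double ds = 0.0;
    for (std::size_t i = 0; i < g.size(); i++)
        ds += g[i] * x[i];
    return ds;
}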
// -----------------------------------------------------------------------
// RowElementTimesNode (left, right) --TODO: what are left and right?
//
// TODO: This is subsumed by ElementTimes with tensor lib.
// -----------------------------------------------------------------------

template<class ElemType>
class RowElementTimesNode : public ComputationNode<ElemType>, public NumInputs<2>
{
    typedef ComputationNode<ElemType> Base; UsingComputationNodeMembersBoilerplate;
    static const std::wstring TypeName() { return L"RowElementTimes"; }
public:
    DeclareConstructorFromConfigWithNumInputs(RowElementTimesNode);
    RowElementTimesNode(DEVICEID_TYPE deviceId, const wstring & name) :
        Base(deviceId, name)
    { }

    void BackpropToMap(const size_t inputIndex)
    {
        if (inputIndex > 1)
            InvalidArgument("RowElementTimes operation only takes two inputs.");

        if (inputIndex == 0)
        {
            BackpropToLeftS(Input(1)->Value(), Input(0)->Gradient(), Gradient(), *m_tempMatrix);
        }
        else
        {
            BackpropToRightS(Input(0)->Value(), Input(1)->Gradient(), Gradient(), *m_tempMatrix);
        }
    }

    virtual void /*ComputationNode::*/BackpropTo(const size_t inputIndex, const FrameRange & fr) override
    {
        if (fr.IsAllFrames()) { BackpropToMap(inputIndex); return; } // TODO: remove these one by one
        Matrix<ElemType> sliceInput0Grad = Input(inputIndex)->GradientFor(fr);
        Matrix<ElemType> sliceOutputGrad = GradientFor(fr);

        Matrix<ElemType> sliceInput1Value = Input(1 - inputIndex)->ValueFor(fr);

        if (inputIndex == 0)
        {
            BackpropToLeftS(sliceInput1Value, sliceInput0Grad, sliceOutputGrad, *m_tempMatrix);
        }
        else
        {
            BackpropToRightS(sliceInput1Value, sliceInput0Grad, sliceOutputGrad, *m_tempMatrix);
        }
    }

    virtual bool OutputUsedInComputingInputNodesGradients() const override
    {
        // The RowElementTimesNode does not require its output value for computing
        // the gradients of its input nodes
        return false;
    }

    //left (input 0) is a matrix
    /*TODO: merge with call site*/void BackpropToLeftS(Matrix<ElemType>& input1FunctionValues,
        Matrix<ElemType>& input0GradientValues,
        const Matrix<ElemType>& gradientValues,
        Matrix<ElemType>& tempMatrix)
    {
        tempMatrix.SetValue(gradientValues);
        tempMatrix.RowElementMultiplyWith(input1FunctionValues);
        input0GradientValues += tempMatrix;

#if NANCHECK
        input0GradientValues.HasNan("RowElementTimes");
#endif
    }

    //right (input 1) is a row vector
    /*TODO: merge with call site*/void BackpropToRightS(Matrix<ElemType>& input0FunctionValues,
        Matrix<ElemType>& input1GradientValues,
        const Matrix<ElemType>& gradientValues,
        Matrix<ElemType>& tempMatrix)
    {
        tempMatrix.AssignInnerProductOf(gradientValues, input0FunctionValues, true);
        input1GradientValues += tempMatrix;

#if NANCHECK
        input1GradientValues.HasNan("RowElementTimes");
#endif
    }
    void ForwardPropMap() // TODO: This is a stop-gap; in most cases, we should just be able to delete this (but need to review one by one)
    {
        ForwardPropS(Value(), Input(0)->Value(), Input(1)->Value());
    }

    virtual void /*ComputationNode::*/ForwardProp(const FrameRange & fr) override
    {
        //if (fr.IsAllFrames()) { ForwardPropMap(); return; }
        Matrix<ElemType> sliceInput0Value = Input(0)->ValueFor(fr);
        Matrix<ElemType> sliceInput1Value = Input(1)->ValueFor(fr);
        Matrix<ElemType> sliceOutputValue = ValueFor(fr);

        ForwardPropS(sliceOutputValue, sliceInput0Value, sliceInput1Value);
    }

    /*TODO: merge with call site*/void ForwardPropS(Matrix<ElemType>& functionValues, const Matrix<ElemType>& input0, const Matrix<ElemType>& input1)
    {
        functionValues.SetValue(input0);
        functionValues.RowElementMultiplyWith(input1);

#if NANCHECK
        functionValues.HasNan("RowElementTimes");
#endif
    }

    virtual void /*ComputationNodeBase::*/Validate(bool isFinalValidationPass) override
    {
        Base::Validate(isFinalValidationPass);
        InferMBLayoutFromInputsForStandardCase();

        size_t rows0 = Input(0)->GetNumRows(), cols0 = Input(0)->GetNumCols();
        size_t rows1 = Input(1)->GetNumRows(), cols1 = Input(1)->GetNumCols(); rows0;
        if (isFinalValidationPass && (cols0 != cols1 || rows1 != 1))
            LogicError("RowElementTimes: Either the second operand is not a row vector or the number of columns of operands does not match.");

        SetDims(Input(0));
    }

    //request matrices that are needed for gradient computation
    virtual void RequestMatricesBeforeBackprop(MatrixPool& matrixPool)
    {
        Base::RequestMatricesBeforeBackprop(matrixPool);
        RequestMatrixFromPool(m_tempMatrix, matrixPool);
    }

    //release gradient and temp matrices that are no longer needed after all the children's gradients are computed.
    virtual void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool)
    {
        Base::ReleaseMatricesAfterBackprop(matrixPool);
        ReleaseMatrixToPool(m_tempMatrix, matrixPool);
    }

private:
    shared_ptr<Matrix<ElemType>> m_tempMatrix;
};

template class RowElementTimesNode<float>;
template class RowElementTimesNode<double>;

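To pin down the semantics the Validate check enforces: input 0 is an R x C matrix and input 1 a 1 x C row vector, and every row of the matrix is multiplied elementwise by that same vector. A minimal hedged sketch with plain arrays (hypothetical row-major layout, not the Matrix class):

#include <cstddef>
#include <vector>

// out(i,j) = in0(i,j) * in1(j): each column j is scaled by the vector's
// j-th entry, identically for every row i.
void RowElementTimes(const std::vector<double>& in0, const std::vector<double>& in1,
                     std::vector<double>& out, std::size_t R, std::size_t C)
{
    out.resize(R * C);
    for (std::size_t i = 0; i < R; i++)
        for (std::size_t j = 0; j < C; j++)
            out[i * C + j] = in0[i * C + j] * in1[j];
}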
// -----------------------------------------------------------------------
// ColumnElementTimesNode (left, right) --TODO: what are left and right?
//
// TODO: This is subsumed by ElementTimes with tensor lib.
// -----------------------------------------------------------------------

template<class ElemType>
class ColumnElementTimesNode : public ComputationNode<ElemType>, public NumInputs<2>
{
    typedef ComputationNode<ElemType> Base; UsingComputationNodeMembersBoilerplate;
    static const std::wstring TypeName() { return L"ColumnElementTimes"; }
public:
    DeclareConstructorFromConfigWithNumInputs(ColumnElementTimesNode);
    ColumnElementTimesNode(DEVICEID_TYPE deviceId, const wstring & name) :
        Base(deviceId, name)
    { }

    void BackpropToMap(const size_t inputIndex)
    {
        if (inputIndex > 1)
            InvalidArgument("ColumnElementTimes operation only takes two inputs.");

        if (inputIndex == 0)
        {
            BackpropToLeftS(Input(1)->Value(), Input(0)->Gradient(), Gradient(), *m_tempMatrix);
        }
        else
        {
            BackpropToRightS(Input(0)->Value(), Input(1)->Gradient(), Gradient(), *m_tempMatrix);
        }
    }

    virtual void /*ComputationNode::*/BackpropTo(const size_t inputIndex, const FrameRange & fr) override
    {
        if (fr.IsAllFrames()) { BackpropToMap(inputIndex); return; } // TODO: remove these one by one
        Matrix<ElemType> sliceOutputGrad = GradientFor(fr);

        if (inputIndex == 0)
        {
            Matrix<ElemType> sliceInput0Grad = Input(0)->GradientFor(fr);

            BackpropToLeftS(Input(1)->Value(), sliceInput0Grad, sliceOutputGrad, *m_tempMatrix);
        }
        else
        {
            Matrix<ElemType> sliceInput0Value = Input(0)->ValueFor(fr);
            BackpropToRightS(sliceInput0Value, Input(1)->Gradient(), sliceOutputGrad, *m_tempMatrix);
        }
    }

    virtual bool OutputUsedInComputingInputNodesGradients() const override
    {
        // The ColumnElementTimesNode does not require its output value for computing
        // the gradients of its input nodes
        return false;
    }

    //left (input 0) is a matrix
    /*TODO: merge with call site*/void BackpropToLeftS(Matrix<ElemType>& input1FunctionValues,
        Matrix<ElemType>& input0GradientValues,
        const Matrix<ElemType>& gradientValues,
        Matrix<ElemType>& tempMatrix)
    {
        tempMatrix.SetValue(gradientValues);
        tempMatrix.ColumnElementMultiplyWith(input1FunctionValues);
        input0GradientValues += tempMatrix;

#if NANCHECK
        input0GradientValues.HasNan("ColumnElementTimes");
#endif
    }

    //right (input 1) is a col vector
    /*TODO: merge with call site*/void BackpropToRightS(Matrix<ElemType>& input0FunctionValues,
        Matrix<ElemType>& input1GradientValues,
        const Matrix<ElemType>& gradientValues,
        Matrix<ElemType>& tempMatrix)
    {
        tempMatrix.AssignInnerProductOf(gradientValues, input0FunctionValues, false);
        input1GradientValues += tempMatrix;

#if NANCHECK
        input1GradientValues.HasNan("ColumnElementTimes");
#endif
    }
    void ForwardPropMap() // TODO: This is a stop-gap; in most cases, we should just be able to delete this (but need to review one by one)
    {
        ForwardPropS(Value(), Input(0)->Value(), Input(1)->Value());
    }

    virtual void /*ComputationNode::*/ForwardProp(const FrameRange & fr) override
    {
        //if (fr.IsAllFrames()) { ForwardPropMap(); return; }
        Matrix<ElemType> sliceInput0Value = Input(0)->ValueFor(fr);
        Matrix<ElemType> sliceOutputValue = ValueFor(fr);

        ForwardPropS(sliceOutputValue, sliceInput0Value, Input(1)->Value());
    }

    /*TODO: merge with call site*/void ForwardPropS(Matrix<ElemType>& functionValues, const Matrix<ElemType>& input0, const Matrix<ElemType>& input1)
    {
        functionValues.SetValue(input0);
        functionValues.ColumnElementMultiplyWith(input1);

#if NANCHECK
        functionValues.HasNan("ColumnElementTimes");
#endif
    }

    virtual void /*ComputationNodeBase::*/Validate(bool isFinalValidationPass) override
    {
        Base::Validate(isFinalValidationPass);
        InferMBLayoutFromInputsForStandardCase();

        //derive number of rows if possible
        for (size_t index = 0; index < 2; index++)
        {
            size_t rows = Input(index)->GetNumRows() == 0 ? Input(1 - index)->GetNumRows() : Input(index)->GetNumRows();
            size_t cols = Input(index)->GetNumCols() == 0 ? Input(1 - index)->GetNumCols() : Input(index)->GetNumCols();
            ValidateInferInputDims(index, rows, cols);
        }

        size_t rows0 = Input(0)->GetNumRows(), cols0 = Input(0)->GetNumCols();
        size_t rows1 = Input(1)->GetNumRows(), cols1 = Input(1)->GetNumCols(); cols0;
        if (isFinalValidationPass && (rows0 != rows1 || cols1 != 1))
            LogicError("ColumnElementTimes: Either the second operand is not a column vector or the number of rows of operands does not match.");

        SetDims(Input(0));
    }

    //request matrices that are needed for gradient computation
    virtual void RequestMatricesBeforeBackprop(MatrixPool& matrixPool)
    {
        Base::RequestMatricesBeforeBackprop(matrixPool);
        RequestMatrixFromPool(m_tempMatrix, matrixPool);
    }

    //release gradient and temp matrices that are no longer needed after all the children's gradients are computed.
    virtual void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool)
    {
        Base::ReleaseMatricesAfterBackprop(matrixPool);
        ReleaseMatrixToPool(m_tempMatrix, matrixPool);
    }

private:
    shared_ptr<Matrix<ElemType>> m_tempMatrix;
};

template class ColumnElementTimesNode<float>;
template class ColumnElementTimesNode<double>;

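Unlike its row counterpart, Validate here first tries to fill in unknown dimensions from the sibling input before checking the column-vector constraint. A hedged one-liner capturing that inference rule (hypothetical helper):

#include <cstddef>

// A dimension of 0 means "not yet known": borrow it from the sibling input,
// and only run the shape check once both sides are settled.
std::size_t InferDim(std::size_t mine, std::size_t sibling)
{
    return mine == 0 ? sibling : mine;
}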
// -----------------------------------------------------------------------
// RectifiedLinearNode (input) -- ReLU non-linearity
// -----------------------------------------------------------------------

template<class ElemType>
class RectifiedLinearNode : public SoftmaxNodeBase<ElemType>
{
    typedef SoftmaxNodeBase<ElemType> Base; UsingSoftmaxNodeBaseMembers;
    static const std::wstring TypeName() { return L"RectifiedLinear"; }
public:
    DeclareConstructorFromConfigWithNumInputs(RectifiedLinearNode);
    RectifiedLinearNode(DEVICEID_TYPE deviceId, const wstring & name) :
        Base(deviceId, name)
    { }

    void BackpropToV(Matrix<ElemType>& gradient, const Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues, const Matrix<ElemType>& functionValues) override
    {
        gradient.AssignLinearRectifierDerivativeOf(inputFunctionValues);
#if DUMPOUTPUT
        inputGradientValues.Print("RectifiedLinearNode-Partial-in");
#endif
        inputGradientValues.AddElementProductOf(gradientValues, gradient);
#if DUMPOUTPUT
        inputGradientValues.Print("RectifiedLinearNode-Partial-out");
#endif
    }

    virtual bool OutputUsedInComputingInputNodesGradients() const override
    {
        // The ReLU node does not require its output value for computing
        // the gradients of its input nodes
        return false;
    }

    void ForwardPropV(Matrix<ElemType>& functionValues, const Matrix<ElemType>& inputFunctionValues) override
    {
        functionValues.AssignTruncateBottomOf(inputFunctionValues, 0);
#if DUMPOUTPUT
        functionValues.Print("RectifiedLinearNode");
#endif
    }
};

template class RectifiedLinearNode<float>;
template class RectifiedLinearNode<double>;

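ReLU is the one non-linearity here whose derivative is computed from the input rather than the output: forward is max(0, x), and the backward mask passes the gradient through only where x was positive. A hedged scalar sketch:

#include <algorithm>

// Forward: y = max(0, x). Backward: dL/dx = dL/dy where x > 0, else 0 --
// the "LinearRectifierDerivative" mask used by the node above.
double ReluForward(double x) { return std::max(0.0, x); }
double ReluBackward(double x, double g) { return x > 0.0 ? g : 0.0; }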
// -----------------------------------------------------------------------
// SigmoidNode (input) -- sigmoid non-linearity
// -----------------------------------------------------------------------

template<class ElemType>
class SigmoidNode : public SoftmaxNodeBase<ElemType>
{
    typedef SoftmaxNodeBase<ElemType> Base; UsingSoftmaxNodeBaseMembers;
    static const std::wstring TypeName() { return L"Sigmoid"; }
public:
    DeclareConstructorFromConfigWithNumInputs(SigmoidNode);
    SigmoidNode(DEVICEID_TYPE deviceId, const wstring & name) :
        Base(deviceId, name)
    { }

    virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override
    {
        // The SigmoidNode does not require any of its inputs' values for computing
        // the gradients of its input nodes
        UNREFERENCED_PARAMETER(childIndex);
        return false;
    }

    /*virtual*/ void BackpropToV(Matrix<ElemType>& gradient, const Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues, const Matrix<ElemType>& functionValues)
    {
        gradient.AssignSigmoidDerivativeOf(functionValues);
        inputGradientValues.AddElementProductOf(gradientValues, gradient);
    }

    /*virtual*/ void ForwardPropV(Matrix<ElemType>& functionValues, const Matrix<ElemType>& inputFunctionValues) override
    {
        functionValues.AssignSigmoidOf(inputFunctionValues);
    }
};

template class SigmoidNode<float>;
template class SigmoidNode<double>;

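Sigmoid, and tanh just below, can express their derivatives purely in terms of the already-computed output, which is why these nodes declare that they do not need their inputs' values for backprop. A hedged sketch of both identities:

// sigma'(x) = y * (1 - y) and tanh'(x) = 1 - y^2, each written purely in
// terms of the forward output y; the input x is not needed once y is known.
double SigmoidBackward(double y, double g) { return g * y * (1.0 - y); }
double TanhBackward(double y, double g) { return g * (1.0 - y * y); }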
// -----------------------------------------------------------------------
// TanhNode (input) -- tanh non-linearity
// -----------------------------------------------------------------------

template<class ElemType>
class TanhNode : public SoftmaxNodeBase<ElemType>
{
    typedef SoftmaxNodeBase<ElemType> Base; UsingSoftmaxNodeBaseMembers;
    static const std::wstring TypeName() { return L"Tanh"; }
public:
    DeclareConstructorFromConfigWithNumInputs(TanhNode);
    TanhNode(DEVICEID_TYPE deviceId, const wstring & name) :
        Base(deviceId, name)
    { }

    virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override
    {
        // The TanhNode does not require any of its inputs' values for computing
        // the gradients of its input nodes
        UNREFERENCED_PARAMETER(childIndex);
        return false;
    }

    /*virtual*/ void BackpropToV(Matrix<ElemType>& gradient, const Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues, const Matrix<ElemType>& functionValues)
    {
        gradient.AssignElementProductOf(functionValues, functionValues); // v .* v
        gradient.AssignDifferenceOf(1, gradient); // 1-v^2

        inputGradientValues.AddElementProductOf(gradientValues, gradient); // += d .* (1 - v.^2)
    }

    /*virtual*/ void ForwardPropV(Matrix<ElemType>& functionValues, const Matrix<ElemType>& inputFunctionValues) override
    {
        functionValues.AssignTanhOf(inputFunctionValues);
    }
};

template class TanhNode<float>;
template class TanhNode<double>;

// -----------------------------------------------------------------------
// LogNode (input) -- component-wise log() of input
// -----------------------------------------------------------------------

template<class ElemType>
class LogNode : public SoftmaxNodeBase<ElemType>
{
    typedef SoftmaxNodeBase<ElemType> Base; UsingSoftmaxNodeBaseMembers;
    static const std::wstring TypeName() { return L"Log"; }
public:
    DeclareConstructorFromConfigWithNumInputs(LogNode);
    LogNode(DEVICEID_TYPE deviceId, const wstring & name) :
        Base(deviceId, name)
    { }

    virtual bool OutputUsedInComputingInputNodesGradients() const override
    {
        // The LogNode does not require its output value for computing
        // the gradients of its input nodes
        return false;
    }

    /*virtual*/ void BackpropToV(Matrix<ElemType>& gradient, const Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues, const Matrix<ElemType>& functionValues)
    {
        gradient.AssignElementInverseOf(inputFunctionValues); // 1/x (x is input to log(x))
        inputGradientValues.AddElementProductOf(gradientValues, gradient);
        // TODO: with tensor lib:
        //inputGradientValues.AddElementDivisionOf(gradientValues, inputFunctionValues); // 1/x (x is input to log(x))
    }

    /*virtual*/ void ForwardPropV(Matrix<ElemType>& functionValues, const Matrix<ElemType>& inputFunctionValues) override
    {
        functionValues.AssignLogOf(inputFunctionValues);
    }
};

template class LogNode<float>;
template class LogNode<double>;

// -----------------------------------------------------------------------
// ExpNode (input) -- component-wise exp() of input
// -----------------------------------------------------------------------

template<class ElemType>
class ExpNode : public SoftmaxNodeBase<ElemType>
{
    typedef SoftmaxNodeBase<ElemType> Base; UsingSoftmaxNodeBaseMembers;
    static const std::wstring TypeName() { return L"Exp"; }
public:
    DeclareConstructorFromConfigWithNumInputs(ExpNode);
    ExpNode(DEVICEID_TYPE deviceId, const wstring & name) :
        Base(deviceId, name)
    { }

    virtual void /*ComputationNode::*/BackpropTo(const size_t inputIndex, const FrameRange & fr) override
    {
        assert(inputIndex == 0); inputIndex;

        Matrix<ElemType> sliceInputGrad = Input(0)->GradientFor(fr);
        Matrix<ElemType> sliceOutputGrad = GradientFor(fr);
        Matrix<ElemType> sliceInputValue = Input(0)->ValueFor(fr);

        m_gradientTemp->AssignExpOf(sliceInputValue); // Exp(x) is its own partial
        sliceInputGrad.AddElementProductOf(sliceOutputGrad, *m_gradientTemp);
        // TODO: with tensor lib:
        // sliceInputGrad.AddElementProductOf(sliceOutputGrad, functionValues);
        // and set OutputUsed
    }

    virtual bool OutputUsedInComputingInputNodesGradients() const override
    {
        // The ExpNode does not require its output value for computing
        // the gradients of its input nodes
        return false;
    }

    virtual void BackpropToV(Matrix<ElemType>& gradient, const Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues, const Matrix<ElemType>& functionValues) override { NOT_IMPLEMENTED; } // not needed

    void ForwardPropV(Matrix<ElemType>& functionValues, const Matrix<ElemType>& inputFunctionValues) override
    {
        functionValues.AssignExpOf(inputFunctionValues);
    }
};

template class ExpNode<float>;
template class ExpNode<double>;

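The TODO in ExpNode's BackpropTo is worth spelling out: since exp is its own derivative, the backward pass currently recomputes exp(x) into a temporary, but it could instead reuse the forward output and declare that output as used. A hedged scalar sketch of both variants:

#include <cmath>

// Variant used above: recompute the partial exp(x) from the saved input.
double ExpBackwardFromInput(double x, double g) { return g * std::exp(x); }
// Variant in the TODO: reuse y = exp(x) from the forward pass, at the cost
// of declaring the output as used by backprop.
double ExpBackwardFromOutput(double y, double g) { return g * y; }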
// -----------------------------------------------------------------------
// CosineNode (input) -- component-wise cos() of input
// -----------------------------------------------------------------------

template<class ElemType>
class CosineNode : public SoftmaxNodeBase<ElemType>
{
    typedef SoftmaxNodeBase<ElemType> Base; UsingSoftmaxNodeBaseMembers;
    static const std::wstring TypeName() { return L"Cosine"; }
public:
    DeclareConstructorFromConfigWithNumInputs(CosineNode);
    CosineNode(DEVICEID_TYPE deviceId, const wstring & name) :
        Base(deviceId, name)
    { }

    virtual bool OutputUsedInComputingInputNodesGradients() const override
    {
        // The CosineNode does not require its output value for computing
        // the gradients of its input nodes
        return false;
    }

    /*virtual*/ void BackpropToV(Matrix<ElemType>& gradient, const Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues, const Matrix<ElemType>& functionValues)
    {
        gradient.AssignNegativeSineOf(inputFunctionValues); // -sin(x) (x is input to Cosine(x))
        inputGradientValues.AddElementProductOf(gradientValues, gradient);
        // TODO: tensor lib: make a joint kernel, since neg sin is never used for anything else
    }

    /*virtual*/ void ForwardPropV(Matrix<ElemType>& functionValues, const Matrix<ElemType>& inputFunctionValues) override
    {
        functionValues.AssignCosineOf(inputFunctionValues);
    }
};

template class CosineNode<float>;
template class CosineNode<double>;
#endif

// -----------------------------------------------------------------------
/// DummyCriterionNode (objectives, derivatives, prediction)
// -----------------------------------------------------------------------

@@ -28,6 +28,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// -----------------------------------------------------------------------
// LearnableParameter (/*no input*/)
// represents weight matrices and biases
// TODO: add -Node to the class name
// -----------------------------------------------------------------------

template<class ElemType>

@@ -42,18 +43,31 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_parameterUpdateRequired = true;
SetDims(TensorShape(), 0);
}
LearnableParameter(DEVICEID_TYPE deviceId, const wstring & name, size_t rows, size_t cols) :
LearnableParameter(DEVICEID_TYPE deviceId, const wstring & name, const TensorShape & shape) :
    Base(deviceId, name)
{
    m_parameterUpdateRequired = true;
    CreateMatrixIfNull(m_value);
    SetDims(TensorShape(rows), cols);
    // for now we split off the trailing dimension into the matrix column dimension
    // TODO: This is for compat, but it is inconsistent. Decide what a sample layout means for a node without MBLayout w.r.t. non-tensor ops.
    auto dims = shape.GetDims();
    size_t cols = 1;
    if (dims.size() > 1)
    {
        cols = dims.back();
        dims.resize(dims.size() - 1);
    }
    SetDims(TensorShape(dims), cols);
    UpdateFunctionValuesSize(); // this allocates the matrix
    Value().SetValue(0);
}
LearnableParameter(DEVICEID_TYPE deviceId, const wstring & name, size_t rows, size_t cols) :
    LearnableParameter(deviceId, name, TensorShape(rows, cols))
{ }

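The compatibility trick in the TensorShape constructor above, peeling off the trailing dimension as the matrix column count, is easy to get wrong, so here is a hedged sketch of just that transformation (plain vectors instead of TensorShape, hypothetical helper name):

#include <cstddef>
#include <utility>
#include <vector>

// [784, 10]    -> sample layout [784], 10 columns
// [28, 28, 10] -> sample layout [28, 28], 10 columns
// [784]        -> sample layout [784], 1 column (nothing to split off)
std::pair<std::vector<std::size_t>, std::size_t> SplitTrailingDim(std::vector<std::size_t> dims)
{
    std::size_t cols = 1;
    if (dims.size() > 1)
    {
        cols = dims.back();
        dims.pop_back();
    }
    return { std::move(dims), cols };
}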
LearnableParameter(const ScriptableObjects::IConfigRecordPtr configp) :
    LearnableParameter(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"rows"), configp->Get(L"cols"))
    LearnableParameter(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"shape"))
{
    // TODO: Change dimensions to take a generic tensor instead. That will be a (minor) breaking change that will require fix-ups when converting from NDL to BrainScript.
    AttachInputs(configp, this->GetExpectedNumInputs());
    // parameters[rows, [cols=1]] plus other optional parameters (needGradient=[true|false], init=[uniform|gaussian|fixedvalue], initValueScale=[1|float], value=[0|float])
    // TODO: "needGradient" should be renamed to better match m_parameterUpdateRequired

@@ -83,7 +97,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
    Base::Save(fstream);
    fstream << m_parameterUpdateRequired;
    fstream << GetNumRows() << GetNumCols();
    fstream << (size_t)0/*#rows in a legacy file format*/ << GetNumCols();
    m_sampleLayout.Save(fstream);
    fstream << Value();
}

@@ -95,8 +110,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    fstream >> m_parameterUpdateRequired;
    fstream >> rows >> cols;

    SetDims(TensorShape(rows), cols);
    TensorShape sampleLayout;
    if (rows != 0) // legacy file format
        sampleLayout = TensorShape(rows);
    else
        sampleLayout.Load(fstream, /*acceptLegacyFormat=*/true);
    LoadValue(fstream);
    SetDims(sampleLayout, cols); // note: call this after LoadValue() since LoadValue() overwrites m_sampleLayout
}

// initialize with random numbers

@@ -106,13 +126,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    bool initOnCPUOnly) // if true then always init on CPU, making initialization consistent across both (for testing)
{
    size_t inputSize = GetNumCols();
    //fprintf(stderr, "%d x %d: %d %ls\n", (int)GetNumRows(), (int)GetNumCols(), (int)randomSeed, NodeName().c_str());

    // the random seed offset is set via the "randomSeedOffset" parameter in config
    if (initOnCPUOnly)
        m_value->TransferToDeviceIfNotThereAndNotAutoPlace(CPUDEVICE, true);
    if (uniformInit)
    {
        ElemType randRange = 0.05f * initValueScale; //initValueScale/sqrt(inputSize);
        // TODO: move these crazy extra factors out from here and into NDL, and make them visible in BS
        ElemType randRange = 0.05f * initValueScale;
        Value().SetUniformRandomValue(-randRange, randRange, randomSeed);
    }
    else

@@ -221,6 +243,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// InputValueBase (/*no input*/)
// Base class for InputValue and SparseInputValue (typically fed by a DataReader)
// this covers four types: (regular vs. image) x (non-sparse vs. sparse)
// TODO: add -Node to the class names
// -----------------------------------------------------------------------

template<class ElemType>

@@ -228,59 +251,47 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
typedef ComputationNode<ElemType> Base; UsingComputationNodeMembers;

void Init(const TensorShape & sampleLayout, size_t cols, bool isSparse)
void Init(const TensorShape & sampleLayout, bool isSparse)
{
    m_isSparse = isSparse;
    CreateMatrixIfNull(m_value);
    if (isSparse)
        ConvertToSparseMatrix();

    SetDims(sampleLayout, cols);
    SetDims(sampleLayout, 0);
    UpdateFunctionValuesSize(); // we must allocate the matrix so that the readers get objects with valid row dimensions (some readers expect that)
    m_parameterUpdateRequired = false;
}
protected:
InputValueBase(DEVICEID_TYPE deviceId, const wstring & name, const TensorShape & sampleLayout, bool isSparse) :
    Base(deviceId, name)
{
    Init(sampleLayout, isSparse);
}
InputValueBase(DEVICEID_TYPE deviceId, const wstring & name, size_t rows, bool isSparse) :
    InputValueBase(deviceId, name, TensorShape(rows), isSparse)
{ }
InputValueBase(DEVICEID_TYPE deviceId, const wstring & name, bool isSparse) :
    Base(deviceId, name)
{
    Init(TensorShape(), 0, isSparse);
}
InputValueBase(DEVICEID_TYPE deviceId, const wstring & name, size_t rows, size_t cols, bool isSparse) :
    Base(deviceId, name)
{
    Init(TensorShape(rows), cols, isSparse);
}
InputValueBase(DEVICEID_TYPE deviceId, const wstring & name, const TensorShape & imageLayout, size_t numImages, bool isSparse) :
    Base(deviceId, name)
{
    size_t cols = numImages;
    Init(imageLayout, cols, isSparse);
}
    InputValueBase(deviceId, name, TensorShape(), isSparse)
{ }
InputValueBase(const ScriptableObjects::IConfigRecordPtr configp, bool isSparse) :
    Base(configp->Get(L"deviceId"), L"<placeholder>")
{
    AttachInputs(configp, this->GetExpectedNumInputs());
    bool isImage = configp->Get(L"isImage");
    if (!isImage)
    {
        size_t rows = configp->Get(L"rows");
        size_t cols = configp->Get(L"cols");
        Init(TensorShape(rows), cols, isSparse); // no tensor, just a vector
    }
        Init(configp->Get(L"shape"), isSparse);
    else
    {
        size_t cols = configp->Get(L"numImages"); // This is actually the MB size. --TODO: No need to specify it?
        Init(ImageLayoutWHC(configp->Get(L"imageWidth"), configp->Get(L"imageHeight"), configp->Get(L"imageChannels")), cols, isSparse);
    }
        Init(ImageDimensions::AsTensorShape(configp->Get(L"imageWidth"), configp->Get(L"imageHeight"), configp->Get(L"imageChannels"), ImageLayoutKindFrom(configp->Get(L"imageLayout"))), isSparse);
}
public:

virtual void Save(File& fstream) const override
{
    Base::Save(fstream);
    size_t rows = GetNumRows(); // using explicitly typed variables to be 100% symmetrical to Load()
    size_t cols = m_pMBLayout ? 0 : GetNumCols(); // if this Input depends on MB size, we write it as having 0 dimensions
    fstream << rows << cols;
    size_t rows = GetNumRows(); // using explicitly typed variables to be 100% symmetrical to Load()
    size_t colsDummy = 0; // This should not be saved. InputValues are always minibatches.
    fstream << rows << colsDummy;
    m_sampleLayout.Save(fstream);
}

@@ -288,13 +299,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
    Base::Load(fstream, modelVersion);

    size_t rows, cols;
    fstream >> rows >> cols;
    // some older files retained the #columns when saving, which is meaningless
    if (m_pMBLayout)
        cols = 0;
    size_t rows, colsDummy;
    fstream >> rows >> colsDummy;
    TensorShape sampleLayout;
    sampleLayout.Load(fstream);
    sampleLayout.Load(fstream, /*acceptLegacyFormat=*/true);
    // some older files may have inconsistent tensor information
    if (rows != sampleLayout.GetNumElements())
    {

@@ -302,7 +310,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        NodeName().c_str(), string(sampleLayout).c_str(), (int)rows);
        sampleLayout = TensorShape(rows);
    }
    Init(sampleLayout, cols, m_isSparse);
    Init(sampleLayout, m_isSparse);
}

// InputValue must not resize its inputs because that might destroy it. It should already have the correct size.

@@ -347,11 +355,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
InputValue(DEVICEID_TYPE deviceId, const wstring & name) :
    Base(deviceId, name, false)
{ }
InputValue(DEVICEID_TYPE deviceId, const wstring & name, size_t rows, size_t cols) :
    Base(deviceId, name, rows, cols, false)
InputValue(DEVICEID_TYPE deviceId, const wstring & name, size_t rows) :
    Base(deviceId, name, rows, false)
{ }
InputValue(DEVICEID_TYPE deviceId, const wstring & name, const TensorShape & imageLayout, size_t numImages) :
    Base(deviceId, name, imageLayout, numImages, false)
InputValue(DEVICEID_TYPE deviceId, const wstring & name, const TensorShape & sampleLayout) :
    Base(deviceId, name, sampleLayout, false)
{ }
InputValue(const ScriptableObjects::IConfigRecordPtr configp) :
    Base(configp, false)

@@ -376,11 +384,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
SparseInputValue(DEVICEID_TYPE deviceId, const wstring & name) :
    Base(deviceId, name, true)
{ }
SparseInputValue(DEVICEID_TYPE deviceId, const wstring & name, size_t rows, size_t cols) :
    Base(deviceId, name, rows, cols, true)
SparseInputValue(DEVICEID_TYPE deviceId, const wstring & name, size_t rows) :
    Base(deviceId, name, rows, true)
{ }
SparseInputValue(DEVICEID_TYPE deviceId, const wstring & name, const TensorShape & imageLayout, size_t numImages) :
    Base(deviceId, name, imageLayout, numImages, true)
SparseInputValue(DEVICEID_TYPE deviceId, const wstring & name, const TensorShape & imageLayout) :
    Base(deviceId, name, imageLayout, true)
{ }
SparseInputValue(const ScriptableObjects::IConfigRecordPtr configp) :
    Base(configp, true)

@@ -6,10 +6,10 @@
#pragma once

#include "Basics.h"
#include "Matrix.h"
#include "TensorView.h"
#include "ComputationNode.h"
#include "ConvolutionalNodes.h"
#include "Matrix.h"
#include "TensorView.h"

#include <unordered_set>
#include <map>

@@ -44,7 +44,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
virtual void /*ComputationNode::*/BackpropTo(const size_t inputIndex, const FrameRange & fr) override
{
#ifdef ENABLE_TENSORVIEW
    // BUGBUG: This gives us a huge perf hit for Image/QuickE2E.
    static int c = 0; if (c++ == 0) { fprintf(stderr, "#PLUSBP#\n"); }
    size_t rank = DetermineElementwiseTensorRank();
    auto gradient = GradientTensorFor(rank, fr);
    auto inputGradient = Input(inputIndex)->GradientTensorFor(rank, fr.AllowBroadcast());

@@ -53,7 +53,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    if (Input(inputIndex)->GetNumCols() < GetNumCols())
        MaskMissingGradientColumnsToZero(fr);

    inputGradient.DoSumOf(0.0f, inputGradient, gradient, 1.0f);
    inputGradient.AddCopyOf(gradient);
#else
    Matrix<ElemType> gradientValues = GradientFor(fr);
    Matrix<ElemType> functionValues = ValueFor(fr);

@@ -124,11 +124,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
virtual void /*ComputationNode::*/ForwardProp(const FrameRange & fr) override
{
#ifdef ENABLE_TENSORVIEW
    static int c = 0; if (c++ == 0) { fprintf(stderr, "#PLUS#\n"); }
    size_t rank = DetermineElementwiseTensorRank();
    auto result = ValueTensorFor(rank, fr);
    auto input0 = Input(0)->ValueTensorFor(rank, fr.AllowBroadcast());
    auto input1 = Input(1)->ValueTensorFor(rank, fr.AllowBroadcast());
    result.DoSumOf(0.0f, input0, input1, 1.0f);
    result.AssignSumOf(input0, input1);
#else
    Matrix<ElemType> functionValues = ValueFor(fr);
    Matrix<ElemType> inputFunctionValues0 = Input(0)->ValueFor(fr.AllowBroadcast());

@@ -223,10 +224,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    if (Input(inputIndex)->GetNumCols() < GetNumCols())
        MaskMissingGradientColumnsToZero(fr);

    if (sign > 0)
        inputGradient.DoSumOf(0.0f, inputGradient, gradient, 1.0f);
    else
        inputGradient.DoDifferenceOf(0.0f, inputGradient, gradient, 1.0f);
    inputGradient.AddCopyOf(gradient, sign);
#else
    Matrix<ElemType> gradientValues = GradientFor(fr);

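The AddCopyOf(gradient, sign) rewrite collapses the former sum/difference branch into one accumulate with a signed scale: for y = a + b or y = a - b, each input gradient is just the output gradient times +1 or -1. A hedged scalar sketch of that unification (hypothetical helper mirroring, not reproducing, the tensor-lib call):

// grad_in += sign * g, with sign = +1 for the added operand and -1 for
// the subtracted one; one code path serves both PlusNode and MinusNode.
void AccumulateScaled(double& gradIn, double g, double sign) { gradIn += sign * g; }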
@@ -269,12 +267,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
virtual void /*ComputationNode::*/ForwardProp(const FrameRange & fr) override
{
#ifdef ENABLE_TENSORVIEW
    static int c = 0; if (c++ == 0) { fprintf(stderr,"#MINUS#"); }
    static int c = 0; if (c++ == 0) { fprintf(stderr,"#MINUS#\n"); }
    size_t rank = DetermineElementwiseTensorRank();
    auto result = ValueTensorFor(rank, fr);
    auto input0 = Input(0)->ValueTensorFor(rank, fr.AllowBroadcast());
    auto input1 = Input(1)->ValueTensorFor(rank, fr.AllowBroadcast());
    result.DoDifferenceOf(0.0f, input0, input1, 1.0f);
    result.AssignDifferenceOf(input0, input1);
#else
    Matrix<ElemType> functionValues = ValueFor(fr);
    Matrix<ElemType> inputFunctionValues0 = Input(0)->ValueFor(fr.AllowBroadcast());

@@ -307,91 +305,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template class MinusNode<float>;
template class MinusNode<double>;

#if 1// change once we no longer see a perf hit to #ifndef ENABLE_TENSORVIEW
// -----------------------------------------------------------------------
// ScaleNode (scalar scaling factor, matrix)
//
// Identical to ElementTimesNode with tensor lib (broadcasting). Can be removed.
// -----------------------------------------------------------------------

template<class ElemType>
class ScaleNode : public ComputationNode<ElemType>, public NumInputs<2>
{
    typedef ComputationNode<ElemType> Base; UsingComputationNodeMembersBoilerplate;
    static const std::wstring TypeName() { return L"Scale"; }
public:
    DeclareConstructorFromConfigWithNumInputs(ScaleNode);
    ScaleNode(DEVICEID_TYPE deviceId, const wstring & name) :
        Base(deviceId, name)
    { }

    virtual void /*ComputationNode::*/BackpropTo(const size_t inputIndex, const FrameRange & fr) override
    {
#if 0//def ENABLE_TENSORVIEW // This takes a big perf hit since our reduction uses only a single thread in this case. Needs to be fixed.
        size_t rank = DetermineElementwiseTensorRank();
        auto gradient = GradientTensorFor(rank, fr);
        auto inputGradient = Input(inputIndex)->GradientTensorFor(rank, fr.AllowBroadcast());
        auto otherInputValue = Input(1 - inputIndex)->ValueTensorFor(rank, fr.AllowBroadcast());

        // if reduction then mask the respective input(s) (zero out the gaps)
        if (Input(inputIndex)->GetNumCols() < GetNumCols())
            MaskMissingGradientColumnsToZero(fr);
        if (Input(inputIndex)->GetNumCols() < Input(1 - inputIndex)->GetNumCols())
            Input(1 - inputIndex)->MaskMissingValueColumnsToZero(fr);

        inputGradient.DoElementwiseProductOf(1.0f/*add to*/, gradient, otherInputValue, 1.0f);
#else
        if (inputIndex == 0) // left derivative
        {
            // this is a reduction over frames, so we must mask gaps to zero
            Input(0)->Gradient() += Matrix<ElemType>::InnerProductOfMatrices(MaskedGradientFor(fr), Input(1)->MaskedValueFor(fr)); // element-wise product summed up over all
        }
        else if (inputIndex == 1) // right derivative
        {
            Matrix<ElemType> sliceInput1Grad = Input(1)->GradientFor(fr);
            Matrix<ElemType>::Multiply1x1AndWeightedAdd(+1.0f, Input(0)->Value()/*1x1*/, GradientFor(fr), 1.0f, sliceInput1Grad);
        }
#endif
    }

    virtual bool OutputUsedInComputingInputNodesGradients() const override
    {
        // The ScaleNode does not require its output value for computing
        // the gradients of its input nodes
        return false;
    }

    virtual void /*ComputationNode::*/ForwardProp(const FrameRange & fr) override
    {
#ifdef ENABLE_TENSORVIEW
        static int c = 0; if (c++ == 0) { fprintf(stderr, "#SCALE#"); }
        size_t rank = DetermineElementwiseTensorRank();
        auto result = ValueTensorFor(rank, fr);
        auto input0 = Input(0)->ValueTensorFor(rank, fr.AllowBroadcast());
        auto input1 = Input(1)->ValueTensorFor(rank, fr.AllowBroadcast());
        result.DoElementwiseProductOf(0.0f, input0, input1, 1.0f);
#else
        ValueFor(fr).Assign1x1ProductOf(Input(0)->Value()/*1x1*/, Input(1)->ValueFor(fr));
#endif
    }

    virtual void /*ComputationNodeBase::*/Validate(bool isFinalValidationPass) override
    {
        Base::Validate(isFinalValidationPass);
        InferMBLayoutFromInputsForStandardCase();

        // left node must be a scalar
        if (isFinalValidationPass && (Input(0)->GetNumRows() != 1 || Input(0)->GetNumCols() != 1))
            RuntimeError("The left value of ScaleNode must be a scalar value.");

        SetDims(Input(1));
    }
};

template class ScaleNode<float>;
template class ScaleNode<double>;
#endif

// -----------------------------------------------------------------------
// NegateNode (input)
// computes the negative of its input

@ -707,7 +620,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
if (Input(inputIndex)->GetNumCols() < Input(1 - inputIndex)->GetNumCols())
|
||||
Input(1 - inputIndex)->MaskMissingValueColumnsToZero(fr);
|
||||
|
||||
inputGradient.DoElementwiseProductOf(1.0f/*add to*/, gradient, otherInputValue, 1.0f);
|
||||
inputGradient.AddElementwiseProductOf(gradient, otherInputValue);
|
||||
#else
|
||||
Matrix<ElemType> sliceInput0Grad = Input(inputIndex)->GradientFor(fr);
|
||||
Matrix<ElemType> sliceOutputGrad = GradientFor(fr);
|
||||
|
@ -725,12 +638,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
virtual void /*ComputationNode::*/ForwardProp(const FrameRange & fr) override
|
||||
{
|
||||
#ifdef ENABLE_TENSORVIEW
|
||||
static int c = 0; if (c++ == 0) { fprintf(stderr,"#ETIMES#"); }
|
||||
static int c = 0; if (c++ == 0) { fprintf(stderr,"#ETIMES#\n"); }
|
||||
size_t rank = DetermineElementwiseTensorRank();
|
||||
auto result = ValueTensorFor(rank, fr);
|
||||
auto input0 = Input(0)->ValueTensorFor(rank, fr.AllowBroadcast());
|
||||
auto input1 = Input(1)->ValueTensorFor(rank, fr.AllowBroadcast());
|
||||
result.DoElementwiseProductOf(0.0f, input0, input1, 1.0f);
|
||||
result.AssignElementwiseProductOf(input0, input1);
|
||||
#else
|
||||
Matrix<ElemType> sliceInput0Value = Input(0)->ValueFor(fr);
|
||||
Matrix<ElemType> sliceInput1Value = Input(1)->ValueFor(fr);
|
||||
|
@ -745,303 +658,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
template class ElementTimesNode<float>;
|
||||
template class ElementTimesNode<double>;

#if 1 // change once we no longer see a perf hit to #ifndef ENABLE_TENSORVIEW
// -----------------------------------------------------------------------
// RowElementTimesNode (left, right) --TODO: what are left and right?
//
// TODO: This is subsumed by ElementTimes with tensor lib.
// -----------------------------------------------------------------------

template<class ElemType>
class RowElementTimesNode : public ComputationNode<ElemType>, public NumInputs<2>
{
typedef ComputationNode<ElemType> Base; UsingComputationNodeMembersBoilerplate;
static const std::wstring TypeName() { return L"RowElementTimes"; }
public:
DeclareConstructorFromConfigWithNumInputs(RowElementTimesNode);
RowElementTimesNode(DEVICEID_TYPE deviceId, const wstring & name) :
Base(deviceId, name)
{ }

void BackpropToMap(const size_t inputIndex)
{
if (inputIndex > 1)
InvalidArgument("RowElementTimes operation only takes two inputs.");

if (inputIndex == 0)
{
BackpropToLeftS(Input(1)->Value(), Input(0)->Gradient(), Gradient(), *m_tempMatrix);
}
else
{
BackpropToRightS(Input(0)->Value(), Input(1)->Gradient(), Gradient(), *m_tempMatrix);
}
}

virtual void /*ComputationNode::*/BackpropTo(const size_t inputIndex, const FrameRange & fr) override
{
if (fr.IsAllFrames()) { BackpropToMap(inputIndex); return; } // TODO: remove these one by one
Matrix<ElemType> sliceInput0Grad = Input(inputIndex)->GradientFor(fr);
Matrix<ElemType> sliceOutputGrad = GradientFor(fr);

Matrix<ElemType> sliceInput1Value = Input(1 - inputIndex)->ValueFor(fr);

if (inputIndex == 0)
{
BackpropToLeftS(sliceInput1Value, sliceInput0Grad, sliceOutputGrad, *m_tempMatrix);
}
else
{
BackpropToRightS(sliceInput1Value, sliceInput0Grad, sliceOutputGrad, *m_tempMatrix);
}
}

virtual bool OutputUsedInComputingInputNodesGradients() const override
{
// The RowElementTimesNode does not require its output value for computing
// the gradients of its input nodes
return false;
}

//left (input 0) is a matrix
/*TODO: merge with call site*/void BackpropToLeftS(Matrix<ElemType>& input1FunctionValues,
Matrix<ElemType>& input0GradientValues,
const Matrix<ElemType>& gradientValues,
Matrix<ElemType>& tempMatrix)
{
tempMatrix.SetValue(gradientValues);
tempMatrix.RowElementMultiplyWith(input1FunctionValues);
input0GradientValues += tempMatrix;

#if NANCHECK
input0GradientValues.HasNan("RowElementTimes");
#endif
}

//right (input 1) is a row vector
/*TODO: merge with call site*/void BackpropToRightS(Matrix<ElemType>& input0FunctionValues,
Matrix<ElemType>& input1GradientValues,
const Matrix<ElemType>& gradientValues,
Matrix<ElemType>& tempMatrix)
{
tempMatrix.AssignInnerProductOf(gradientValues, input0FunctionValues, true);
input1GradientValues += tempMatrix;

#if NANCHECK
input1GradientValues.HasNan("RowElementTimes");
#endif
}
void ForwardPropMap() // TODO: This is a stop-gap; in most cases, we should just be able to delete this (but need to review one by one)
{
ForwardPropS(Value(), Input(0)->Value(), Input(1)->Value());
}

virtual void /*ComputationNode::*/ForwardProp(const FrameRange & fr) override
{
//if (fr.IsAllFrames()) { ForwardPropMap(); return; }
Matrix<ElemType> sliceInput0Value = Input(0)->ValueFor(fr);
Matrix<ElemType> sliceInput1Value = Input(1)->ValueFor(fr);
Matrix<ElemType> sliceOutputValue = ValueFor(fr);

ForwardPropS(sliceOutputValue, sliceInput0Value, sliceInput1Value);
}

/*TODO: merge with call site*/void ForwardPropS(Matrix<ElemType>& functionValues, const Matrix<ElemType>& input0, const Matrix<ElemType>& input1)
{
functionValues.SetValue(input0);
functionValues.RowElementMultiplyWith(input1);

#if NANCHECK
functionValues.HasNan("RowElementTimes");
#endif
}

virtual void /*ComputationNodeBase::*/Validate(bool isFinalValidationPass) override
{
Base::Validate(isFinalValidationPass);
InferMBLayoutFromInputsForStandardCase();

size_t rows0 = Input(0)->GetNumRows(), cols0 = Input(0)->GetNumCols();
size_t rows1 = Input(1)->GetNumRows(), cols1 = Input(1)->GetNumCols(); rows0;
if (isFinalValidationPass && (cols0 != cols1 || rows1 != 1))
LogicError("RowElementTimes: Either the second operand is not a row vector or the number of columns of operands does not match.");

SetDims(Input(0));
}

//request matrices that are needed for gradient computation
virtual void RequestMatricesBeforeBackprop(MatrixPool& matrixPool)
{
Base::RequestMatricesBeforeBackprop(matrixPool);
RequestMatrixFromPool(m_tempMatrix, matrixPool);
}

//release gradient and temp matrices that are no longer needed after all the children's gradients are computed.
virtual void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool)
{
Base::ReleaseMatricesAfterBackprop(matrixPool);
ReleaseMatrixToPool(m_tempMatrix, matrixPool);
}

private:
shared_ptr<Matrix<ElemType>> m_tempMatrix;
};

template class RowElementTimesNode<float>;
template class RowElementTimesNode<double>;
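The row/column broadcast semantics above are easy to get backwards, so here is a tiny standalone illustration (plain C++, illustrative names only, not CNTK code) of what ForwardPropS and BackpropToRightS compute for a 2x3 matrix and a 1x3 row vector:

    #include <cstdio>

    // Hypothetical stand-alone sketch of RowElementTimes semantics:
    // output(i,j) = input0(i,j) * input1(0,j), i.e. column j is scaled by input1(0,j).
    int main()
    {
        const int R = 2, C = 3;
        double input0[R][C] = { { 1, 2, 3 }, { 4, 5, 6 } };
        double input1[C]    = { 10, 100, 1000 };     // the 1 x C row vector

        // forward: ForwardPropS = SetValue + RowElementMultiplyWith
        double output[R][C];
        for (int i = 0; i < R; i++)
            for (int j = 0; j < C; j++)
                output[i][j] = input0[i][j] * input1[j];

        // backward w.r.t. the row vector: BackpropToRightS reduces over rows,
        // grad1(0,j) += sum_i gradOut(i,j) * input0(i,j) (an inner product per column)
        double gradOut[R][C] = { { 1, 1, 1 }, { 1, 1, 1 } };
        double grad1[C] = { 0, 0, 0 };
        for (int j = 0; j < C; j++)
            for (int i = 0; i < R; i++)
                grad1[j] += gradOut[i][j] * input0[i][j];

        printf("output[1][2] = %g (expected 6000)\n", output[1][2]);
        printf("grad1[0] = %g (expected 1 + 4 = 5)\n", grad1[0]);
    }

ColumnElementTimes below is the transposed counterpart: the second operand is a column vector, row i is scaled by input1(i,0), and the vector gradient reduces over columns instead of rows.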

// -----------------------------------------------------------------------
// ColumnElementTimesNode (left, right) --TODO: what are left and right?
//
// TODO: This is subsumed by ElementTimes with tensor lib.
// -----------------------------------------------------------------------

template<class ElemType>
class ColumnElementTimesNode : public ComputationNode<ElemType>, public NumInputs<2>
{
typedef ComputationNode<ElemType> Base; UsingComputationNodeMembersBoilerplate;
static const std::wstring TypeName() { return L"ColumnElementTimes"; }
public:
DeclareConstructorFromConfigWithNumInputs(ColumnElementTimesNode);
ColumnElementTimesNode(DEVICEID_TYPE deviceId, const wstring & name) :
Base(deviceId, name)
{ }

void BackpropToMap(const size_t inputIndex)
{
if (inputIndex > 1)
InvalidArgument("ColumnElementTimes operation only takes two inputs.");

if (inputIndex == 0)
{
BackpropToLeftS(Input(1)->Value(), Input(0)->Gradient(), Gradient(), *m_tempMatrix);
}
else
{
BackpropToRightS(Input(0)->Value(), Input(1)->Gradient(), Gradient(), *m_tempMatrix);
}
}

virtual void /*ComputationNode::*/BackpropTo(const size_t inputIndex, const FrameRange & fr) override
{
if (fr.IsAllFrames()) { BackpropToMap(inputIndex); return; } // TODO: remove these one by one
Matrix<ElemType> sliceOutputGrad = GradientFor(fr);

if (inputIndex == 0)
{
Matrix<ElemType> sliceInput0Grad = Input(0)->GradientFor(fr);

BackpropToLeftS(Input(1)->Value(), sliceInput0Grad, sliceOutputGrad, *m_tempMatrix);
}
else
{
Matrix<ElemType> sliceInput0Value = Input(0)->ValueFor(fr);
BackpropToRightS(sliceInput0Value, Input(1)->Gradient(), sliceOutputGrad, *m_tempMatrix);
}
}

virtual bool OutputUsedInComputingInputNodesGradients() const override
{
// The ColumnElementTimesNode does not require its output value for computing
// the gradients of its input nodes
return false;
}

//left (input 0) is a matrix
/*TODO: merge with call site*/void BackpropToLeftS(Matrix<ElemType>& input1FunctionValues,
Matrix<ElemType>& input0GradientValues,
const Matrix<ElemType>& gradientValues,
Matrix<ElemType>& tempMatrix)
{
tempMatrix.SetValue(gradientValues);
tempMatrix.ColumnElementMultiplyWith(input1FunctionValues);
input0GradientValues += tempMatrix;

#if NANCHECK
input0GradientValues.HasNan("ColumnElementTimes");
#endif
}

//right (input 1) is a col vector
/*TODO: merge with call site*/void BackpropToRightS(Matrix<ElemType>& input0FunctionValues,
Matrix<ElemType>& input1GradientValues,
const Matrix<ElemType>& gradientValues,
Matrix<ElemType>& tempMatrix)
{
tempMatrix.AssignInnerProductOf(gradientValues, input0FunctionValues, false);
input1GradientValues += tempMatrix;

#if NANCHECK
input1GradientValues.HasNan("ColumnElementTimes");
#endif
}
void ForwardPropMap() // TODO: This is a stop-gap; in most cases, we should just be able to delete this (but need to review one by one)
{
ForwardPropS(Value(), Input(0)->Value(), Input(1)->Value());
}

virtual void /*ComputationNode::*/ForwardProp(const FrameRange & fr) override
{
//if (fr.IsAllFrames()) { ForwardPropMap(); return; }
Matrix<ElemType> sliceInput0Value = Input(0)->ValueFor(fr);
Matrix<ElemType> sliceOutputValue = ValueFor(fr);

ForwardPropS(sliceOutputValue, sliceInput0Value, Input(1)->Value());
}

/*TODO: merge with call site*/void ForwardPropS(Matrix<ElemType>& functionValues, const Matrix<ElemType>& input0, const Matrix<ElemType>& input1)
{
functionValues.SetValue(input0);
functionValues.ColumnElementMultiplyWith(input1);

#if NANCHECK
functionValues.HasNan("ColumnElementTimes");
#endif
}

virtual void /*ComputationNodeBase::*/Validate(bool isFinalValidationPass) override
{
Base::Validate(isFinalValidationPass);
InferMBLayoutFromInputsForStandardCase();

//derive number of rows if possible
for (size_t index = 0; index < 2; index++)
{
size_t rows = Input(index)->GetNumRows() == 0 ? Input(1 - index)->GetNumRows() : Input(index)->GetNumRows();
size_t cols = Input(index)->GetNumCols() == 0 ? Input(1 - index)->GetNumCols() : Input(index)->GetNumCols();
ValidateInferInputDims(index, rows, cols);
}

size_t rows0 = Input(0)->GetNumRows(), cols0 = Input(0)->GetNumCols();
size_t rows1 = Input(1)->GetNumRows(), cols1 = Input(1)->GetNumCols(); cols0;
if (isFinalValidationPass && (rows0 != rows1 || cols1 != 1))
LogicError("ColumnElementTimes: Either the second operand is not a column vector or the number of rows of operands does not match.");

SetDims(Input(0));
}

//request matrices that are needed for gradient computation
virtual void RequestMatricesBeforeBackprop(MatrixPool& matrixPool)
{
Base::RequestMatricesBeforeBackprop(matrixPool);
RequestMatrixFromPool(m_tempMatrix, matrixPool);
}

//release gradient and temp matrices that are no longer needed after all the children's gradients are computed.
virtual void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool)
{
Base::ReleaseMatricesAfterBackprop(matrixPool);
ReleaseMatrixToPool(m_tempMatrix, matrixPool);
}

private:
shared_ptr<Matrix<ElemType>> m_tempMatrix;
};

template class ColumnElementTimesNode<float>;
template class ColumnElementTimesNode<double>;
#endif

// -----------------------------------------------------------------------
// DiagTimesNode (vector representing the diagonal of a square matrix, data)
// -----------------------------------------------------------------------

@@ -1195,7 +811,6 @@ private:
{
Base::Validate(isFinalValidationPass);
m_pMBLayout = nullptr; // this node does not hold mini-batch data

SetDims(TensorShape(1), 1);
}
};

@@ -1207,6 +822,7 @@ private:
// SumColumnElementsNode (input)
// sums up each column of the input
// TODO: This should be deprecated, in favor of a reduce node.
// TODO: Implement this with the tensor library.
// -----------------------------------------------------------------------

template<class ElemType>

@@ -5,6 +5,11 @@
//
#pragma once

#include "Basics.h"
#include "ComputationNode.h"
#include "Matrix.h"
#include "TensorView.h"

#include <unordered_set>
#include <map>
#include <string>

@@ -18,27 +23,111 @@
#include <sstream>
#include <iostream>

#include "Basics.h"
#include "Matrix.h"
#include "ComputationNode.h"

namespace Microsoft { namespace MSR { namespace CNTK {

#ifdef ENABLE_TENSORVIEW

// -----------------------------------------------------------------------
// NonlinearityNodeBase (input) -- abstract base class that holds what's shared
// between non-linearity nodes like Sigmoid
// UnaryElementWiseWithOpCodeNodeBase (input) -- base for elementwise unary op
// where forward and backward are single ElementWiseOperator opcodes and
// only inputs (but not function values) are used.
// -----------------------------------------------------------------------

template<class ElemType, ElementWiseOperator opForward, ElementWiseOperator opBackward, bool gradientFromOutput>
class UnaryElementWiseWithOpCodeNodeBase : public ComputationNode<ElemType>, public NumInputs<1>
{
typedef ComputationNode<ElemType> Base; UsingComputationNodeMembers;
public:
UnaryElementWiseWithOpCodeNodeBase(DEVICEID_TYPE deviceId, const wstring & name) :
Base(deviceId, name)
{ }

virtual void /*ComputationNode::*/ForwardProp(const FrameRange & fr) override
{
static int c = 0; if (c++ == 0) { fprintf(stderr, "#NLop%d#\n", (int)opForward); }

size_t rank = DetermineElementwiseTensorRank();
auto result = ValueTensorFor(rank, fr);
auto input = Input(0)->ValueTensorFor(rank, fr);
result.DoUnaryOpOf(0, input, 1, opForward);
}

virtual void /*ComputationNode::*/BackpropTo(const size_t inputIndex, const FrameRange & fr) override
{
assert(inputIndex == 0); inputIndex;

// get the args
size_t rank = DetermineElementwiseTensorRank();
auto sliceOutputGrad = GradientTensorFor(rank, fr); // propagate from this one...
auto sliceInputGrad = Input(0)->GradientTensorFor(rank, fr); // ...to this one
auto sliceValue = gradientFromOutput ? ValueTensorFor(rank, fr) : // using input or output value
Input(0)->ValueTensorFor(rank, fr);
// If the gradient can be computed from the output rather than the input, then that's better for mem sharing (and faster in most cases).
// Not possible for Cos().
sliceInputGrad.DoBinaryOpOf(1, sliceOutputGrad, sliceValue, 1, opBackward);
}

virtual void /*ComputationNodeBase::*/Validate(bool isFinalValidationPass) override
{
ValidateUnaryMap(isFinalValidationPass);
}

// We don't need our output values in backprop.
virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; }
};

#define UnaryElementWiseWithOpCodeNodeBaseMembers UsingComputationNodeMembersBoilerplate;

// -----------------------------------------------------------------------
// SigmoidNode (input)
// TanhNode (input)
// RectifiedLinearNode (input)
// LogNode (input)
// ExpNode (input)
// CosineNode (input)
// These are all implemented by single-opcode functions and can thus be declared by a macro.
// -----------------------------------------------------------------------

#pragma push_macro("DeclareUnaryTensorOp")
#define DeclareUnaryElementWiseWithOpCodeNode(Name, Forward, Backward, gradientFromOutput) \
template<class ElemType> \
class Name ## Node : public UnaryElementWiseWithOpCodeNodeBase<ElemType, op ## Forward, op ## Backward, gradientFromOutput> \
{ \
typedef UnaryElementWiseWithOpCodeNodeBase<ElemType, op ## Forward, op ## Backward, gradientFromOutput> Base; UnaryElementWiseWithOpCodeNodeBaseMembers; \
static const std::wstring TypeName() { return L ## #Name; } \
public: \
DeclareConstructorFromConfigWithNumInputs(Name ## Node); \
Name ## Node(DEVICEID_TYPE deviceId, const wstring & Name) : \
Base(deviceId, Name) \
{ } \
}

// Name Forward and Backward opcodes
DeclareUnaryElementWiseWithOpCodeNode(Sigmoid, Sigmoid, ElementwiseProductWithSigmoidDerivativeFromOutput, true);
DeclareUnaryElementWiseWithOpCodeNode(Tanh, Tanh, ElementwiseProductWithTanhDerivativeFromOutput, true);
DeclareUnaryElementWiseWithOpCodeNode(RectifiedLinear, LinearRectifier, ElementwiseProductWithLinearRectifierDerivativeFromOutput, true);
DeclareUnaryElementWiseWithOpCodeNode(Log, Log, ElementwiseProductWithLogDerivativeFromOutput, true);
DeclareUnaryElementWiseWithOpCodeNode(Exp, Exp, ElementwiseProduct, true);
DeclareUnaryElementWiseWithOpCodeNode(Cosine, Cosine, ElementwiseProductWithCosDerivative, false);

#pragma pop_macro("DeclareUnaryTensorOp")
#endif
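A note on the gradientFromOutput flag in the declarations above: for most of these ops the derivative can be written in terms of the output y = f(x) alone, which lets the backward pass reuse the (possibly memory-shared) output tensor instead of keeping the input alive. A standalone sketch (plain C++ with illustrative names, not the opcode implementations themselves):

    #include <cmath>
    #include <cstdio>

    // Derivatives written in terms of the *output* y = f(x), as assumed by
    // gradientFromOutput = true above:
    double dSigmoid_fromOutput(double y) { return y * (1 - y); }        // f' = y(1-y)
    double dTanh_fromOutput(double y)    { return 1 - y * y; }          // f' = 1-y^2
    double dRelu_fromOutput(double y)    { return y > 0 ? 1.0 : 0.0; }  // the sign survives f
    double dExp_fromOutput(double y)     { return y; }                  // exp is its own derivative
    double dLog_fromOutput(double y)     { return std::exp(-y); }       // 1/x = exp(-log x)

    // Cosine is the odd one out: cos'(x) = -sin(x) cannot be recovered from
    // cos(x) alone (the sign of sin(x) is lost), hence gradientFromOutput = false.
    double dCos_fromInput(double x)      { return -std::sin(x); }

    int main()
    {
        double x = 0.3, eps = 1e-6;
        double y = 1 / (1 + std::exp(-x));                        // sigmoid forward
        double num = (1 / (1 + std::exp(-(x + eps))) - y) / eps;  // numeric check
        printf("sigmoid': analytic %g vs numeric %g\n", dSigmoid_fromOutput(y), num);
    }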

// -----------------------------------------------------------------------
// SoftmaxNodeBase (input) -- shared base of Softmax and LogSoftmax
// -----------------------------------------------------------------------

// shared base for all element-wise non-linearities
// What this adds over a ComputationNode<ElemType> is a member m_gradientTemp for temp use by derived classes.
// TODO: This was used more broadly, but no longer, so we may be able to simplify the signatures of the virtual functions.
template<class ElemType>
class NonlinearityNodeBase : public ComputationNode<ElemType>, public NumInputs<1>
class SoftmaxNodeBase : public ComputationNode<ElemType>, public NumInputs<1>
{
typedef ComputationNode<ElemType> Base; UsingComputationNodeMembers;
public:
//virtual ComputationNodeBase * NewThis(DEVICEID_TYPE deviceId, const wstring & name) = 0;
DeclareConstructorFromConfigWithNumInputs(NonlinearityNodeBase);
NonlinearityNodeBase(DEVICEID_TYPE deviceId, const wstring & name) :
DeclareConstructorFromConfigWithNumInputs(SoftmaxNodeBase);
SoftmaxNodeBase(DEVICEID_TYPE deviceId, const wstring & name) :
Base(deviceId, name)
{ }

@@ -54,7 +143,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
auto sliceOutputValue = OutputUsedInComputingInputNodesGradients() ? ValueFor(fr) : Matrix<ElemType>();

// do the actual operation
// TODO: Once all is unified then make the order of arguments more logical (in -> out)
BackpropToV(*m_gradientTemp, sliceInputValue, sliceInputGrad, sliceOutputGrad, sliceOutputValue);
}

@@ -80,7 +168,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Base::CopyTo(nodeP, newName, flags);
if (flags & CopyNodeFlags::copyNodeValue)
{
auto node = dynamic_pointer_cast<NonlinearityNodeBase<ElemType>>(nodeP);
auto node = dynamic_pointer_cast<SoftmaxNodeBase<ElemType>>(nodeP);
*node->m_gradientTemp = *m_gradientTemp;
}
}

@@ -102,296 +190,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
shared_ptr<Matrix<ElemType>> m_gradientTemp;
};

#define UsingNonlinearityNodeBaseMembers UsingComputationNodeMembersBoilerplate; using Base::m_gradientTemp

// -----------------------------------------------------------------------
// RectifiedLinearNode (input) -- ReLU non-linearity
// -----------------------------------------------------------------------

template<class ElemType>
class RectifiedLinearNode : public NonlinearityNodeBase<ElemType>
{
typedef NonlinearityNodeBase<ElemType> Base; UsingNonlinearityNodeBaseMembers;
static const std::wstring TypeName() { return L"RectifiedLinear"; }
public:
DeclareConstructorFromConfigWithNumInputs(RectifiedLinearNode);
RectifiedLinearNode(DEVICEID_TYPE deviceId, const wstring & name) :
NonlinearityNodeBase<ElemType>(deviceId, name)
{ }

void BackpropToV(Matrix<ElemType>& gradient, const Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues, const Matrix<ElemType>& functionValues) override
{
gradient.AssignLinearRectifierDerivativeOf(inputFunctionValues);
#if DUMPOUTPUT
inputGradientValues.Print("RectifiedLinearNode-Partial-in");
#endif
inputGradientValues.AddElementProductOf(gradientValues, gradient);
#if DUMPOUTPUT
inputGradientValues.Print("RectifiedLinearNode-Partial-out");
#endif
}

virtual bool OutputUsedInComputingInputNodesGradients() const override
{
// The ReLU node does not require its output value for computing
// the gradients of its input nodes
return false;
}

void ForwardPropV(Matrix<ElemType>& functionValues, const Matrix<ElemType>& inputFunctionValues) override
{
functionValues.AssignTruncateBottomOf(inputFunctionValues, 0);
#if DUMPOUTPUT
functionValues.Print("RectifiedLinearNode");
#endif
}
};

template class RectifiedLinearNode<float>;
template class RectifiedLinearNode<double>;

// -----------------------------------------------------------------------
// SigmoidNode (input) -- sigmoid non-linearity
// -----------------------------------------------------------------------

template<class ElemType>
class SigmoidNode : public NonlinearityNodeBase<ElemType>
{
typedef NonlinearityNodeBase<ElemType> Base; UsingNonlinearityNodeBaseMembers;
static const std::wstring TypeName() { return L"Sigmoid"; }
public:
DeclareConstructorFromConfigWithNumInputs(SigmoidNode);
SigmoidNode(DEVICEID_TYPE deviceId, const wstring & name) :
NonlinearityNodeBase<ElemType>(deviceId, name)
{ }

#ifdef ENABLE_TENSORVIEW
// TODO: Once tensor lib works, we will change all nodes in here to use it. Then move ForwardProp() and BackpropTo() from here into base.
virtual void /*ComputationNode::*/ForwardProp(const FrameRange & fr) override
{
size_t rank = DetermineElementwiseTensorRank();
auto result = ValueTensorFor(rank, fr);
auto input = Input(0)->ValueTensorFor(rank, fr);
ForwardPropV(input, result);
}

/*virtual*/ void ForwardPropV(const TensorView<ElemType>& input, TensorView<ElemType>& result) //override
{
result.AssignSigmoidOf(input);
}

virtual void /*IComputationNode::*/BeginBackprop() override // called before first iteration step of ComputeGradient()
{
m_gradientTemp->Resize(GetNumRows(), GetNumCols());
}

virtual void /*ComputationNode::*/BackpropTo(const size_t inputIndex, const FrameRange & fr) override
{
assert(inputIndex == 0); inputIndex;

// get the args
// Some do not consume input and/or output values. Don't touch those, pass dummies instead, since memshare may have taken them away already.
size_t rank = DetermineElementwiseTensorRank();
auto sliceOutputGrad = GradientTensorFor(rank, fr); // propagate from this one...
auto sliceInputGrad = Input(0)->GradientTensorFor(rank, fr); // ...to this one
auto sliceInputValue = InputUsedInComputingInputNodesGradients(0) ? Input(0)->ValueTensorFor(rank, fr) : TensorView<ElemType>();
auto sliceOutputValue = OutputUsedInComputingInputNodesGradients() ? ValueTensorFor(rank, fr) : TensorView<ElemType>();

// do the actual operation
// TODO: Once all is unified then make the order of arguments more logical (in -> out)
BackpropToV(DataTensorFor(*m_gradientTemp, rank, fr), sliceInputValue, sliceInputGrad, sliceOutputGrad, sliceOutputValue);
}

/*virtual*/ void BackpropToV(TensorView<ElemType> gradient, const TensorView<ElemType>& inputFunctionValues, TensorView<ElemType> inputGradientValues, const TensorView<ElemType>& gradientValues, const TensorView<ElemType>& functionValues)
{
gradient.AssignSigmoidDerivativeOf(inputFunctionValues);
inputGradientValues.AddElementwiseProductOf(gradientValues, gradient);
}

virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; }
#else
virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override
{
// The Sigmoid node does not require any of its inputs' values for computing
// the gradients of its input nodes
UNREFERENCED_PARAMETER(childIndex);
return false;
}
#endif

/*virtual*/ void BackpropToV(Matrix<ElemType>& gradient, const Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues, const Matrix<ElemType>& functionValues)
{
gradient.AssignSigmoidDerivativeOf(functionValues);
inputGradientValues.AddElementProductOf(gradientValues, gradient);
}

/*virtual*/ void ForwardPropV(Matrix<ElemType>& functionValues, const Matrix<ElemType>& inputFunctionValues) override
{
functionValues.AssignSigmoidOf(inputFunctionValues);
}
};

template class SigmoidNode<float>;
template class SigmoidNode<double>;

// -----------------------------------------------------------------------
// TanhNode (input) -- tanh non-linearity
// -----------------------------------------------------------------------

template<class ElemType>
class TanhNode : public NonlinearityNodeBase<ElemType>
{
typedef NonlinearityNodeBase<ElemType> Base; UsingNonlinearityNodeBaseMembers;
static const std::wstring TypeName() { return L"Tanh"; }
public:
DeclareConstructorFromConfigWithNumInputs(TanhNode);
TanhNode(DEVICEID_TYPE deviceId, const wstring & name) :
NonlinearityNodeBase<ElemType>(deviceId, name)
{ }

virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override
{
// The Tanh node does not require any of its inputs' values for computing
// the gradients of its input nodes
UNREFERENCED_PARAMETER(childIndex);
return false;
}

/*virtual*/ void BackpropToV(Matrix<ElemType>& gradient, const Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues, const Matrix<ElemType>& functionValues)
{
gradient.AssignElementProductOf(functionValues, functionValues); // v .* v
gradient.AssignDifferenceOf(1, gradient); // 1-v^2

inputGradientValues.AddElementProductOf(gradientValues, gradient); // += d .* (1 - v .* v)
}

/*virtual*/ void ForwardPropV(Matrix<ElemType>& functionValues, const Matrix<ElemType>& inputFunctionValues) override
{
functionValues.AssignTanhOf(inputFunctionValues);
}
};

template class TanhNode<float>;
template class TanhNode<double>;

// -----------------------------------------------------------------------
// LogNode (input) -- component-wise log() of input
// -----------------------------------------------------------------------

template<class ElemType>
class LogNode : public NonlinearityNodeBase<ElemType>
{
typedef NonlinearityNodeBase<ElemType> Base; UsingNonlinearityNodeBaseMembers;
static const std::wstring TypeName() { return L"Log"; }
public:
DeclareConstructorFromConfigWithNumInputs(LogNode);
LogNode(DEVICEID_TYPE deviceId, const wstring & name) :
NonlinearityNodeBase<ElemType>(deviceId, name)
{ }

virtual bool OutputUsedInComputingInputNodesGradients() const override
{
// The Log node does not require its output value for computing
// the gradients of its input nodes
return false;
}

/*virtual*/ void BackpropToV(Matrix<ElemType>& gradient, const Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues, const Matrix<ElemType>& functionValues)
{
gradient.AssignElementInverseOf(inputFunctionValues); // 1/x (x is input to log(x))
inputGradientValues.AddElementProductOf(gradientValues, gradient);
}

/*virtual*/ void ForwardPropV(Matrix<ElemType>& functionValues, const Matrix<ElemType>& inputFunctionValues) override
{
functionValues.AssignLogOf(inputFunctionValues);
}
};

template class LogNode<float>;
template class LogNode<double>;

// -----------------------------------------------------------------------
// ExpNode (input) -- component-wise exp() of input
// -----------------------------------------------------------------------

template<class ElemType>
class ExpNode : public NonlinearityNodeBase<ElemType>
{
typedef NonlinearityNodeBase<ElemType> Base; UsingNonlinearityNodeBaseMembers;
static const std::wstring TypeName() { return L"Exp"; }
public:
DeclareConstructorFromConfigWithNumInputs(ExpNode);
ExpNode(DEVICEID_TYPE deviceId, const wstring & name) :
NonlinearityNodeBase<ElemType>(deviceId, name)
{ }

virtual void /*ComputationNode::*/BackpropTo(const size_t inputIndex, const FrameRange & fr) override
{
assert(inputIndex == 0); inputIndex;

Matrix<ElemType> sliceInputGrad = Input(0)->GradientFor(fr);
Matrix<ElemType> sliceOutputGrad = GradientFor(fr);
Matrix<ElemType> sliceInputValue = Input(0)->ValueFor(fr);

m_gradientTemp->AssignExpOf(sliceInputValue); // Exp(x) is its own partial
sliceInputGrad.AddElementProductOf(sliceOutputGrad, *m_gradientTemp);
}

virtual bool OutputUsedInComputingInputNodesGradients() const override
{
// The ExpNode does not require its output value for computing
// the gradients of its input nodes
return false;
}

virtual void BackpropToV(Matrix<ElemType>& gradient, const Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues, const Matrix<ElemType>& functionValues) override { NOT_IMPLEMENTED; } // not needed

void ForwardPropV(Matrix<ElemType>& functionValues, const Matrix<ElemType>& inputFunctionValues) override
{
functionValues.AssignExpOf(inputFunctionValues);
}
};

template class ExpNode<float>;
template class ExpNode<double>;

// -----------------------------------------------------------------------
// CosineNode (input) -- component-wise cos() of input
// -----------------------------------------------------------------------

template<class ElemType>
class CosineNode : public NonlinearityNodeBase<ElemType>
{
typedef NonlinearityNodeBase<ElemType> Base; UsingNonlinearityNodeBaseMembers;
static const std::wstring TypeName() { return L"Cosine"; }
public:
DeclareConstructorFromConfigWithNumInputs(CosineNode);
CosineNode(DEVICEID_TYPE deviceId, const wstring & name) :
NonlinearityNodeBase<ElemType>(deviceId, name)
{ }

virtual bool OutputUsedInComputingInputNodesGradients() const override
{
// The CosineNode does not require its output value for computing
// the gradients of its input nodes
return false;
}

/*virtual*/ void BackpropToV(Matrix<ElemType>& gradient, const Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues, const Matrix<ElemType>& functionValues)
{
gradient.AssignNegativeSineOf(inputFunctionValues); // -sin(x) (x is input to Cosine(x))
inputGradientValues.AddElementProductOf(gradientValues, gradient);
}

/*virtual*/ void ForwardPropV(Matrix<ElemType>& functionValues, const Matrix<ElemType>& inputFunctionValues) override
{
functionValues.AssignCosineOf(inputFunctionValues);
}
};

template class CosineNode<float>;
template class CosineNode<double>;
#define UsingSoftmaxNodeBaseMembers UsingComputationNodeMembersBoilerplate; using Base::m_gradientTemp

// -----------------------------------------------------------------------
// SoftmaxNode (input) -- soft-max over input vector(s)
@@ -400,14 +199,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
//we assume it's column-wise by default
//the derivative will increase the Matrix<ElemType> size to the power of column size and should not be used.
template<class ElemType>
class SoftmaxNode : public NonlinearityNodeBase<ElemType>
class SoftmaxNode : public SoftmaxNodeBase<ElemType>
{
typedef NonlinearityNodeBase<ElemType> Base; UsingNonlinearityNodeBaseMembers;
typedef SoftmaxNodeBase<ElemType> Base; UsingSoftmaxNodeBaseMembers;
static const std::wstring TypeName() { return L"Softmax"; }
public:
DeclareConstructorFromConfigWithNumInputs(SoftmaxNode);
SoftmaxNode(DEVICEID_TYPE deviceId, const wstring & name) :
NonlinearityNodeBase<ElemType>(deviceId, name)
Base(deviceId, name)
{ }

virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override

@@ -467,14 +266,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// -----------------------------------------------------------------------

template<class ElemType>
class LogSoftmaxNode : public NonlinearityNodeBase<ElemType>
class LogSoftmaxNode : public SoftmaxNodeBase<ElemType>
{
typedef NonlinearityNodeBase<ElemType> Base; UsingNonlinearityNodeBaseMembers;
typedef SoftmaxNodeBase<ElemType> Base; UsingSoftmaxNodeBaseMembers;
static const std::wstring TypeName() { return L"LogSoftmax"; }
public:
DeclareConstructorFromConfigWithNumInputs(LogSoftmaxNode);
LogSoftmaxNode(DEVICEID_TYPE deviceId, const wstring & name) :
NonlinearityNodeBase<ElemType>(deviceId, name)
Base(deviceId, name)
{ }

virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override

@@ -1040,9 +839,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// this node is not differentiable and so cannot be used in the backpropagation
// TODO: make function value sparse?
template<class ElemType>
class HardmaxNode : public NonlinearityNodeBase/*ComputationNode*/<ElemType>
class HardmaxNode : public SoftmaxNodeBase/*ComputationNode*/<ElemType>
{
typedef NonlinearityNodeBase<ElemType> Base; UsingNonlinearityNodeBaseMembers;
typedef SoftmaxNodeBase<ElemType> Base; UsingSoftmaxNodeBaseMembers;
static const std::wstring TypeName() { return L"Hardmax"; }

public:

@@ -5,6 +5,11 @@
//
#pragma once

#include "Basics.h"
#include "Matrix.h"
#include "TensorShape.h"
#include "ComputationNode.h"

#include <unordered_set>
#include <map>
#include <string>

@@ -18,10 +23,6 @@
#include <sstream>
#include <iostream>

#include "Basics.h"
#include "Matrix.h"
#include "ComputationNode.h"

namespace Microsoft { namespace MSR { namespace CNTK {

// -----------------------------------------------------------------------

@@ -86,33 +87,31 @@ namespace Microsoft { namespace MSR { namespace CNTK {
typedef std::shared_ptr<DelayedValueNodeState<ElemType>> DelayedNodeStatePtr;
static const std::wstring TypeName() { return L"DelayedValue"; }
private:
void Init(size_t row_size, size_t col_size, ElemType initialActivationValue = (ElemType)DEFAULT_HIDDEN_ACTIVATION)
void Init(const TensorShape & sampleLayout, ElemType initialActivationValue)
{
m_initialActivationValue = initialActivationValue;
m_timeStep = 1;
CreateMatrixIfNull(m_value);
SetDims(TensorShape(row_size), col_size); // TODO: needed? Can we not infer it? How about setting a sample layout?
m_isHistoryCarryOverManagedExternally = false; // used for PairNetworkNode/PastValueNode combination
SetDims(sampleLayout, 0); // TODO: needed? Can we not infer it? How about setting a sample layout?
m_isHistoryCarryOverManagedExternally = false; // used for PairNetworkNode/PastValueNode combination, which is deprecated
m_value->SetValue(m_initialActivationValue); // is this needed?
}
protected:
DelayedValueNodeBase(DEVICEID_TYPE deviceId, const wstring & name) :
Base(deviceId, name),
m_delayedActivation(deviceId)
{
Init(1, 1);
Init(TensorShape(), (ElemType)DEFAULT_HIDDEN_ACTIVATION);
}
DelayedValueNodeBase(DEVICEID_TYPE deviceId, const wstring & name, ElemType initialActivationValue, size_t row_size, size_t col_size, size_t timeStep) :
DelayedValueNodeBase(DEVICEID_TYPE deviceId, const wstring & name, ElemType initialActivationValue, const TensorShape & sampleLayout, size_t timeStep) :
Base(deviceId, name),
m_delayedActivation(deviceId)
{
Init(row_size, col_size, initialActivationValue);

m_timeStep = (int)timeStep;

m_value->SetValue(m_initialActivationValue);
Init(sampleLayout, initialActivationValue);
m_timeStep = (int)timeStep; // TODO: pass this to Init() instead as well
}
DelayedValueNodeBase(const ScriptableObjects::IConfigRecordPtr configp) :
DelayedValueNodeBase(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"defaultHiddenActivation"), configp->Get(L"rows"), configp->Get(L"cols"), configp->Get(L"timeStep"))
DelayedValueNodeBase(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"defaultHiddenActivation"), configp->Get(L"shape"), configp->Get(L"timeStep"))
{
// We do NOT attach the inputs, as we cannot resolve them without causing a circular reference.
// Instead, we capture them in a lambda, which will be called by ComputationNetwork during the build process through LateAttachInputs() below.

@@ -593,8 +592,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
PastValueNode(DEVICEID_TYPE deviceId, const wstring & name) :
Base(deviceId, name)
{ }
PastValueNode(DEVICEID_TYPE deviceId, const wstring & name, ElemType initialActivationValue, size_t row_size, size_t col_size, size_t timeStep) :
Base(deviceId, name, initialActivationValue, row_size, col_size, timeStep)
PastValueNode(DEVICEID_TYPE deviceId, const wstring & name, ElemType initialActivationValue, const TensorShape & sampleLayout, size_t timeStep) :
Base(deviceId, name, initialActivationValue, sampleLayout, timeStep)
{ }
PastValueNode(DEVICEID_TYPE deviceId, const wstring & name, ElemType initialActivationValue, size_t numRows, size_t timeStep) :
PastValueNode(deviceId, name, initialActivationValue, TensorShape(numRows), timeStep)
{ }
PastValueNode(const ScriptableObjects::IConfigRecordPtr configp) :
Base(configp)

@@ -619,8 +621,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
FutureValueNode(DEVICEID_TYPE deviceId, const wstring & name) :
Base(deviceId, name)
{ }
FutureValueNode(DEVICEID_TYPE deviceId, const wstring & name, ElemType initialActivationValue, size_t row_size, size_t col_size, size_t timeStep) :
Base(deviceId, name, initialActivationValue, row_size, col_size, timeStep)
FutureValueNode(DEVICEID_TYPE deviceId, const wstring & name, ElemType initialActivationValue, const TensorShape & sampleLayout, size_t timeStep) :
Base(deviceId, name, initialActivationValue, sampleLayout, timeStep)
{ }
FutureValueNode(DEVICEID_TYPE deviceId, const wstring & name, ElemType initialActivationValue, size_t numRows, size_t timeStep) :
FutureValueNode(deviceId, name, initialActivationValue, TensorShape(numRows), timeStep)
{ }
FutureValueNode(const ScriptableObjects::IConfigRecordPtr configp) :
Base(configp)
@@ -126,8 +126,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {

#define UsingReinterpretNodeBaseMembers UsingComputationNodeMembersBoilerplate

// TODO: This ReshapeNode is currently not used. Its function will be taken over by Transpose and the Reshape that follows this one below.

// -----------------------------------------------------------------------
// ReshapeNode (input) -- reinterpret input matrix as having different dimensions
// DeprecatedReshapeNode (input) -- reinterpret input matrix as having different dimensions
// where the new row dimension is given, and the column dimension is inferred.
// Also optionally associate a different TensorShape with the data.
//

@@ -149,7 +151,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// E.g. ReinterpretRowStackAsSequence and ReinterpretSequenceAsRowStack.
// BUGBUG: This is not actually implemented yet. Instead, it goes from 1 to K steps or from K to 1 step. This is temporary/experimental, until the plumbing for nesting is there.
//
// Thirdly, ReshapeNode can also be used to update only the TensorShape. In that case, the MBLayout is kept as is.
// Thirdly, DeprecatedReshapeNode can also be used to update only the TensorShape. In that case, the MBLayout is kept as is.
//
// Note: The new row dimension must be a straight multiple or divisor of the current row dimension.
// To reshape to a non-multiple go to row dim 1 first.

@@ -159,19 +161,20 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// -----------------------------------------------------------------------

template<class ElemType>
class ReshapeNode : public ReinterpretNodeBase<ElemType>
class DeprecatedReshapeNode : public ReinterpretNodeBase<ElemType>
{
typedef ReinterpretNodeBase<ElemType> Base; UsingReinterpretNodeBaseMembers;
static const std::wstring TypeName() { return L"Reshape"; }
static const std::wstring TypeName() { return L"DeprecatedReshape"; }
public:
ReshapeNode(DEVICEID_TYPE deviceId, const wstring & name, size_t numRows = 0, const TensorShape & imageLayout = TensorShape()) :
DeprecatedReshapeNode(DEVICEID_TYPE deviceId, const wstring & name, size_t numRows = 0, const TensorShape & imageLayout = TensorShape()) :
Base(deviceId, name),
m_numTargetRows(numRows),
m_targetImageLayout(imageLayout)
{ }
ReshapeNode(const ScriptableObjects::IConfigRecordPtr configp) :
ReshapeNode(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"numRows"), ImageLayoutWHC(configp->Get(L"imageWidth"), configp->Get(L"imageHeight"), configp->Get(L"imageChannels")))
DeprecatedReshapeNode(const ScriptableObjects::IConfigRecordPtr configp) :
DeprecatedReshapeNode(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"numRows"), ImageDimensions::AsTensorShape(configp->Get(L"imageWidth"), configp->Get(L"imageHeight"), configp->Get(L"imageChannels"), ImageLayoutKind::HWC/*legacy*/))
{
// BUGBUG: We should not operate on image layouts here, but on a proper tensor layout.
AttachInputs(configp, this->GetExpectedNumInputs());
}

@@ -180,7 +183,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Base::CopyTo(nodeP, newName, flags);
if (flags & CopyNodeFlags::copyNodeValue)
{
auto node = dynamic_pointer_cast<ReshapeNode<ElemType>>(nodeP);
auto node = dynamic_pointer_cast<DeprecatedReshapeNode<ElemType>>(nodeP);
node->m_numTargetRows = m_numTargetRows;
node->m_targetImageLayout = m_targetImageLayout;
}

@@ -197,7 +200,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
Base::Load(fstream, modelVersion);
fstream >> m_numTargetRows;
m_targetImageLayout.Load(fstream);
m_targetImageLayout.Load(fstream, /*acceptLegacyFormat=*/true);
}

virtual void /*IComputationNode::*/PrintSelfBeforeValidation() const override

@@ -214,7 +217,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
else
fprintf(stderr, "%ls[%lu, %lu]", child->NodeName().c_str(), child->GetNumRows(), child->GetNumCols());
}
fprintf(stderr, ", NumOfRows=%lu, imageWidth=%lu, imageHeight=%lu, imageChannels=%lu)", m_numTargetRows, m_targetImageLayout.GetWidth(), m_targetImageLayout.GetHeight(), m_targetImageLayout.GetNumChannels());
fprintf(stderr, ", NumOfRows=%lu, imageWidth=%lu, imageHeight=%lu, imageChannels=%lu)", m_numTargetRows, m_targetImageLayout[1], m_targetImageLayout[2], m_targetImageLayout[0]);
// BUGBUG: This interpretation as image dims is only correct for the legacy format, not for cudnn.
}

virtual void /*ComputationNodeBase::*/Validate(bool isFinalValidationPass) override

@@ -247,7 +251,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {

// setting any dimension to 0 means lose the tensor, flatten to vector
// TODO: We can use 0 to indicate "infer". One value can be 0. It will be filled in to match row dim.
if (m_targetImageLayout.GetWidth() == 0 || m_targetImageLayout.GetHeight() == 0 || m_targetImageLayout.GetNumChannels() == 0)
if (m_targetImageLayout[1] == 0 || m_targetImageLayout[2] == 0 || m_targetImageLayout[0] == 0)
{
if (Input(0)->HasSampleLayout())
fprintf(stderr, "WARNING: Reshape operation cannot inherit image size information from its child. Image size info is lost.\n");

@@ -257,7 +261,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
else
{
if (m_numTargetRows != m_targetImageLayout.GetNumElements())
LogicError("ReshapeNode: InferTargetSampleLayout() computed a sample layout [%s] that mismatches m_numTargetRows %d.", string(m_targetImageLayout).c_str(), (int)m_numTargetRows);
LogicError("DeprecatedReshapeNode: InferTargetSampleLayout() computed a sample layout [%s] that mismatches m_numTargetRows %d.", string(m_targetImageLayout).c_str(), (int)m_numTargetRows);
SetDims(m_targetImageLayout, newCols);
}
}

@@ -289,7 +293,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
// going from many samples to one: layout entry will get no flags
if (Input(0)->GetNumTimeSteps() * Input(0)->GetNumRows() / m_numTargetRows != 1)
LogicError("ReshapeNode::BeginForwardProp() faking to remove a nested time dimension only works when going back to a single frame per sequence.");
LogicError("DeprecatedReshapeNode::BeginForwardProp() faking to remove a nested time dimension only works when going back to a single frame per sequence.");
// we are in frame mode now
m_pMBLayout->InitAsFrameMode(Input(0)->GetNumParallelSequences());
}

@@ -297,7 +301,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
// going from one sample to many: layout will get SentenceStart/SentenceEnd flags for the sequence we expand into
if (Input(0)->GetMBLayout()->GetNumTimeSteps() != 1)
LogicError("ReshapeNode::BeginForwardProp() faking to add a nested time dimension only works when coming from a single frame per sequence.");
LogicError("DeprecatedReshapeNode::BeginForwardProp() faking to add a nested time dimension only works when coming from a single frame per sequence.");
m_pMBLayout->Init(Input(0)->GetNumParallelSequences(), Input(0)->GetNumTimeSteps() * Input(0)->GetNumRows() / m_numTargetRows);
for (size_t s = 0; s < m_pMBLayout->GetNumParallelSequences(); s++)
m_pMBLayout->AddSequence(NEW_SEQUENCE_ID, s, 0, m_pMBLayout->GetNumTimeSteps());

@@ -325,7 +329,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// layout case: reshape semantics happens across parallel sequences, i.e. requiring data shuffling
else
{
// TODO: It does not make sense to run ReshapeNode frame-by-frame inside a loop, because it changes the time base.
// TODO: It does not make sense to run DeprecatedReshapeNode frame-by-frame inside a loop, because it changes the time base.
// However, in the future, we should be able to run inside an outer loop.
if (!fr.IsAllFrames())
InvalidArgument("%ls %ls operation cannot be run from inside a loop since it changes the time base.", NodeName().c_str(), OperationName().c_str());

@@ -358,14 +362,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {

virtual bool OutputUsedInComputingInputNodesGradients() const override
{
// The ReshapeNode does not require its output value for computing
// The DeprecatedReshapeNode does not require its output value for computing
// the gradients of its input nodes
return false;
}

virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override
{
// The ReshapeNode does not require any of its inputs' values for computing
// The DeprecatedReshapeNode does not require any of its inputs' values for computing
// the gradients of its input nodes
UNREFERENCED_PARAMETER(childIndex);
return false;

@@ -377,35 +381,39 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t factor() const { return m_numTargetRows > Input(0)->GetNumRows() ? m_numTargetRows / Input(0)->GetNumRows() : Input(0)->GetNumRows() / m_numTargetRows; } // factor by which we stack or unstack
TensorShape m_targetImageLayout;

// this patches up m_targetImageLayout according to some rules
// TODO: Say in one sentence what this logic does.
// This infers dimensions in m_targetImageLayout.
// Users are allowed to provide 2 (out of 3) image dimensions.
// One missing dimension can be inferred. If two dimensions are
// unspecified it throws a runtime error.
// TODO: Generalize this to any number of dimensions.
void InferTargetSampleLayout()
{
if (m_targetImageLayout.GetWidth() > 0)
// BUGBUG: Below is the result of refactoring and only works for rank-3 tensors. Generalize.
if (m_targetImageLayout[1] > 0)
{
if (m_targetImageLayout.GetHeight() > 0)
if (m_targetImageLayout[2] > 0)
{
if (m_targetImageLayout.GetNumChannels() > 0)
if (m_targetImageLayout[0] > 0)
{
if (m_targetImageLayout.GetNumElements() != m_numTargetRows)
RuntimeError("Image dimensions do not match row size.");
}
else
{
if (m_numTargetRows % (m_targetImageLayout.GetWidth() * m_targetImageLayout.GetHeight()) > 0)
if (m_numTargetRows % (m_targetImageLayout[1] * m_targetImageLayout[2]) > 0)
RuntimeError("Image row size is not a multiple of specified image dimensions.");
else
m_targetImageLayout = ImageLayoutWHC(m_targetImageLayout.GetWidth(), m_targetImageLayout.GetHeight(), m_numTargetRows / (m_targetImageLayout.GetWidth() * m_targetImageLayout.GetHeight()));
m_targetImageLayout = TensorShape(m_numTargetRows / (m_targetImageLayout[1] * m_targetImageLayout[2]), m_targetImageLayout[1], m_targetImageLayout[2]);
}
}
else
{
if (m_targetImageLayout.GetNumChannels() > 0)
if (m_targetImageLayout[0] > 0)
{
if (m_numTargetRows % (m_targetImageLayout.GetWidth() * m_targetImageLayout.GetNumChannels()) > 0)
if (m_numTargetRows % (m_targetImageLayout[1] * m_targetImageLayout[0]) > 0)
RuntimeError("Image row size is not a multiple of specified image dimensions.");
else
m_targetImageLayout = ImageLayoutWHC(m_targetImageLayout.GetWidth(), m_numTargetRows / (m_targetImageLayout.GetWidth() * m_targetImageLayout.GetNumChannels()), m_targetImageLayout.GetNumChannels());
m_targetImageLayout = TensorShape(m_targetImageLayout[0], m_targetImageLayout[1], m_numTargetRows / (m_targetImageLayout[1] * m_targetImageLayout[0]));
}
else
{

@@ -415,26 +423,173 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
else
{
if (m_targetImageLayout.GetHeight() > 0)
if (m_targetImageLayout[2] > 0)
{
if (m_targetImageLayout.GetNumChannels() > 0)
if (m_targetImageLayout[0] > 0)
{
if (m_numTargetRows % (m_targetImageLayout.GetHeight() * m_targetImageLayout.GetNumChannels()) > 0)
if (m_numTargetRows % (m_targetImageLayout[2] * m_targetImageLayout[0]) > 0)
RuntimeError("Image row size is not a multiple of specified image dimensions.");
else
m_targetImageLayout = ImageLayoutWHC(m_numTargetRows / (m_targetImageLayout.GetHeight() * m_targetImageLayout.GetNumChannels()), m_targetImageLayout.GetHeight(), m_targetImageLayout.GetNumChannels());
m_targetImageLayout = TensorShape(m_targetImageLayout[0], m_numTargetRows / (m_targetImageLayout[2] * m_targetImageLayout[0]), m_targetImageLayout[2]);
}
else
RuntimeError("At least two image dimensions must be specified.");
}
else if (m_targetImageLayout.GetNumChannels() > 0)
else if (m_targetImageLayout[0] > 0)
RuntimeError("At least two image dimensions must be specified.");
else
m_targetImageLayout = ImageLayoutWHC(m_numTargetRows, 1, 1);
m_targetImageLayout = TensorShape(1, m_numTargetRows, 1);
}
}
};

template class DeprecatedReshapeNode<float>;
template class DeprecatedReshapeNode<double>;
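The case analysis in InferTargetSampleLayout() boils down to one rule: given the total row count and two of the three image dimensions, infer the third, erroring out when the known dimensions do not divide the row count. A hypothetical distilled form (standalone C++, illustrative names only, not the node's code):

    #include <cstddef>
    #include <cstdio>

    // Given numRows and any two of [C, W, H] (0 = unspecified), infer the third.
    // Returns false when two dims are missing or the known dims do not divide numRows.
    bool InferThirdDim(size_t numRows, size_t& c, size_t& w, size_t& h)
    {
        size_t* dims[3] = { &c, &w, &h };
        size_t product = 1; size_t* missing = nullptr;
        for (auto d : dims)
        {
            if (*d == 0) { if (missing) return false; missing = d; } // two unknowns: give up
            else product *= *d;
        }
        if (!missing) return c * w * h == numRows;       // fully specified: just check
        if (numRows % product != 0) return false;        // not a clean multiple
        *missing = numRows / product;                    // fill in the inferred dimension
        return true;
    }

    int main()
    {
        size_t c = 0, w = 2, h = 3;                      // channels left to be inferred
        if (InferThirdDim(12, c, w, h))
            printf("inferred [C x W x H] = [%zu x %zu x %zu]\n", c, w, h); // 2 x 2 x 3
    }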

// -----------------------------------------------------------------------
// Reshape(x, tensorShape, beginDim=0, endDim=0) -- reinterpret input samples as having different tensor dimensions
// - just replaces metadata m_sampleLayout, does not change data values
// - one dimension may be specified as 0 and will be inferred
// - optional beginDim/endDim restrict the replacement to a sub-range of dims, for implementing ReshapeDimension() and FlattenRank()
// - may not be applied to time; use Permute() or Transpose()
//
// Derived operations:
//
// ReshapeDimension(x, dim, tensorShape) = Reshape(x, tensorShape, beginDim=dim, endDim=dim+1)
// - reinterprets one dimension as multiple, where the number of elements remains the same
// - one of the new dimensions may be specified as 0 and will be inferred
//
// FlattenDimensions(x, dim, num) = Reshape(x, 0, beginDim=dim, endDim=dim+num)
// - replace two or more consecutive dims by a single dim with the same number of elements
//
// SplitDimension(x, dim, N) = ReshapeDimension(x, dim, 0:N)
// - splits a dimension in two, injecting the split-off part as a new tensor dimension
// - to split stacked frames into a new time dimension:
// insert new time dim with ReshapeDimension(., -1, 0:1), SplitDimension(., dim, N), Transpose(., dim+1, -1), then Select(., dim+1, 0) away the new time dim
// This would make 4 copies presently. We may need a compound C++ node for now.
// - note: to split into multiple outputs (like tf.split()), use a BrainScript loop with Slice().
// -----------------------------------------------------------------------
|
||||
|
||||
template<class ElemType>
|
||||
class ReshapeNode : public UnaryElementWiseNode<ElemType>
|
||||
{
|
||||
typedef UnaryElementWiseNode<ElemType> Base; UsingUnaryElementwiseNodeBaseMembers;
|
||||
static const std::wstring TypeName() { return L"Reshape"; }
|
||||
public:
|
||||
ReshapeNode(DEVICEID_TYPE deviceId, const wstring & name, const TensorShape & replacementSampleLayout = TensorShape(), int beginDim = 1, int endDim = 0) :
|
||||
Base(deviceId, name),
|
||||
m_replacementSampleLayout(replacementSampleLayout), m_beginDimParameter(beginDim), m_endDimParameter(endDim)
|
||||
{ }
|
||||
ReshapeNode(const ScriptableObjects::IConfigRecordPtr configp) :
|
||||
ReshapeNode(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"shape"), configp->Get(L"beginDim"), configp->Get(L"endDim"))
|
||||
{
|
||||
AttachInputs(configp, this->GetExpectedNumInputs());
|
||||
}
|
||||
|
||||
virtual void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override
|
||||
{
|
||||
Base::CopyTo(nodeP, newName, flags);
|
||||
if (flags & CopyNodeFlags::copyNodeValue)
|
||||
{
|
||||
auto node = dynamic_pointer_cast<ReshapeNode<ElemType>>(nodeP);
|
||||
node->m_replacementSampleLayout = m_replacementSampleLayout;
|
||||
}
|
||||
}
|
||||
|
||||
virtual void Save(File& fstream) const override
|
||||
{
|
||||
Base::Save(fstream);
|
||||
fstream << m_beginDimParameter << m_endDimParameter;
|
||||
m_replacementSampleLayout.Save(fstream);
|
||||
}
|
||||
|
||||
virtual void Load(File& fstream, size_t modelVersion) override
|
||||
{
|
||||
Base::Load(fstream, modelVersion);
|
||||
fstream >> m_beginDimParameter >> m_endDimParameter;
|
||||
m_replacementSampleLayout.Load(fstream);
|
||||
}
|
||||
|
||||
virtual void /*ComputationNodeBase::*/Validate(bool isFinalValidationPass) override
|
||||
{
|
||||
Base::Validate(isFinalValidationPass);
|
||||
|
||||
// BUGBUG: For inputs without MBLayout, the sample layout should include the column dimension, but it does not currently. Needs to be fleshed out.
|
||||
const auto & inputSampleLayout = Input(0)->GetSampleLayout();
|
||||
const auto & inputDims = inputSampleLayout.GetDims();
|
||||
|
||||
auto replacementDims = m_replacementSampleLayout.GetDims();
|
||||
|
||||
size_t beginDim = m_beginDimParameter > 0 ? m_beginDimParameter - 1 : 0;
|
||||
size_t endDim = m_endDimParameter > 0 ? m_endDimParameter - 1 : inputDims.size();
|
||||
if (!isFinalValidationPass) // non-final: be tolerant, no errors
|
||||
{
|
||||
if (endDim > inputDims.size())
|
||||
endDim = inputDims.size();
|
||||
if (beginDim > endDim)
|
||||
beginDim = endDim;
|
||||
}
|
||||
|
||||
// TODO: We should allow to reduce to a 0-length tensor if the dimension is 0
|
||||
|
||||
// if a dimension is specified as zero then infer it, otherwise verify that total #elements matches
|
||||
size_t inputElements = 1; // get #elements in range to be replaced
|
||||
for (size_t k = beginDim; k < endDim; k++)
|
||||
inputElements *= inputDims[k];
|
||||
size_t targetElements = 1; // check/infer #elements to replace with
|
||||
size_t zeroIndex = SIZE_MAX;
|
||||
for (size_t k = 0; k < replacementDims.size(); k++)
|
||||
{
|
||||
if (replacementDims[k] != 0)
|
||||
targetElements *= replacementDims[k];
|
||||
else if (zeroIndex == SIZE_MAX)
|
||||
zeroIndex = k;
|
||||
else
|
||||
InvalidArgument("%ls %ls operation: More than one dimension was specified as zero in the replacement (sub-)dimensions [%s]", NodeName().c_str(), OperationName().c_str(), string(m_replacementSampleLayout).c_str());
|
||||
}
|
||||
if (zeroIndex != SIZE_MAX)
|
||||
replacementDims[zeroIndex] = inputElements / targetElements; // infer the number (ignore errors at this point)
|
||||
|
||||
// assemble actual full dimension vector
|
||||
SmallVector<size_t> dims;
|
||||
dims.append(inputDims.begin(), inputDims.begin() + beginDim);
|
||||
dims.append(replacementDims.begin(), replacementDims.end());
|
||||
dims.append(inputDims.begin() + endDim, inputDims.end());
|
||||
auto sampleLayout = TensorShape(dims);
|
||||
|
||||
// validate total dimension
|
||||
if (isFinalValidationPass && inputSampleLayout.GetNumElements() != sampleLayout.GetNumElements())
|
||||
{
|
||||
auto subShape = TensorShape(std::vector<size_t>(inputDims.begin() + beginDim, inputDims.begin() + endDim));
|
||||
InvalidArgument("%ls %ls operation: Input (sub-)dimensions [%s] incompatible with desired (sub-)dimensions [%s]. Number of elements %s.",
|
||||
NodeName().c_str(), OperationName().c_str(),
|
||||
string(subShape).c_str(), string(m_replacementSampleLayout).c_str(),
|
||||
zeroIndex == SIZE_MAX ? "must be the same" : "is not an integer multiple of the non-0 dimensions");
|
||||
}
|
||||
|
||||
// that's it
|
||||
SetDims(sampleLayout, 0); // BUGBUG: This is incorrect if we have no MBLayout, e.g. reshaping a bias vector into a different tensor dimension
|
||||
}
|
||||
|
||||
virtual void /*ComputationNode::*/ForwardProp(const FrameRange & fr) override
|
||||
{
|
||||
ValueFor(fr).SetValue(Input(0)->ValueFor(fr));
|
||||
}
|
||||
|
||||
virtual void /*ComputationNode::*/BackpropTo(const size_t inputIndex, const FrameRange & fr) override
|
||||
{
|
||||
Input(inputIndex)->GradientFor(fr).SetValue(GradientFor(fr));
|
||||
}
|
||||
|
||||
virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; }
|
||||
virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; }
|
||||
|
||||
private:
|
||||
TensorShape m_replacementSampleLayout; // user-specified dimensions to replace dimensions [beginDim, endDim]
|
||||
int m_beginDimParameter; // 1-based index range as specified
|
||||
int m_endDimParameter;
|
||||
};
|
||||
|
||||
template class ReshapeNode<float>;
|
||||
template class ReshapeNode<double>;
|
||||
|
||||
|
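To make Validate()'s zero-dimension inference concrete, here is a standalone sketch (made-up dimensions; my own illustration of the same arithmetic, not node code): replacing the dims of a [2 x 3 x 4] layout with 0:6 infers the 0 as 24 / 6 = 4.

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

int main()
{
    std::vector<size_t> inputDims = { 2, 3, 4 };    // input sample layout [2 x 3 x 4]
    std::vector<size_t> replacementDims = { 0, 6 }; // one dim given as 0 -> to be inferred
    size_t beginDim = 0, endDim = inputDims.size(); // replace the full range here

    size_t inputElements = 1;                       // #elements in the range being replaced
    for (size_t k = beginDim; k < endDim; k++)
        inputElements *= inputDims[k];

    size_t targetElements = 1, zeroIndex = SIZE_MAX; // product of the non-zero replacement dims
    for (size_t k = 0; k < replacementDims.size(); k++)
    {
        if (replacementDims[k] != 0)
            targetElements *= replacementDims[k];
        else
            zeroIndex = k;                           // (the node rejects more than one zero)
    }
    if (zeroIndex != SIZE_MAX)
        replacementDims[zeroIndex] = inputElements / targetElements; // 24 / 6 = 4
    printf("[%zu x %zu]\n", replacementDims[0], replacementDims[1]); // -> [4 x 6]
    return 0;
}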
@@ -811,4 +966,196 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    template class RowRepeatNode<float>;
    template class RowRepeatNode<double>;

    /*

    notes on tensor operations
    ==========================

    reshaping
    ---------

     - on dimension index 'dim' and 'tensorShape'
        - tensorShape: a vector of dimensions, e.g. 640:480:3:30 could describe a 1-second RGB video of VGA dimensions at 30 fps
        - 'dim' specifies a specific tensor index
           - dim > 0 is a regular sample index. E.g. for a matrix, dim=1 would be the row dimension, and dim=2 in the above example has dimension 480.
           - dim < 0 denotes time indices (recurrent loops); dim=-1 is the innermost time index.
           - dim = 0 denotes the index of the parallel sequence
              - Since all operations logically operate on a single sequence, parallel sequences generally cannot be indexed by the user.
              - Exceptions: training criteria, BatchNormalization, ...WithNegativeSamples (we should not need this)
        - I don't like that 'dim' refers to the index of the dimension as well as the number of elements in that dimension. Axis (numpy)?

     - Reshaping:   --these are all implemented in C++ by DeprecatedReshapeNode
        - Reshape(x, tensorShape, beginDim=0, endDim=0)
           - just replaces metadata m_sampleLayout
           - one dimension may be specified as 0 and will be inferred
           - optional beginDim/endDim restrict the replacement to a sub-range of dims, for implementing ReshapeDimension() and FlattenDimensions()
           - may not be applied to time; use Permute() or Transpose()
        - ReshapeDimension(x, dim, tensorShape) = Reshape(x, tensorShape, beginDim=dim, endDim=dim+1)
           - reinterprets one dimension as multiple, where the number of elements remains the same
           - one of the new dimensions may be specified as 0 and will be inferred
        - FlattenDimensions(x, dim, num) = Reshape(x, 0, beginDim=dim, endDim=dim+num)
           - replaces two or more consecutive dims by a single dim with the same number of elements
        - SplitDimension(x, dim, N) = ReshapeDimension(x, dim, 0:N)
           - splits a dimension into two, moving the second factor into a new tensor dimension
           - to split stacked frames into a new time dimension:
             insert a new time dim with ReshapeDimension(., -1, 0:1), SplitDimension(., dim, N), Transpose(., dim+1, -1), then Select(., dim+1, 0) away the new time dim
             This would make 4 copies presently. We may need a compound C++ node for now.
        - note: to split into multiple outputs (like tf.split()), use a BrainScript loop with Slice().
     - Slicing   --all implemented in C++ by SliceNode
        - Slice(x, dim, begin, end, stride=1, phase=0)
           - reduces a dim to index range [begin,end)
           - negative bounds specify "from end" (end=0 means end if stride>0, and begin=0 means end if stride<0) (a worked sketch of this bound arithmetic follows this notes block)
           - also applies to time, e.g.:
              - pick last frame of a sequence (for s2s): Slice(x, -1, -1, 0)  // first -1 is dim and means the time index
              - trim first and last 3 frames of a sequence: Slice(x, -1, 3, -3)  // 3 means begin at frame 3, -3 means end is 3rd frame from the end
              - this will update the MBLayout
           - the optional stride and phase parameters are for implementing downsampling (stride>1) and reversing (begin=-1, stride=-1)
           - multiple slice operations can be combined by concatenating the spec vector, e.g. Slice(x, dim1:dim2, begin1:begin2, end1:end2)
           - today's RowSlice(begin, num, x) = Slice(x, 1, begin, begin + num)
           - like torch.narrow()
           - can implement TF unpack() and Torch split() as a BrainScript loop with multiple Slice() operations
           - internally implemented by tensor lib opCopy with manipulated m_strides/m_offset
        - Select(x, dim, index) = FlattenDimensions(Slice(x, dim, index, index+1), index > 1 ? index-1 : index, index > 1 ? index : index+1)
           - narrow dim to a single index, then drop the dim. Result will have one dim less.
           - like torch.select()
           - can implement squeezing a dim-1 dim: Select(x, dim, 0)
        - Squeeze(x, dim) = Select(x, dim, 0)
     - Splicing:   --all implemented in C++ by SpliceNode
        - Splice(inputs, dim)
           - splice multiple inputs inputs[0]:inputs[1]:... along given dim (=RowStack for vectors)
           - inputs must have identical dimensions except for:
              - the specified dim
              - broadcasting dimensions (e.g. used to implement Pad())
           - one can splice in time
              - e.g. prepend a vector to a time sequence
              - this will create a new MBLayout
           - like tf.concat()
        - Pack(inputs, dim) = ReshapeDimension(Splice(inputs, dim), dim, (0:Length(inputs)) )
           - like splice but inserts a new dim of dimension Length(inputs)
           - inputs must have identical dimensions for all dims (except for broadcasting)
           - dim can be a time dimension; then a new inner-most time dimension will be inserted
           - like tf.pack()
        - Pad(x, dim, howManyBefore, howManyAfter, with=0) = Splice(Constant(with, tensorShape=1*(dim-1):howManyBefore), x, Constant(with, tensorShape=1*(dim-1):howManyAfter), dim)
           - inverse of slice, pad with a constant value
           - dimensions are specified relative; can pad at start and end
           - in time: pad neighbor frames
        - Repeat(x, dim, numRepeats) = Splice(x*numRepeats, dim)
           - generalizes CNTK RowRepeat(x, numRepeats) = Repeat(x, 1, numRepeats)
           - to repeat multiple, specify vectors, e.g. Repeat(x, dim1:dim2, numRepeats1:numRepeats2)
           - like tf.tile() and Matlab's repmat()
     - Transposition (permuting dims):   --implemented in C++ by PermuteDimensionsNode
        - PermuteDimensionsOf(x, dim1:dim2:...:dimN)
           - dims are rotated to dim2:dim3:...:dimN:dim1; other dims remain untouched
             To rotate the other way round, specify them in opposite order.
             We specify it this way to be able to reference the time dimension without having to know the rank of the m_sampleLayout.
           - time dims must have a constant duration for all items in the minibatch
           - internally implemented with tensor lib by shuffling dimensions with their strides   --TODO: check if TensorShape optimization is still correct
        - Transpose(x, dim1, dim2) = PermuteDimensions(x, dim1:dim2)
           - any two dimensions; including time (must have constant duration)
           - like torch.transpose()
     - Re-indexing:   --implemented by ReindexRankNode and SliceNode
        - ReindexDimension(x, dim, indexVector)
           - splice x[..., indexVector[0], ...], x[..., indexVector[1], ...], etc. with indexVector[.] at given dim
           - indexVector must be invertible if it is intended to backpropagate through this node
        - DownsampleDimension(x, dim, n, phase=0) = Slice(x, dim, 0, 0, stride=n)
           - select every n-th element, starting with index 'phase'
           - time dims allowed. Phase is then a modulus w.r.t. where a sequence is inside the minibatch (may require a ReconcileLayout() before to match layouts)
        - ReverseDimension(x, dim) = Slice(x, dim, -1, 0, stride=-1)
           - reverses the direction of a dim
           - when applied to time dims, this creates a new layout (which is also flipped)

     - misc.:
        - note: much would look more natural if we had OO syntax, e.g. x.Slice(dim, begin, end).FlattenDimensions(...)
          Could be done by exposing all methods on ComputationNode... not currently feasible with BrainScript, but e.g. with Python bindings
        - torch.unfold (dim, size, step)
           - create a convolution matrix (stride magic)
        - CyclicallyPermuteRank(x, dim, step)
           - rotates indices
           - also applies to time dimensions
        - duplicate elements
        - Gather
           - from Torch and TF
        - TF also has:
           - 'gather': reindexing
           - 'dynamic_partition', 'dynamic_stitch'
        - Torch:
           - expand (dim, range): broadcasts dimension 'dim' as a new dimension with 'range'. Not needed I think.
           - repeatTensor: like tile but with weird reshaping
           - squeeze: removes all singleton dimensions, or a specific one. We can remove a specific one with Select().
        - TODO:
           - give names to dimensions?
           - do we want to allow time offsets in layouts?

    reductions
    ----------

     - ReduceSum
        - sum over all elements of a dimension, or over time
     - ReduceMax
        - max
     - ReduceMean
        - average
     - ArgMax, ArgMin
        - we already have that somewhere, for evaluation
     - All, Any
        - logical test   --must be done over sequences
     - TF also has:
        - reduce_prod, reduce_min
        - segment_sum etc.; we use sequences
        - listdiff
        - where: indices of 'true' values -> 2D tensor of coordinates
        - unique (1D only)
        - edit_distance
        - invert_permutation: invert a permutation index vector
        - top_k

    convolutions
    ------------

     - convolution
        - convolution with filter
     - max pool (=convolution with weights 1 and max reduction)
     - av pool (=convolution with uniform filter)
     - also in time: by specifying more filter dimensions [TODO]
     - tricky bit: boundaries; may need expansion or reduction of sequences

    element-wise operations
    -----------------------

     - PlusNode, MinusNode, ElementTimes
        - with broadcasting, these implement:
           - PlusNode with bias, PlusNode for images
           - 1-x
           - ScaleNode, RowElementTimes, ColumnElementTimes
     - elementwise nonlinearities as usual [TODO: complete them]
     - logical ops (can be done by comparison ops actually)
     - Clamp
        - bounds are passed as 'Const'
     - TF: in_top_k
     - Torch performs these ops (e.g. add) as vector ops, without broadcasting
        - e.g. max reduces, while cmax does not. Our solution is better... really? How to specify reduce?

    gradient operations
    -------------------

     - TF: these are nodes, e.g. clip_by_value
        - inputs should be parameters as well, so they can be computed
        - need a node to stop gradient propagation?
        - can we use nodes to specify things like AdaGrad and momentum?

    debugging
    ---------

     - node that prints activations
     - node that prints mean/var of gradients

    other
    -----

     - per-node learning rate: can we specify an additional parameter for each node? Maybe fold with updateLearnableParameter?
     - give dimensions a name?
     - can we interleave variable-length ones? Concat into a single dimension, using strides?

    */

}}}
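To make the Slice() bound conventions in the notes above concrete, a small standalone sketch (my own illustration, not SliceNode code) of how negative begin/end resolve against a dimension of length 10, following the "from end" rule:

#include <cstdio>

// resolve a possibly-negative slice bound against a dimension of length dim
// (end = 0 means "up to the end" when stride > 0, per the notes above)
static int ResolveBound(int bound, int dim, bool isEnd)
{
    if (bound < 0)
        return bound + dim;  // "from end"
    if (isEnd && bound == 0)
        return dim;          // end=0 -> dim (for stride > 0)
    return bound;
}

int main()
{
    int dim = 10;
    // trim first and last 3 frames: Slice(x, -1, 3, -3) -> index range [3, 7)
    printf("[%d, %d)\n", ResolveBound(3, dim, false), ResolveBound(-3, dim, true));
    // pick the last frame: Slice(x, -1, -1, 0) -> index range [9, 10)
    printf("[%d, %d)\n", ResolveBound(-1, dim, false), ResolveBound(0, dim, true));
    return 0;
}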
@@ -1367,6 +1367,16 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        RequestMatrixFromPool(m_softmaxOfRight, matrixPool);
        RequestMatrixFromPool(m_gammaFromLattice, matrixPool);
    }

    // Release gradient and temp matrices that are no longer needed after all the children's gradients are computed.
    virtual void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool)
    {
        Base::ReleaseMatricesAfterBackprop(matrixPool);
        ReleaseMatrixToPool(m_logSoftmaxOfRight, matrixPool);
        ReleaseMatrixToPool(m_softmaxOfRight, matrixPool);
        ReleaseMatrixToPool(m_gammaFromLattice, matrixPool);
    }

    // TODO: method names should be CamelCase
    std::vector<shared_ptr<const msra::dbn::latticepair>> * getLatticePtr()
    {
@@ -39,7 +39,7 @@
MATH_API DEVICEID_TYPE EnforceOneGPUOnly(DEVICEID_TYPE requestedDeviceId);

namespace Microsoft { namespace MSR { namespace CNTK {

    // -----------------------------------------------------------------------
    // ElementWiseOperator -- This enum represents which function to apply.
@@ -48,41 +48,52 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    enum ElementWiseOperator
    {
        // nullary
        opConstOne,
        // unary (or binary with constant parameter)
        opCopy,
        opNegate, opNot,
        opAbs,
        opSigmoid, opSigmoidDerivative, opTanh, opSqrt, opExp, opLog, opLinearRectifierDerivative, opCosine, opNegativeSine,
        // these are not implemented yet:
        opSaturateBetaAlpha, opSumAlpha, opSubDifferenceToAlpha, opSubDifferenceFromAlpha,
        opSigmoid, opTanh, opSqrt, opExp, opLog, opLinearRectifier, opCosine,
        // unary ops for use by Matrix class only (there is no TensorView implementation)
        opSigmoidDerivative, opLinearRectifierDerivative, opNegativeSine,
        // binary
        opSum, opDifference, opElementwiseProduct, opElementwiseQuotient,
        opLogSum, opMax, opMin,
        opEQ, opNE, opGT, opLT, opGE, opLE,
        opAnd, opOr, opXor,
        opMaskNegative,
        opElementwiseProductWithSigmoidDerivativeFromOutput, opElementwiseProductWithTanhDerivativeFromOutput,
        opElementwiseProductWithLinearRectifierDerivativeFromOutput, opElementwiseProductWithLogDerivativeFromOutput, opElementwiseProductWithCosDerivative,
        // binary ops for indexing
        //opIndex,
        // ternary
        opCond
        // Note: not all of the above are actually implemented at present; and not all that's implemented has an opcode.
        opCond/*a ? b : c*/, opClip/*clip a within interval b..c*/
        // Note: not all that's implemented in CNTK ComputationNodes has an opcode yet.
    };

    // helper to apply a C macro for all operations of each kind
#define ForAllNullaryOps(Macro) \
    Macro(ConstOne);

#define ForAllUnaryOps(Macro) \
    Macro(Copy); \
    Macro(Negate); Macro(Not); \
    Macro(Abs); \
    Macro(Sigmoid); Macro(SigmoidDerivative); Macro(Tanh); Macro(Sqrt); Macro(Exp); Macro(Log); Macro(LinearRectifierDerivative); Macro(Cosine); Macro(NegativeSine);

#define ForAllParameterizedUnaryOps(Macro) \
    Macro(SaturateBetaAlpha); Macro(SumAlpha); Macro(SubDifferenceToAlpha); Macro(SubDifferenceFromAlpha);
    Macro(Sigmoid); Macro(Tanh); Macro(Sqrt); Macro(Exp); Macro(Log); Macro(LinearRectifier); Macro(Cosine);

#define ForAllBinaryOps(Macro) \
    Macro(Sum); Macro(Difference); Macro(ElementwiseProduct); Macro(ElementwiseQuotient); \
    Macro(LogSum); Macro(Max); Macro(Min); \
    Macro(EQ); Macro(NE); Macro(GT); Macro(LT); Macro(GE); Macro(LE); \
    Macro(MaskNegative);
    Macro(And); Macro(Or); Macro(Xor); \
    Macro(MaskNegative); \
    Macro(ElementwiseProductWithSigmoidDerivativeFromOutput); Macro(ElementwiseProductWithTanhDerivativeFromOutput); \
    Macro(ElementwiseProductWithLinearRectifierDerivativeFromOutput); Macro(ElementwiseProductWithLogDerivativeFromOutput); Macro(ElementwiseProductWithCosDerivative); \
    //Macro(Index);

#define ForAllTernaryOps(Macro) \
    Macro(Cond);
    Macro(Cond); Macro(Clip);

    // -----------------------------------------------------------------------
    // various enums to describe
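For readers unfamiliar with this idiom, a tiny self-contained sketch (hypothetical Op functions; mirrors the CaseUnaryTensorOp pattern used by the tensor kernels later in this change) of how the ForAll...Ops macros expand into switch cases:

#include <cstdio>

enum Op { opCopy, opNegate };
static double OpCopy(double a)   { return a; }
static double OpNegate(double a) { return -a; }

#define ForAllDemoOps(Macro) Macro(Copy); Macro(Negate);
#define CaseDemoOp(oper) case op ## oper: return Op ## oper(a)

static double Compute(double a, Op op)
{
    switch (op)
    {
    ForAllDemoOps(CaseDemoOp); // expands to: case opCopy: return OpCopy(a); case opNegate: return OpNegate(a);
    default: return 0;
    }
}

int main() { printf("%g\n", Compute(3.0, opNegate)); return 0; } // prints -3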
@@ -51,6 +51,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    // GPU and 1-dimensional image
    bool gpuSparse1D = (inT.h() == 1 &&
                        in.GetCurrentMatrixLocation() == CurrentDataLocation::GPU &&
                        convDesc.wStride() == 1 &&
                        !convDesc.padding() &&
                        in.GetMatrixType() == MatrixType::SPARSE);

    out.SwitchToMatrixType(MatrixType::DENSE, MatrixFormat::matrixFormatDense, false);

@@ -67,8 +69,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        size_t startSampleId = i * subBatchSize;
        size_t endSampleId = min(batchSize, startSampleId + subBatchSize);
        size_t smallBatchSize = endSampleId - startSampleId;

        workspace.Resize(packedInputRows, packedInputColsPerSample * smallBatchSize);
        Mat inputSubBatch;

        // We optimize for three different scenarios here by handling them slightly differently.
@@ -78,10 +78,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        if (in.GetMatrixType() == MatrixType::DENSE)
            inputSubBatch = in.ColumnSlice(startSampleId, smallBatchSize);
        else
        {
            inputSubBatch.SetValue(in.ColumnSlice(startSampleId, smallBatchSize), in.GetFormat());
            inputSubBatch.SwitchToMatrixType(MatrixType::DENSE, MatrixFormat::matrixFormatDense, true);
        }

        if (gpuSparse1D)
        {
@@ -94,6 +91,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        }
        else
        {
            inputSubBatch.SwitchToMatrixType(MatrixType::DENSE, MatrixFormat::matrixFormatDense, true);
            workspace.AssignPackedConvolutionInput(inputSubBatch,
                                                   inT.w(), inT.h(), inT.c(),
                                                   outT.w(), outT.h(), outT.c(),
@@ -101,6 +99,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                                                   convDesc.padding());

            Mat outputSubBatch = out.ColumnSlice(outputSizePerChannel * startSampleId, outputSizePerChannel * smallBatchSize);

            workspace.Resize(packedInputRows, packedInputColsPerSample * smallBatchSize);
            Mat::Multiply(filter, false, workspace, false, outputSubBatch);
        }
    }
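The sub-batching arithmetic above is worth spelling out once (a standalone sketch with made-up sizes, not engine code): with batchSize = 100 and subBatchSize = 32, the loop visits [0,32), [32,64), [64,96), [96,100), where min() clamps the final slice.

#include <algorithm>
#include <cstddef>
#include <cstdio>

int main()
{
    size_t batchSize = 100, subBatchSize = 32; // made-up sizes
    size_t numSubBatches = (batchSize + subBatchSize - 1) / subBatchSize;
    for (size_t i = 0; i < numSubBatches; i++)
    {
        size_t startSampleId = i * subBatchSize;
        size_t endSampleId = std::min(batchSize, startSampleId + subBatchSize);
        printf("[%zu, %zu) -> %zu samples\n", startSampleId, endSampleId, endSampleId - startSampleId);
    }
    return 0;
}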
@@ -197,6 +197,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    // GPU and 1-dimensional image
    bool gpuSparse1D = (inT.h() == 1 &&
                        in.GetCurrentMatrixLocation() == CurrentDataLocation::GPU &&
                        convDesc.wStride() == 1 &&
                        !convDesc.padding() &&
                        in.GetMatrixType() == MatrixType::SPARSE);

    if (numSubBatches == 1 && allowReuse && !gpuSparse1D) // reuse packed input from evaluation step if it's not changed by either subbatch or recurrent steps.
@@ -209,18 +211,40 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        size_t startSampleID = i * subBatchSize;
        size_t endSampleID = min(batchSize, startSampleID + subBatchSize);
        size_t smallBatchSize = endSampleID - startSampleID;

        workspace.Resize(packedInputRows, packedInputColsPerSample * smallBatchSize);
        Matrix<ElemType> inputSubBatch = in.ColumnSlice(startSampleID, smallBatchSize);
        inputSubBatch.SwitchToMatrixType(MatrixType::DENSE, inputSubBatch.GetFormat(), true);
        workspace.AssignPackedConvolutionInput(inputSubBatch,
                                               inT.w(), inT.h(), inT.c(),
                                               srcGradT.w(), srcGradT.h(), srcGradT.c(),
                                               filterT.w(), filterT.h(), convDesc.wStride(), convDesc.hStride(),
                                               convDesc.padding());

        Matrix<ElemType> outputGradientSubBatch = srcGradTmp.ColumnSlice(startSampleID * outputSizePerChannel, smallBatchSize * outputSizePerChannel);
        Matrix<ElemType>::MultiplyAndAdd(outputGradientSubBatch, false, workspace, true, filter);

        // We optimize for three different scenarios here by handling them slightly differently.
        // [Scenario 1] Dense: Unroll using AssignPackedConvolutionInput and multiply.
        // [Scenario 2] Sparse 1-D convolution on GPU: for text scenarios we have a specific kernel.
        // [Scenario 3] Sparse all others: convert to dense. Temporary work-around - allocating/de-allocating memory is costly!
        if (gpuSparse1D)
        {
            Matrix<ElemType> inputSubBatch;
            inputSubBatch.SetValue(in.ColumnSlice(startSampleID, smallBatchSize));
            inputSubBatch.Reshape(inT.c(), smallBatchSize * inT.w());
            Matrix<ElemType> inputSubBatchSparseReordered(inputSubBatch.GetNumCols(), inputSubBatch.GetNumRows(), inputSubBatch.GetDeviceId(), MatrixType::SPARSE, MatrixFormat::matrixFormatSparseCSC);
            Matrix<ElemType>::TensorShuffleScaleAndAdd(0.0f, inputSubBatch.Transpose(), 1, inT.w(), 1, smallBatchSize, inT.c(), 1.0f, inputSubBatchSparseReordered, inputSubBatchSparseReordered);

            Matrix<ElemType> outputGradientSubBatchReordered = Matrix<ElemType>::Zeros(smallBatchSize * srcGradT.w(), srcGradT.c(), outputGradientSubBatch.GetDeviceId());
            Matrix<ElemType>::TensorShuffleScaleAndAdd(0.0f, outputGradientSubBatch.Transpose(), 1, srcGradT.w(), 1, smallBatchSize, srcGradT.c(), 1.0f, outputGradientSubBatchReordered, outputGradientSubBatchReordered);

            filter.Reshape(srcGradT.c() * filterT.w(), inT.c());
            Matrix<ElemType>::ConvolveAndWeightedAdd(1, outputGradientSubBatchReordered, true, inputSubBatchSparseReordered, false, 1, filter, smallBatchSize, convDesc.wStride(), convDesc.padding(), false);
            filter.Reshape(srcGradT.c(), inT.c() * filterT.w());
        }
        else
        {
            workspace.Resize(packedInputRows, packedInputColsPerSample * smallBatchSize);
            Matrix<ElemType> inputSubBatch = in.ColumnSlice(startSampleID, smallBatchSize);
            inputSubBatch.SwitchToMatrixType(MatrixType::DENSE, inputSubBatch.GetFormat(), true);
            workspace.AssignPackedConvolutionInput(inputSubBatch,
                                                   inT.w(), inT.h(), inT.c(),
                                                   srcGradT.w(), srcGradT.h(), srcGradT.c(),
                                                   filterT.w(), filterT.h(), convDesc.wStride(), convDesc.hStride(),
                                                   convDesc.padding());

            Matrix<ElemType>::MultiplyAndAdd(outputGradientSubBatch, false, workspace, true, filter);
        }
    }
}
@@ -239,7 +263,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    assert(outT.w() * outT.h() * outT.c() == out.GetNumRows());
    assert(outT.n() == out.GetNumCols());

    Mat o = out.ColumnSlice(0, out.GetNumCols());
    Mat o = out.ColumnSlice(0, out.GetNumCols()); // same as .AsReference()
    Mat d = dst.Reshaped(biasT.c(), outT.w() * outT.h() * outT.n());
    d.AssignSumOf(o.Reshaped(biasT.c(), outT.w() * outT.h() * outT.n()), bias);
}
@@ -410,23 +434,30 @@ namespace Microsoft { namespace MSR { namespace CNTK {
};

template<class ElemType>
std::unique_ptr<ConvolutionEngineFactory<ElemType>> ConvolutionEngineFactory<ElemType>::Create(DEVICEID_TYPE deviceId, EngineType engType)
std::unique_ptr<ConvolutionEngineFactory<ElemType>> ConvolutionEngineFactory<ElemType>::Create(DEVICEID_TYPE deviceId, EngineType engType, ImageLayoutKind imageLayoutKind)
{
    if (engType == EngineType::Auto)
    {
        // REVIEW alexeyk: make cuDNN default when running on GPU and compiled with cuDNN, add config parameter to enable runtime switch between implementations.
        if (deviceId >= 0 && CuDnnConvolutionEngineFactory<ElemType>::IsSupported(deviceId))
            return std::make_unique<CuDnnConvolutionEngineFactory<ElemType>>();
        return std::make_unique<DefaultConvolutionEngineFactory<ElemType>>();
        if (deviceId >= 0 && CuDnnConvolutionEngineFactory<ElemType>::IsSupported(deviceId) && imageLayoutKind == ImageLayoutKind::CHW)
            return Create(deviceId, EngineType::CuDnn, imageLayoutKind);
        else
            return Create(deviceId, EngineType::Legacy, imageLayoutKind);
    }
    else if (engType == EngineType::CuDnn)
    {
        if (imageLayoutKind != ImageLayoutKind::CHW)
            InvalidArgument("ConvolutionEngineFactory: ImageLayout '%s' is not compatible with the cuDNN engine.", ToString(imageLayoutKind).c_str());
        if (deviceId >= 0 && CuDnnConvolutionEngineFactory<ElemType>::IsSupported(deviceId))
            return std::make_unique<CuDnnConvolutionEngineFactory<ElemType>>();
        RuntimeError("cuDNN convolution engine is not supported, check the device id and whether the code was compiled with cuDNN.");
    }
    else if (engType == EngineType::Legacy)
    {
        if (imageLayoutKind != ImageLayoutKind::HWC)
            InvalidArgument("ConvolutionEngineFactory: ImageLayout '%s' is not compatible with the legacy convolution engine.", ToString(imageLayoutKind).c_str());
        return std::make_unique<DefaultConvolutionEngineFactory<ElemType>>();
    }

    RuntimeError("Not supported convolution engine type: %d.", engType);
}
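A hedged usage sketch of the selection logic above (illustrative only; assumes the surrounding CNTK types are in scope): Auto resolves to the cuDNN engine only on a capable GPU with CHW layout and otherwise falls back to the legacy HWC engine, while an explicit engine type fails fast on a layout mismatch.

// illustrative only -- relies on ConvolutionEngineFactory as declared in this change
std::unique_ptr<ConvolutionEngineFactory<float>> MakeConvFactory(DEVICEID_TYPE deviceId, ImageLayoutKind layout)
{
    // Auto: cuDNN for GPU + CHW, else legacy; the CuDnn/Legacy choices instead InvalidArgument on a wrong layout
    return ConvolutionEngineFactory<float>::Create(deviceId, ConvolutionEngineFactory<float>::EngineType::Auto, layout);
}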
@@ -18,6 +18,7 @@
#endif

#include "Matrix.h"
#include "TensorShape.h" // for ImageLayoutKind

namespace Microsoft { namespace MSR { namespace CNTK {
@@ -252,7 +253,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    virtual PoolEnginePtr CreatePoolEngine(DEVICEID_TYPE deviceId) = 0;

    enum class EngineType { Auto, CuDnn, Legacy };
    static std::unique_ptr<ConvolutionEngineFactory<ElemType>> Create(DEVICEID_TYPE deviceId, EngineType engType = EngineType::Auto);
    static std::unique_ptr<ConvolutionEngineFactory<ElemType>> Create(DEVICEID_TYPE deviceId, EngineType engType, ImageLayoutKind imageLayoutKind);

public:
    ConvolutionEngineFactory(const ConvolutionEngineFactory&) = delete;
@@ -10,11 +10,7 @@
#ifdef USE_CUDNN
#include <cudnn.h>

template<> const char* CudaErrString(cudnnStatus_t x)
{
    return cudnnGetErrorString(x);
}
#define CUDNN_CALL(expr) (CudaCall((expr), #expr, "cuDNN", CUDNN_STATUS_SUCCESS))
template<> const char* CudaErrString<cudnnStatus_t>(cudnnStatus_t x) { return cudnnGetErrorString(x); }

// A note on the formats: CNTK originally used NHWC for input/output tensors and CHWN for filters.
// Such formats have very limited support in cuDNN and are not used in other frameworks.
@@ -5,25 +5,27 @@
//

#include "stdafx.h"
#include "Basics.h"
#include "BestGpu.h"
#include "DebugUtil.h"

#ifndef CPUONLY

#include "cublas_v2.h"
#include "Basics.h"
#include "GPUMatrix.h"
#include "GPUMatrixCUDAKernels.cuh"
#include "GPUSparseMatrix.h"
#include "GPUTensor.h"
#include "CommonMatrix.h"
#define TENSOR_OPS_DECL __device__ __host__
#include "TensorOps.h"
#include "device_launch_parameters.h"
#include <assert.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <curand.h>
#include <curand_kernel.h>
#include "cublas_v2.h"
#include <assert.h>
#include <memory>

#pragma comment (lib, "cudart.lib") // instruct linker to reference these libs
#pragma comment (lib, "cublas.lib")
@@ -47,8 +49,6 @@ bool do_sync = true;
#ifdef _WIN32
// thread local storage to access the current stream, initialize to default stream
__declspec (thread)
#else
static
#endif
cudaStream_t t_stream = cudaStreamDefault;
@@ -78,9 +78,9 @@ cudaStream_t MATH_API GetStream()
        performElementWiseFunction(ElementWiseOperator::op##f, a.m_pArray); \
        return *this; }

static const char * CudaErrString(cudaError_t x) { cudaDeviceSynchronize(); return cudaGetErrorString(x); }
static const char * CudaErrString(cublasStatus_t) { cudaDeviceSynchronize(); return "(see cublas_api.h & look for cublasStatus_t or CUBLAS_STATUS_xxx)"; }
static const char * CudaErrString(curandStatus) { cudaDeviceSynchronize(); return "(see curand.h & look for curandStatus or CURAND_STATUS_xxx)"; }
template<> const char * CudaErrString<cudaError_t>(cudaError_t x) { cudaDeviceSynchronize(); return cudaGetErrorString(x); }
template<> const char * CudaErrString<cublasStatus_t>(cublasStatus_t) { cudaDeviceSynchronize(); return "(see cublas_api.h & look for cublasStatus_t or CUBLAS_STATUS_xxx)"; }
template<> const char * CudaErrString<curandStatus>(curandStatus) { cudaDeviceSynchronize(); return "(see curand.h & look for curandStatus or CURAND_STATUS_xxx)"; }

namespace Microsoft { namespace MSR { namespace CNTK {
@@ -384,7 +384,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
#pragma region Constructors and Destructor

    //should only be used by constructors.
    // should only be used by constructors
    template<class ElemType>
    void GPUMatrix<ElemType>::ZeroInit(int deviceId)
    {
@@ -449,13 +449,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        m_numRows = moveFrom.m_numRows;
        m_numCols = moveFrom.m_numCols;
        m_computeDevice = moveFrom.m_computeDevice;
        m_pArray = moveFrom.m_pArray; //shallow copy the pointer
        m_pArray = moveFrom.m_pArray; // shallow copy the pointer
        m_matrixName=moveFrom.m_matrixName;
        m_elemSizeAllocated = moveFrom.m_elemSizeAllocated;
        m_format = moveFrom.m_format;
        m_externalBuffer = moveFrom.m_externalBuffer;

        //release the pointer from the source object so that the destructor won't release it twice
        // release the pointer from the source object so that the destructor won't release it twice
        moveFrom.ZeroInit(0);
    }
@@ -477,10 +477,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    {
        if (this != &moveFrom)
        {
            if (OwnBuffer() && m_pArray!=NULL)
            {
            if (OwnBuffer() && m_pArray)
                CUDA_CALL(cudaFree(m_pArray));
            }

            m_numRows = moveFrom.m_numRows;
            m_numCols = moveFrom.m_numCols;
@@ -500,8 +498,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    GPUMatrix<ElemType>::~GPUMatrix(void)
    {
        Clear();
        if (m_workspace != nullptr)
            delete m_workspace;
        delete m_workspace;
    }

    template<class ElemType>
@@ -3259,6 +3256,16 @@ namespace Microsoft { namespace MSR { namespace CNTK {
#pragma endregion Other helper functions

#pragma region Static BLAS Functions
    // float/double overloads of cublasSgemm()/cublasDgemm()
    static cublasStatus_t cublas_gemm(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const float *alpha, const float *A, int lda, const float *B, int ldb, const float *beta, float *C, int ldc)
    {
        return cublasSgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
    }
    static cublasStatus_t cublas_gemm(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const double *alpha, const double *A, int lda, const double *B, int ldb, const double *beta, double *C, int ldc)
    {
        return cublasDgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
    }

    template<class ElemType>
    void GPUMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const GPUMatrix<ElemType>& a, const bool transposeA, const GPUMatrix<ElemType>& b, const bool transposeB,
                                                     ElemType beta, GPUMatrix<ElemType>& c)
@@ -3278,28 +3285,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        if (beta == 0)
            c.Resize(m,n);
        else
            c.VerifySize(m, n); // Can't resize if beta != 0

        if (!(m>0 && k>0 && l>0 && n>0))
        {
            RuntimeError("!(m>0 && k>0 && l>0 && n>0)"); // converting from size_t to int may cause overflow
        }
        if (k!=l)
        {
            RuntimeError("matrix dim mismatch in MultiplyAndWeightedAdd");
        }
        if (sizeof(ElemType)==sizeof(float))
        {
            CUBLAS_CALL(cublasSgemm(cuHandle,transA,transB,m,n,k,reinterpret_cast<float*>(&alpha),reinterpret_cast<float*>(a.m_pArray),(int)a.m_numRows,reinterpret_cast<float*>(b.m_pArray),(int)b.m_numRows,reinterpret_cast<float*>(&beta),reinterpret_cast<float*>(c.m_pArray),(int)c.m_numRows));
        }
        else if (sizeof(ElemType)==sizeof(double))
        {
            CUBLAS_CALL(cublasDgemm(cuHandle,transA,transB,m,n,k,reinterpret_cast<double*>(&alpha),reinterpret_cast<double*>(a.m_pArray),(int)a.m_numRows,reinterpret_cast<double*>(b.m_pArray),(int)b.m_numRows,reinterpret_cast<double*>(&beta),reinterpret_cast<double*>(c.m_pArray),(int)c.m_numRows));
        }
        else
        {
            RuntimeError("Unsupported template argument in GPUMatrix");
        }
        CUBLAS_CALL(cublas_gemm(cuHandle, transA, transB, m, n, k, &alpha, a.m_pArray, (int)a.m_numRows, b.m_pArray, (int)b.m_numRows, &beta, c.m_pArray, (int)c.m_numRows));
        c.m_numRows=m;
        c.m_numCols=n;
    }
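The same overload trick extends to other BLAS entry points; a sketch of my own (not part of this change), following the cublas_gemm() pattern above, for cublasSaxpy()/cublasDaxpy():

    // float/double overloads of cublasSaxpy()/cublasDaxpy(), in the style of cublas_gemm() above
    static cublasStatus_t cublas_axpy(cublasHandle_t handle, int n, const float *alpha, const float *x, int incx, float *y, int incy)
    {
        return cublasSaxpy(handle, n, alpha, x, incx, y, incy);
    }
    static cublasStatus_t cublas_axpy(cublasHandle_t handle, int n, const double *alpha, const double *x, int incx, double *y, int incy)
    {
        return cublasDaxpy(handle, n, alpha, x, incx, y, incy);
    }
    // usage inside a template<class ElemType> member: CUBLAS_CALL(cublas_axpy(cuHandle, n, &alpha, a.m_pArray, 1, c.m_pArray, 1));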
@@ -4436,396 +4428,29 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        CUDA_CALL(cudaFree(d_zeta));
    };

    // =======================================================================
    // TensorView support
    // =======================================================================

    // BUGBUG: This is a stub that currently is just the CPU code. This is not functional yet.

    // To save time, this makes extensive use of templates and macros.

    // -----------------------------------------------------------------------
    // simple fixed-size arrays for passing dimension information by value
    // since CUDA can't just take our std::array and std::vector
    // -----------------------------------------------------------------------

    template<typename T, size_t N>
    struct FixedArray
    {
        T m_data[N];
        __device__ __host__ size_t size() const { return N; }
        __device__ __host__ T & operator[](size_t n) { return m_data[n]; }
        __device__ __host__ T operator[](size_t n) const { return m_data[n]; }
        template<class VEC> FixedArray(const VEC & data) // construct from CPU-side STL array or vector
        {
            assert(data.size() == N);
            for (size_t n = 0; n < N; n++)
            {
                m_data[n] = (T)data[n];
                if (m_data[n] != data[n]) // overflow check
                    InvalidArgument("FixedArray: Dimensions out of range, too few bits.");
            }
        }
    };
    template<typename T> // specialized version for 0 elements
    struct FixedArray<T, 0>
    {
        __device__ __host__ size_t size() const { return 0; }
        template<class VEC> FixedArray(const VEC & data) { assert(data.size() == 0); UNUSED(data); }
    };

    template<typename T, size_t N, size_t K> // N = which input/output; K = index depth
    struct FixedMatrix
    {
        T m_data[N][K];
        __device__ __host__ size_t getNumRows() const { return N; }
        __device__ __host__ size_t getNumCols() const { return K; }
        __device__ __host__ T & operator()(size_t n, size_t k) { return m_data[n][k]; }
        __device__ __host__ T operator()(size_t n, size_t k) const { return m_data[n][k]; }
        template<typename U> FixedMatrix(const array<SmallVector<U>, N> & data) // construct from CPU-side array of vectors
        {
            assert(data.size() == N);
            for (size_t n = 0; n < N; n++)
            {
                assert(data[n].size() == K);
                for (size_t k = 0; k < K; k++)
                {
                    m_data[n][k] = (T)data[n][k];
                    if (m_data[n][k] != data[n][k]) // overflow check
                        InvalidArgument("FixedMatrix: Dimensions out of range, too few bits.");
                }
            }
        }
    };
    template<typename T, size_t N> // specialized version for 0 elements
    struct FixedMatrix<T, N, 0>
    {
        __device__ __host__ size_t getNumRows() const { return N; }
        __device__ __host__ size_t getNumCols() const { return 0; }
        template<typename U> FixedMatrix(const array<SmallVector<U>, N> & data) { assert(data.size() == N); for (size_t n = 0; n < N; n++) assert(data[n].size() == 0); UNUSED(data); }
    };
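A small usage sketch (my own, with made-up dimensions) of the checked converting constructors above, which let host-side STL containers be passed into kernels by value:

    // illustrative only: host-side containers convert into by-value kernel arguments
    static void FixedArrayUsageExample()
    {
        std::vector<size_t> dims = { 640, 480, 3 };  // e.g. a VGA RGB tensor (made-up)
        FixedArray<unsigned int, 3> opDims(dims);    // element-wise copy with overflow check
        assert(opDims.size() == 3 && opDims[1] == 480);
        // FixedArray<unsigned int, 2> bad(dims);    // would trip the assert: size mismatch
    }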
    // -----------------------------------------------------------------------
    // TensorView entry points from Matrix.cpp
    // -----------------------------------------------------------------------

    // helper to provide a vector of ones of at least the given number of elements
    // TODO: Use this to implement ComputationNode::ConstOnes? Or do we even need that anymore?
    template<class ElemType>
    static shared_ptr<GPUMatrix<ElemType>> GetOnesVector(size_t N, DEVICEID_TYPE deviceId)
    {
        // using an array of shared_ptrs because those are thread-safe. The objects themselves are immutable.
        // And using a plain array so this will never get freed, avoiding free-after-DLL-unload issues.
        static shared_ptr<GPUMatrix<ElemType>> onesCache[32]; // cache of objects
        if (deviceId >= _countof(onesCache))
            LogicError("GetOnesVector: onesCache[] too small (%d entries), increase (you need %d) and recompile.", (int)_countof(onesCache), (int)deviceId+1);
        auto p = onesCache[deviceId];
        if (!p || p->GetNumRows() < N) // must (re-)allocate
        {
            p = make_shared<GPUMatrix<ElemType>>(GPUMatrix<ElemType>::Ones(N, 1, deviceId));
            onesCache[deviceId] = p; // this will replace the pointer thread-safely (although weird race conditions may happen where a larger entry is overwritten by a smaller one; will still run correctly)
        }
        return p;
    }

    // -----------------------------------------------------------------------
    // function to actually compute a function of (N-1) inputs based on the opcode
    // -----------------------------------------------------------------------

    template<class ElemType>
    struct TensorOps
    {
        static __device__ ElemType Compute(const FixedArray<ElemType*, 2> & pointers, ElementWiseOperator op)
        {
            ElemType a = *(pointers[0]);
#define CaseUnaryTensorOp(oper) case ElementWiseOperator::op ## oper: return Op ## oper(a)
            switch (op)
            {
            ForAllUnaryOps(CaseUnaryTensorOp);
            default: return 0; // (failure)
            }
        }
        static __device__ ElemType Compute(const FixedArray<ElemType*, 3> & pointers, ElementWiseOperator op)
        {
            ElemType a = *(pointers[0]);
            ElemType b = *(pointers[1]);
#define CaseBinaryTensorOp(oper) case ElementWiseOperator::op ## oper: return Op ## oper(a,b)
            switch (op)
            {
            ForAllBinaryOps(CaseBinaryTensorOp); // note: this costs about 6% compared to having only a single case
            default: return 0; // (failure)
            }
        }
        static __device__ ElemType Compute(const FixedArray<ElemType*, 4> & pointers, ElementWiseOperator op)
        {
            ElemType a = *(pointers[0]);
            ElemType b = *(pointers[1]);
            ElemType c = *(pointers[2]);
#define CaseTernaryTensorOp(oper) case ElementWiseOperator::op ## oper: return Op ## oper(a,b,c)
            switch (op)
            {
            ForAllTernaryOps(CaseTernaryTensorOp);
            default: return 0; // (failure)
            }
        }
    };
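A usage sketch for the cache above (my own illustration, not part of this change): repeated requests on the same device share the cached column unless a longer one is required.

    // illustrative only: the cached ones-column is reused across calls on the same device
    static void GetOnesVectorUsageExample()
    {
        auto ones1 = GetOnesVector<float>(1024, /*deviceId=*/0); // allocates a [1024 x 1] column of ones
        auto ones2 = GetOnesVector<float>( 512, /*deviceId=*/0); // shorter request: reuses the 1024-row column
        assert(ones1.get() == ones2.get());
        auto ones3 = GetOnesVector<float>(4096, /*deviceId=*/0); // longer: re-allocates and replaces the cache entry
        assert(ones3->GetNumRows() == 4096);
    }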
    // -----------------------------------------------------------------------
    // function to compute the value for a given output location (perform reduction if needed)
    // -----------------------------------------------------------------------

#define C_size_t CUDA_LONG
#define C_int CUDA_LONG
#define C_unsigned_int CUDA_LONG

    template<class ElemType, C_size_t N, C_int M, C_int m>
    struct TensorOpReduce
    {
        // this version for m >= 0
        static __device__ ElemType Compute(FixedArray<ElemType*, N> pointers, ElementWiseOperator op,
                                           const FixedArray<C_unsigned_int, M> & reducingOpDims, const FixedMatrix<C_int, N, M> & reducingStrides)
        {
            // start with index 0
            // Using 'double' since we are memory-bound anyway.
            double/*ElemType*/ aggregate = TensorOpReduce<ElemType, N, M, m - 1>::Compute(pointers, op, reducingOpDims, reducingStrides);
            // apply this index to the pointers
            C_size_t dim = reducingOpDims[m];
            for (C_size_t k = 1/*done with k=0 already*/; k < dim; k++)
            {
                // bump the pointers
                for (C_size_t i = 0; i < N; i++)
                    pointers[i] += reducingStrides(i,(C_size_t)m);
                ElemType val = TensorOpReduce<ElemType, N, M, m - 1>::Compute(pointers, op, reducingOpDims, reducingStrides);
                aggregate += val;
            }
            return (ElemType)aggregate;
        }
    };

    // this one terminates the template recursion over reduction dimensions
    // The pointers are pointing to the input element.
    template<class ElemType, C_size_t N, C_int M>
    struct TensorOpReduce<ElemType, N, M, /*m=*/-1>
    {
        // this version for m = -1
        // the pointers are pointing to the right location(s) to take the operation over
        static __device__ ElemType Compute(FixedArray<ElemType*, N> pointers, ElementWiseOperator op,
                                           const FixedArray<C_unsigned_int, M> & /*reducingOpDims*/, const FixedMatrix<C_int, N, M> & /*reducingStrides*/)
        {
            return TensorOps<ElemType>::Compute(pointers, op); // finally computing something!
        }
    };
    // -----------------------------------------------------------------------
    // perform loop over regular index k for N-nary operations (N counting the output)
    // -----------------------------------------------------------------------

    // The canonical case, vector op without reduction, is this PTX function:
    // _ZN9Microsoft3MSR4CNTK15_launchTensorOpIfLi3ELi0ELi1EEEvT_NS1_10FixedArrayIPS3_XT0_EEES3_NS1_19ElementWiseOperatorENS4_IiXT2_EEENS1_11FixedMatrixIiXT0_EXT2_EEENS4_IiXT1_EEENS9_IiXT0_EXT1_EEEi
    //                                      float ^      ^ aggregate loop
    //                                      args? ^       ^ input dims
    // _ZN9Microsoft3MSR4CNTK15_launchTensorOpIfLi2ELi0ELi1EEEvT_NS1_10FixedArrayIPS3_XT0_EEES3_NS1_19ElementWiseOperatorENS4_IiXT2_EEENS1_11FixedMatrixIiXT0_EXT2_EEENS4_IiXT1_EEENS9_IiXT0_EXT1_EEEi

    // increment a pointer by a number of elements
    // This will later change into pre-scaled strides.
    template<class ElemType>
    static __device__ void IncPtr(ElemType * &p, C_int index, C_int stride)
    {
        //p = (ElemType*)(byteOffset + (char *)p);
        p = p + index * stride;
    }

    // The 'pointers' only refer to a single element, so we will bump them in-place to perform indexing.
    template<class ElemType, C_size_t N, C_int M, C_int K, C_int k>
    struct TensorOpElement
    {
        // template-recursive version loops over indices
        static __device__ void Compute(CUDA_LONG id, ElemType beta, FixedArray<ElemType*, N> & pointers, ElemType alpha, ElementWiseOperator op,
                                       const FixedArray<C_unsigned_int, K> & regularOpStrides, const FixedMatrix<C_int, N, K> & regularStrides,
                                       const FixedArray<C_unsigned_int, M> & reducingOpDims, const FixedMatrix<C_int, N, M> & reducingStrides)
        {
            // map id (location on grid) to index[k]
            C_size_t stride = regularOpStrides[(C_size_t)k];
            C_size_t index = id / stride; // this dimension
            id = id % stride;             // remaining dimensions inside this
            // apply this index to the pointers
            for (C_size_t i = 0; i < N; i++)
                pointers[i] += index * regularStrides(i,(C_size_t)k); // now this dimension is taken care of
            // process the previous index
            TensorOpElement<ElemType, N, M, K, k - 1>::Compute(id, beta, pointers, alpha, op, regularOpStrides, regularStrides, reducingOpDims, reducingStrides);
        }
    };

    // specialization for k=0 where stride is guaranteed to be 1
    template<class ElemType, C_size_t N, C_int M, C_int K>
    struct TensorOpElement<ElemType, N, M, K, /*k=*/0>
    {
        // template-recursive version loops over indices
        static __device__ void Compute(CUDA_LONG id, ElemType beta, FixedArray<ElemType*, N> & pointers, ElemType alpha, ElementWiseOperator op,
                                       const FixedArray<C_unsigned_int, K> & regularOpStrides, const FixedMatrix<C_int, N, K> & regularStrides,
                                       const FixedArray<C_unsigned_int, M> & reducingOpDims, const FixedMatrix<C_int, N, M> & reducingStrides)
        {
            // map id (location on grid) to index[k]
            C_size_t index = id; // this dimension
            // apply this index to the pointers
            for (C_size_t i = 0; i < N; i++)
                pointers[i] += index * regularStrides(i,0); // now this dimension is taken care of
            // process the previous index
            TensorOpElement<ElemType, N, M, K, -1>::Compute(/*id*/0, beta, pointers, alpha, op, regularOpStrides, regularStrides, reducingOpDims, reducingStrides);
        }
    };

    // specialization for k = -1 terminates the template recursion
    template<class ElemType, C_size_t N, C_int M, C_int K>
    struct TensorOpElement<ElemType, N, M, K, /*k=*/-1>
    {
        // template-recursion-terminating version computes the actual value for this output location
        // now the pointers point to the right element
        static __device__ void Compute(CUDA_LONG /*id*/, ElemType beta, FixedArray<ElemType*, N> & pointers, ElemType alpha, ElementWiseOperator op,
                                       const FixedArray<C_unsigned_int, K> & /*regularOpStrides*/, const FixedMatrix<C_int, N, K> & /*regularStrides*/,
                                       const FixedArray<C_unsigned_int, M> & reducingOpDims, const FixedMatrix<C_int, N, M> & reducingStrides)
        {
            // compute the operation for this output coordinate
            // This may still involve a reduction over inverse-broadcasting dimensions.
            ElemType val = TensorOpReduce<ElemType, N, M, M - 1>::Compute(pointers, op, reducingOpDims, reducingStrides);
            // scale
            val *= alpha;
            // combine with previous value in target matrix, then write it out
            auto * pout = pointers[N - 1];
            if (beta != 0)
                val += beta * *pout;
            // save
            *pout = val;
        }
    };
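To make the index recursion above concrete, a worked example (my own, with made-up dimensions):

    // worked example (made-up dims): output tensor [4 x 3], so regularOpStrides = { 1, 4 }
    // and id runs over 0..11. For id = 7, the k=1 step computes index = 7 / 4 = 1 and
    // leaves id = 7 % 4 = 3; the k=0 step then takes index = 3 (its stride is known to be 1).
    // Thread 7 thus addresses element (3, 1), each pointer having been bumped by
    // 3 * regularStrides(i, 0) + 1 * regularStrides(i, 1) elements.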
    // -----------------------------------------------------------------------
    // kernel and launch
    // -----------------------------------------------------------------------

    // the top-level kernel
    template<class ElemType, C_size_t N, C_int M, C_int K>
    __global__ void _launchTensorOp(ElemType beta, FixedArray<ElemType*, N> pointers, ElemType alpha, ElementWiseOperator op,
                                    FixedArray<C_unsigned_int, K> regularOpStrides, FixedMatrix<C_int, N, K> regularStrides,
                                    FixedArray<C_unsigned_int, M> reducingOpDims, FixedMatrix<C_int, N, M> reducingStrides, CUDA_LONG numElements)
    {
        CUDA_LONG id = GridDim::GetLinearThreadId();
        if (id >= numElements)
            return;
        TensorOpElement<ElemType, N, M, K, K - 1>::Compute(id, beta, pointers, alpha, op, regularOpStrides, regularStrides, reducingOpDims, reducingStrides);
    }

    // launch tensor op with CUDA
    // All dimensions (N-ariness, number of input dimensions K and number of reduction dimensions M) are bound to template parameters now.
    template<class ElemType, C_size_t N, C_int M, C_int K>
    static void LaunchTensorOp(ElemType beta, array<ElemType*, N> pointerVector, ElemType alpha, ElementWiseOperator op,
                               const SmallVector<size_t> & regularOpDims, const array<SmallVector<ptrdiff_t>, N> & regularStrideVectors,
                               const SmallVector<size_t> & reducingOpDimVector, const array<SmallVector<ptrdiff_t>, N> & reducingStrideVectors)
    {
        // copy all parameters to CUDA-compatible data structures
        FixedArray<ElemType*, N> pointers(pointerVector);
        SmallVector<C_size_t> regularOpStrideVector; // kernel needs the strides for converting thread index back to multi-dimensional tensor index
        C_size_t numElements = 1;
        for (C_size_t k = 0; k < regularOpDims.size(); k++)
        {
            regularOpStrideVector.push_back(numElements);
            numElements *= (C_size_t)regularOpDims[k];
        }
        FixedArray<C_unsigned_int, K> regularOpStrides(regularOpStrideVector);
        FixedMatrix<C_int, N, K> regularStrides(regularStrideVectors);
        FixedArray<C_unsigned_int, M> reducingOpDims(reducingOpDimVector);
        FixedMatrix<C_int, N, M> reducingStrides(reducingStrideVectors);

        CUDA_LONG NN = (CUDA_LONG)numElements;
        cudaEvent_t done = nullptr;
        if (do_sync) CUDA_CALL(cudaEventCreate(&done));
        GridDim grid(NN);
        _launchTensorOp<ElemType, N, M, K> << <grid.m_blocksPerGrid, grid.m_threadsPerBlock, 0, t_stream >> >(beta, pointers, alpha, op, regularOpStrides, regularStrides, reducingOpDims, reducingStrides, NN);
        if (do_sync) CUDA_CALL(cudaEventRecord(done));
        if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
        if (do_sync) CUDA_CALL(cudaEventDestroy(done));
    }
    // for linear unary ops, we need to define a functor for every function for use as a template parameter (lambda syntax doesn't work in CUDA 7)
#define DefineUnaryTensorFunctor(oper) \
    struct Functor ## oper { template<class ElemType> static __device__ ElemType f(ElemType a) { return Op ## oper(a); } };
    ForAllUnaryOps(DefineUnaryTensorFunctor);

    // the top-level kernel for linear unary ops
    // Note: If we have a beta, we have 2 memory accesses, so this optimization may no longer be needed as we are memory-bound.
    template<class ElemType, class FN>
    __global__ void _launchUnaryTensorOp(ElemType beta, const ElemType * pa, ElemType * pb, ElemType alpha, CUDA_LONG numElements)
    {
        CUDA_LONG id = GridDim::GetLinearThreadId();
        if (id >= numElements)
            return;
        ElemType a = pa[id];
        ElemType val = FN::f(a);
        val *= alpha;
        if (beta != 0)
            val += beta * pb[id];
        pb[id] = val;
    }
    // version without beta and alpha
    template<class ElemType, class FN>
    __global__ void _launchUnaryTensorOp(const ElemType * pa, ElemType * pb, CUDA_LONG numElements)
    {
        CUDA_LONG id = GridDim::GetLinearThreadId();
        if (id >= numElements)
            return;
        ElemType a = pa[id];
        ElemType val = FN::f(a);
        pb[id] = val;
    }

    // special case of linear unary operation
    template<class ElemType>
    static void LaunchUnaryTensorOp(ElemType beta, const ElemType * pa, ElemType * pb, ElemType alpha, ElementWiseOperator op, size_t regularOpDim)
    {
        CUDA_LONG NN = (CUDA_LONG)regularOpDim;

#define CaseLaunchUnaryTensorOp(oper) case ElementWiseOperator::op ## oper: \
        if (beta == 0 && alpha == 1) \
            return _launchUnaryTensorOp<ElemType,Functor ## oper> << <grid.m_blocksPerGrid, grid.m_threadsPerBlock, 0, t_stream >> >(pa, pb, NN); \
        else \
            return _launchUnaryTensorOp<ElemType,Functor ## oper> << <grid.m_blocksPerGrid, grid.m_threadsPerBlock, 0, t_stream >> >(beta, pa, pb, alpha, NN);

        cudaEvent_t done = nullptr;
        if (do_sync) CUDA_CALL(cudaEventCreate(&done));
        GridDim grid(NN);
        switch (op)
        {
        ForAllUnaryOps(CaseLaunchUnaryTensorOp);
        default: LogicError("LaunchTensorOp1: Unknown op code %d.", (int)op);
        }
        if (do_sync) CUDA_CALL(cudaEventRecord(done));
        if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
        if (do_sync) CUDA_CALL(cudaEventDestroy(done));
    }
// -----------------------------------------------------------------------
|
||||
// map runtime parameters N to template parameters
|
||||
// -----------------------------------------------------------------------
|
||||
|
||||
// tensor operation with k+1 dimensions (-1 means scalar)
|
||||
template<class ElemType, C_size_t N, C_int K>
|
||||
static void TensorOpWithRegularLoop(ElemType beta, const array<ElemType*, N> & pointers, ElemType alpha, ElementWiseOperator op,
|
||||
const SmallVector<size_t> & regularOpDims, const array<SmallVector<ptrdiff_t>, N> & regularStrides,
|
||||
const SmallVector<size_t> & reducingOpDims, const array<SmallVector<ptrdiff_t>, N> & reducingStrides)
|
||||
{
|
||||
size_t dims = reducingOpDims.size();
|
||||
switch (dims)
|
||||
{
|
||||
case 2: return LaunchTensorOp<ElemType, N, 2, K>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
|
||||
case 1: return LaunchTensorOp<ElemType, N, 1, K>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
|
||||
case 0: return LaunchTensorOp<ElemType, N, 0, K>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
|
||||
default: LogicError("TensorOp: %d non-flattened reduction dimensions are not supported.", (C_int)dims);
|
||||
}
|
||||
}
|
||||
|
||||
// tensor operation, generalized in number of arguments
|
||||
// This function now expands into different k. It also eliminates the offsets by adding them to the pointers.
|
||||
template<class ElemType, C_size_t N>
|
||||
static void TensorOpN(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, ElementWiseOperator op,
|
||||
const array<size_t, N> & offsets,
|
||||
const SmallVector<size_t> & regularOpDims, const array<SmallVector<ptrdiff_t>, N> & regularStrides,
|
||||
const SmallVector<size_t> & reducingOpDims, const array<SmallVector<ptrdiff_t>, N> & reducingStrides)
|
||||
{
|
||||
for (C_size_t i = 0; i < N; i++) // N = a small constant, this will be unrolled
|
||||
pointers[i] += offsets[i];
|
||||
size_t dims = regularOpDims.size();
|
||||
switch (dims)
|
||||
{
|
||||
case 4: return TensorOpWithRegularLoop<ElemType, N, 4>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
|
||||
case 3: return TensorOpWithRegularLoop<ElemType, N, 3>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
|
||||
case 2: return TensorOpWithRegularLoop<ElemType, N, 2>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
|
||||
case 1: return TensorOpWithRegularLoop<ElemType, N, 1>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
|
||||
case 0: return TensorOpWithRegularLoop<ElemType, N, 0>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
|
||||
default: LogicError("TensorOp: %d non-flattened input dimensions are not supported.", (C_int)dims);
|
||||
}
|
||||
}
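// [Editor's note -- worked example, not part of the original commit.] A broadcast copy,
// e.g. replicating a (100 x 1) column 'a' into each column of a (100 x 20) 'this' with
// op = opCopy, would arrive here (N = 2, counting the output) with something like
//     regularOpDims  = (100, 20)              // the output shape, flattened where possible
//     regularStrides = { (1, 0), (1, 100) }   // input stride 0 over columns = broadcast
//     reducingOpDims = ()                     // no reduction
// and dispatch to TensorOpWithRegularLoop<ElemType, 2, 2>, then LaunchTensorOp<ElemType, 2, 0, 2>.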

// -----------------------------------------------------------------------
// entry points from Matrix.cpp
// -----------------------------------------------------------------------

// perform unary operation 'op' on 'a', giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides
// This binds the N-ariness to a template parameter N, and gets the data pointers out from the matrix objects.
template<class ElemType>

@ -4844,6 +4469,30 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    if (regularOpDims.size() == 1 && regularStrides[0][0] == 1 && regularStrides[1][0] == 1 && reducingOpDims.size() == 0)
        return LaunchUnaryTensorOp<ElemType>(beta, a.m_pArray + offsets[0], m_pArray + offsets[1], alpha, op, regularOpDims[0]);

    // special case: reducing a matrix onto a column vector; can be done with SGEMM
    // Note: A minor risk is that with this, our own reduction function will rarely be used.
    // That function was tested to give the same results with 'double', and nearly the same with 'float' (different summation order matters).
    else if (op == ElementWiseOperator::opCopy &&   // we are just adding to target without any further operation
#ifdef _DEBUG
        sizeof(ElemType) == sizeof(float) &&        // in debug don't shortcut 'double' so we have some test of our own codepath
#endif
        regularOpDims.size() == 1 && regularStrides[0][0] == 1 && regularStrides[1][0] == 1 &&   // we are processing a column
        reducingOpDims.size() == 1 && reducingStrides[0][0] >= (ptrdiff_t)regularOpDims[0])      // reducing across columns and no overlap
    {
        assert(reducingStrides[1][0] == 0);
        auto ARows = regularOpDims[0];      // vertical steps
        auto ACols = reducingOpDims[0];     // horizontal steps (reduction)
        auto ALd = reducingStrides[0][0];   // horizontal step width through matrix
        cublasHandle_t cuHandle = GetCublasHandle(a.GetComputeDeviceId());
        CUBLAS_CALL(cublas_gemm(cuHandle, CUBLAS_OP_N, CUBLAS_OP_N, (int)/*CRows=*/ARows, /*CCols=*/1, (int)ACols, &alpha,
                                /*A00=*/a.m_pArray + offsets[0], (int)ALd,
                                /*B00=*/GetOnesVector<ElemType>(ACols, a.GetComputeDeviceId())->m_pArray, (int)/*BRows=*/ACols, &beta,
                                /*C00=*/m_pArray + offsets[1], (int)/*CRows=*/ARows));
        return;
    }
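    // [Editor's note -- not part of the original commit.] The SGEMM shortcut above uses the
    // identity colsum(A) = A * ones(ACols x 1): multiplying the (ARows x ACols) view of 'a'
    // (leading dimension ALd) by a ones vector reduces across columns in a single cuBLAS call,
    // with alpha/beta supplying the usual scale-and-accumulate semantics.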

    // TODO: Add a special case for tensor bias reduction. cudnn is ~7% faster on Image/QuickE2E.

    // regular case
    else
        return TensorOpN<ElemType, 2>(beta, array<ElemType*, 2> { a.m_pArray, m_pArray }, alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);

@ -4859,6 +4508,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    a.PrepareDevice();
    if (a.GetComputeDeviceId() != GetComputeDeviceId() || b.GetComputeDeviceId() != GetComputeDeviceId())
        InvalidArgument("All matrices must be on the same GPU");

    return TensorOpN<ElemType, 3>(beta, array<ElemType*, 3> { a.m_pArray, b.m_pArray, m_pArray }, alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
}

@ -4875,7 +4525,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    return TensorOpN<ElemType, 4>(beta, array<ElemType*, 4> { a.m_pArray, b.m_pArray, c.m_pArray, m_pArray }, alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
}

// =======================================================================
// explicit instantiations business
// =======================================================================

@ -4886,10 +4535,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template class DeviceBoundNumber<double>;

template<class ElemType>
cublasHandle_t GPUMatrix<ElemType>::s_cuHandle[GPUMatrix<ElemType>::MaxGpus]={0};
cublasHandle_t GPUMatrix<ElemType>::s_cuHandle[GPUMatrix<ElemType>::MaxGpus] = { 0 };

template<class ElemType>
void* GPUMatrix<ElemType>::s_curandGenerator=NULL;
void* GPUMatrix<ElemType>::s_curandGenerator = NULL;

// We use Matrix<char> as the backing store for QuantizedMatrix
// Let's explicitly instantiate the methods we need for that purpose

@ -9,7 +9,7 @@
#include "File.h"
#include "Helpers.h"
#include "CommonMatrix.h"
#include "DataTensor.h"   // only for SmallVector; I was hoping to keep this out
#include "TensorShape.h"  // only for SmallVector; I was hoping to keep this out
#include "DebugUtil.h"
#include "BestGpu.h"      // for CPUONLY macro
#include "ConcStack.h"

@ -47,9 +47,7 @@ typedef struct CUstream_st *cudaStream_t;
void MATH_API SetStream(cudaStream_t stream);
cudaStream_t MATH_API GetStream();

namespace Microsoft {
    namespace MSR {
        namespace CNTK {
namespace Microsoft { namespace MSR { namespace CNTK {

// -----------------------------------------------------------------------
// DeviceBoundNumber -- This class represents a number which resides on a particular device. Use it to avoid unnecessary transfers between CPU and GPU

@ -506,7 +504,7 @@ namespace Microsoft {
}}}

// Error handling
template<typename ERRTYPE> static const char * CudaErrString(ERRTYPE x);
template<typename ERRTYPE> const char * CudaErrString(ERRTYPE x);   // actual error function is defined inside .cu files
template<typename ERRTYPE> static void CudaCall(ERRTYPE retCode, const char * exprString, const char * libName, ERRTYPE successCode)
{
    if (retCode != successCode)

@ -523,7 +521,9 @@ template<typename ERRTYPE> static void CudaCall(ERRTYPE retCode, const char * ex
    }
}

#define CUDA_CALL(expr)     (CudaCall((expr), #expr, "CUDA", cudaSuccess))
#define CUBLAS_CALL(expr)   (CudaCall((expr), #expr, "CUBLAS", CUBLAS_STATUS_SUCCESS))
#define CUSPARSE_CALL(expr) (CudaCall((expr), #expr, "CUSPARSE", CUSPARSE_STATUS_SUCCESS))
#define CURAND_CALL(expr)   (CudaCall((expr), #expr, "CURAND", CURAND_STATUS_SUCCESS))
#define CUDNN_CALL(expr)    (CudaCall((expr), #expr, "cuDNN", CUDNN_STATUS_SUCCESS))

@ -4,15 +4,22 @@
// </copyright>
//

#pragma once

#include "BestGpu.h"

#ifndef CPUONLY

#include <float.h>
#include <cuda_runtime.h>
#pragma push_macro("TENSOR_OPS_DECL")
#define TENSOR_OPS_DECL __device__ __host__
#include "CommonMatrix.h"
#include "GPUMatrix.h"
#include "TensorOps.h"    // for exp_() etc.
#include "device_functions.h"
#include <cuda_runtime.h>
#include <assert.h>
#include <float.h>
#pragma pop_macro("TENSOR_OPS_DECL")

// REVIEW alexeyk: disable warnings properly for GCC/clang
#ifdef _MSC_VER

@ -36,38 +43,116 @@

#define IDX2C(i,j,ld) (((j)*(ld))+(i)) // 0 based indexing

// CUDA atomicAdd() only exists for 'float'. This is the 'double' version.
static __inline__ __device__ double atomicAdd(double* address, double val)
{
    unsigned long long int* address_as_ull = (unsigned long long int*)address;
    unsigned long long int old = *address_as_ull, assumed;
    do {
        assumed = old;
        old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
    } while (assumed != old);
    return __longlong_as_double(old);
}
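// [Editor's note -- not part of the original commit.] The CAS loop above is the standard
// compare-and-swap emulation of a double-precision atomic add: each iteration reads the
// current bit pattern, computes old + val in double, and tries to install the result; if
// another thread updated the address in between, atomicCAS returns the new bits and the
// loop retries. A typical caller looks like:
//     __global__ void accumulate(double* sum, const double* x, int n)
//     { int i = blockIdx.x * blockDim.x + threadIdx.x; if (i < n) atomicAdd(sum, x[i]); }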

// TODO: replace this with TensorOps.h LogAdd(). It differs in using ElemType throughout, while this one seems to use 'double' versions of exp() and log().
// The 'k' in the name is to avoid naming conflicts with various versions of logadd() that are defined throughout the codebase.
template<class ElemType>
static inline __device__ __host__ ElemType logaddk(ElemType x, ElemType y)
{
    ElemType temp, diff, z;

    if (x < y)
    {
        temp = x; x = y; y = temp;
    }
    diff = y - x;
    if (diff < MINLOGEXP)
    {
        return (x < LSMALL) ? LZERO : x;
    }
    else
    {
        z = exp(diff);
        return x + log(1.0 + z);
    }
}
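// [Editor's note -- numeric check, not part of the original commit.] logaddk computes
// log(exp(x) + exp(y)) without overflow by factoring out the larger argument:
//     logaddk(0.0, 0.0)    == 0.0 + log(1 + exp(0)) == log(2) ~= 0.6931
//     logaddk(1000.0, 0.0) == 1000.0   (the negligible term is dropped via the MINLOGEXP cutoff)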

namespace Microsoft { namespace MSR { namespace CNTK {

// ---------------------------------------------------------------------------
// GridDim -- helper to choose the CUDA grid dimensions
// ---------------------------------------------------------------------------

// TODO: move the computation of 'id' here as well
template<class INT, class INT2>
static INT CeilDiv(INT a, INT2 b)   // ceil(a/b)
{
    return (INT)(((size_t)a + (size_t)b - 1) / (size_t)b);  // these size_t casts are necessary since b may be INT_MAX (for maxGridSize[])
}

struct GridDim
{
    static const CUDA_LONG maxThreadsPerBlock = 512;    // use this many threads per block
    static const CUDA_LONG minBlocksPerGrid = 48;       // use at least that many blocks --TODO: base this on actual hardware
    static const CUDA_LONG maxWarpsPerBlock = 16;       // use this many warps per block

    // use these for launching
    //   GridDim grid(NN);
    //   kernel<<<grid.m_blocksPerGrid, grid.m_threadsPerBlock, ...>>>(...)
    int m_blocksPerGrid, m_threadsPerBlock; // (these may in the future be extended to multi-dimensional ones)
    CUDA_LONG m_N;

    GridDim(CUDA_LONG N)    // linear grid
    {
        m_N = N;
        if (N == 0)     // CUDA will fail to launch with 0 blocks
            N = 1;
        m_threadsPerBlock = GridDim::maxThreadsPerBlock;
        m_blocksPerGrid = (N + m_threadsPerBlock - 1) / m_threadsPerBlock;
        if (m_blocksPerGrid < minBlocksPerGrid)

        // get device information
        const auto & props = GetDeviceProps();
        CUDA_LONG numProcs = props.multiProcessorCount;
        CUDA_LONG warpSize = props.warpSize;

        // distribute warps evenly over processors
        CUDA_LONG warpsPerProc = CeilDiv(N, numProcs * warpSize);

        // if too many warps per block then reduce #warps
        if (warpsPerProc > maxWarpsPerBlock)
        {
            // we cannot fill all blocks -> use fewer threads
            m_threadsPerBlock = (N + minBlocksPerGrid - 1) / minBlocksPerGrid;
            // round to multiples of 32 (warp size) for efficient memory access
            m_threadsPerBlock = (m_threadsPerBlock + 31) / 32 * 32;
            m_blocksPerGrid = (N + m_threadsPerBlock - 1) / m_threadsPerBlock;
            CUDA_LONG overBy = CeilDiv(warpsPerProc, maxWarpsPerBlock);     // we are over by this factor
            warpsPerProc = CeilDiv(warpsPerProc, overBy);
        }

        // put it back together
        m_threadsPerBlock = warpsPerProc * warpSize;
        m_blocksPerGrid = CeilDiv(N, m_threadsPerBlock);
        if (m_blocksPerGrid == 1)
            m_threadsPerBlock = N;  // don't launch more than necessary --TODO: Does this make a difference at all?
        assert(m_blocksPerGrid * m_threadsPerBlock >= N);
    }
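    // [Editor's note -- worked example under the new code path above, not part of the
    // original commit.] Assuming N = 10000 on a device with 8 SMs and warpSize 32:
    //     warpsPerProc = CeilDiv(10000, 8 * 32) = 40  > maxWarpsPerBlock (16)
    //     overBy       = CeilDiv(40, 16) = 3,  warpsPerProc = CeilDiv(40, 3) = 14
    //     m_threadsPerBlock = 14 * 32 = 448,  m_blocksPerGrid = CeilDiv(10000, 448) = 23
    // and 23 * 448 = 10304 >= 10000, satisfying the assert.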

    static std::vector<cudaDeviceProp> CacheDeviceProps()
    {
        int numDevices;
        CUDA_CALL(cudaGetDeviceCount(&numDevices));
        std::vector<cudaDeviceProp> props(numDevices);
        for (int i = 0; i < numDevices; i++)
            CUDA_CALL(cudaGetDeviceProperties(&props[i], i));
#if 1   // on Linux, maxGridSize[0] gets reported as 0
        for (int i = 0; i < numDevices; i++)
            fprintf(stderr, "%d procs %d warps %d %d %d max grid on %s\n", (int)props[i].multiProcessorCount, (int)props[i].warpSize, (int)props[i].maxGridSize[0], (int)props[i].maxGridSize[1], (int)props[i].maxGridSize[2], props[i].name);
#endif
        return props;
    }

    // get device properties of current device
    static const cudaDeviceProp & GetDeviceProps()
    {
        static std::vector<cudaDeviceProp> props = CacheDeviceProps();  // thread-safe according to C++ standard
        int deviceId;
        cudaGetDevice(&deviceId);
        return props[deviceId];
    }

    // compute our location on the grid
    static __device__ CUDA_LONG GetLinearThreadId()
    {

@ -83,9 +168,6 @@ struct GridDim
#define UNUSED_FUNCTION_ATTRIBUTE
#endif

// Predefine this for later.
static __inline__ __device__ double atomicAdd(double* address, double val) UNUSED_FUNCTION_ATTRIBUTE;

// ===========================================================================
// CUDA kernels follow, lots of them
// ===========================================================================

@ -97,18 +179,6 @@ static __inline__ __device__ double atomicAdd(double* address, double val) UNUSE
// (ElemType *res, CUDA_LONG N), a pointer and length of the output block. Each thread computes a function
// of the inputs for one value in the output.

// This macro overloads _x() with float and double arguments, and inlines the correct library function. This simplifies templated kernel code.
// TODO: merge with similar definition in TensorOps.h
#define DEF_ELEMENT_PRIMITIVE(x) __device__ __forceinline__ float _##x(float f) { return x##f(f); } __device__ __forceinline__ double _##x(double f) { return x(f); }

DEF_ELEMENT_PRIMITIVE(exp)
DEF_ELEMENT_PRIMITIVE(log)
DEF_ELEMENT_PRIMITIVE(tanh)
DEF_ELEMENT_PRIMITIVE(sqrt)
DEF_ELEMENT_PRIMITIVE(fabs)
DEF_ELEMENT_PRIMITIVE(cos)
DEF_ELEMENT_PRIMITIVE(sin)
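// [Editor's note -- expansion sketch, not part of the original commit; these overloads are
// the ones this diff retires in favor of the TensorOps.h helpers.] For x = exp the macro yields:
//     __device__ __forceinline__ float  _exp(float f)  { return expf(f); }
//     __device__ __forceinline__ double _exp(double f) { return exp(f); }
// so templated kernels could call _exp(a[id]) for either ElemType.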

template<class ElemType>
__global__ void _elementWisePowerOnCuda(
    const ElemType alpha,

@ -147,6 +217,7 @@ __global__ void _elementWisePowerOnCuda(
};

// Note that this code is inefficient on CUDA due to diverging code paths.
// Use Sigmoid() in TensorOps.h instead, which solves this problem.
template<class ElemType>
__global__ void _elementWiseSigmoidOnCuda(
    const ElemType *a,

@ -159,12 +230,12 @@ __global__ void _elementWiseSigmoidOnCuda(
#else
    if (a[id] >= 0)
    {
        ElemType e = _exp(-a[id]);
        ElemType e = exp_(-a[id]);
        res[id] = 1 / (1 + e);
    }
    else
    {
        ElemType e = _exp(a[id]);
        ElemType e = exp_(a[id]);
        res[id] = e / (1 + e);
    }
#endif

@ -186,7 +257,7 @@ __global__ void _assignSigmoidOf(
    res[id] = Microsoft::MSR::CNTK::Sigmoid(a[id]);
#else
    ElemType negElem = -a[id];
    ElemType e = _exp(negElem);
    ElemType e = exp_(negElem);

    res[id] = 1 / (e + 1);
#endif

@ -219,7 +290,7 @@ __global__ void _elementWiseTanhOnCuda(
    const CUDA_LONG N)
{
    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id,N);
    res[id] = _tanh(a[id]);
    res[id] = tanh_(a[id]);
};

//to prevent negative values caused by floating operations, we force inputs to be >=0

@ -231,7 +302,7 @@ __global__ void _elementWiseSqrtOnCuda(
    const CUDA_LONG N)
{
    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id,N);
    res[id] = _sqrt(max((ElemType)0, a[id]));
    res[id] = sqrt_(max((ElemType)0, a[id]));
};

template<class ElemType>

@ -241,7 +312,7 @@ __global__ void _elementWiseExpOnCuda(
    const CUDA_LONG N)
{
    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id,N);
    res[id] = _exp(a[id]);
    res[id] = exp_(a[id]);
};

template<class ElemType>

@ -251,7 +322,7 @@ __global__ void _elementWiseLogOnCuda(
    const CUDA_LONG N)
{
    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id,N);
    res[id] = (a[id] < EPS_IN_LOG) ? LOG_OF_EPS_IN_LOG : _log(a[id]);
    res[id] = (a[id] < EPS_IN_LOG) ? LOG_OF_EPS_IN_LOG : log_(a[id]);
};

template<class ElemType>

@ -261,7 +332,7 @@ __global__ void _elementWiseAbsOnCuda(
    const CUDA_LONG N)
{
    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id,N);
    res[id] = _fabs(a[id]);
    res[id] = fabs_(a[id]);
};

template<class ElemType>

@ -271,7 +342,7 @@ __global__ void _elementWiseCosineOnCuda(
    const CUDA_LONG N)
{
    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id,N);
    res[id] = _cos(a[id]);
    res[id] = cos_(a[id]);
};

template<class ElemType>

@ -281,7 +352,7 @@ __global__ void _elementWiseNegativeSineOnCuda(
    const CUDA_LONG N)
{
    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id,N);
    res[id] = -_sin(a[id]);
    res[id] = -sin_(a[id]);
};

template<class ElemType>

@ -1210,42 +1281,60 @@ __global__ void _tensorShuffleScaleAndAddRowSparse(
    ElemType* cnzValues,    //target nz values
    GPUSPARSE_INDEX_TYPE* cRowIndex,
    GPUSPARSE_INDEX_TYPE* cColCSCIndex,
    size_t D, size_t S, size_t M, size_t K, size_t T)
    size_t D, size_t S, size_t M, size_t K, size_t T,
    size_t nz)
{
    CUDA_LONG col = blockDim.x * blockIdx.x + threadIdx.x;   // input tensor of dimension (D x S x M x K x T)
    if (col >= T)
    CUDA_LONG N = blockDim.x * blockIdx.x + threadIdx.x;     // input tensor of dimension (D x S x M x K x T)
    if (N >= nz || N < aColCSCIndex[0])
        return;

    size_t N = D * S * M * K;
    size_t col;
    for (col = 0; col < T; col++)
    {
        if (aColCSCIndex[col + 1] > N)
            break;
    }

    size_t na = aRowIndex[N];
    int start = aColCSCIndex[col];
    int end = aColCSCIndex[col + 1];
    int current = start;

    for (size_t nc = 0; nc < N; nc++)
    // recover the 5 indices from the loop counter
    size_t d = (na            ) % D;
    size_t s = (na / D        ) % S;
    size_t m = (na / D / S    ) % M;
    size_t k = (na / D / S / M) % K;

    // compute index for the a and b/c tensors
    size_t nc = ((s * M + m) * K + k) * D + d;   // output tensor of dimension (D x K x M x S): k/K and s/S swapped

    int rowIdx = start;
    for (size_t na_i = start; na_i < end; na_i++)
    {
        // recover the 5 indices from the loop counter
        size_t d = (nc            ) % D;
        size_t s = (nc / D        ) % S;
        size_t m = (nc / D / S    ) % M;
        size_t k = (nc / D / S / M) % K;
        size_t d_i = (na_i            ) % D;
        size_t s_i = (na_i / D        ) % S;
        size_t m_i = (na_i / D / S    ) % M;
        size_t k_i = (na_i / D / S / M) % K;

        // compute index for the a and b/c tensors
        size_t na = ((s * M + m) * K + k) * D + d;   // output tensor of dimension (D x K x M x S): k/K and s/S swapped

        for (size_t j = start; j < end; j++)
        size_t nc_i = ((s_i * M + m_i) * K + k_i) * D + d_i;   // output tensor of dimension (D x K x M x S): k/K and s/S swapped
        if (nc_i < nc)
        {
            if (aRowIndex[j] == na)
            {
                cnzValues[current] = anzValues[j];
                cRowIndex[current] = nc;
                current++;
                break;
            }
            rowIdx++;
        }
    }

    cColCSCIndex[col] = start;
    cColCSCIndex[col + 1] = end;
    cnzValues[rowIdx] = anzValues[N];
    cRowIndex[rowIdx] = nc;

    if (N == nz - 1)
    {
        for (int i = 0; i <= T; i++)
        {
            cColCSCIndex[i] = aColCSCIndex[i];
        }
    }
}
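// [Editor's note -- index-permutation check, not part of the original commit.] The kernel
// above maps the flat index of element (d, s, m, k) within a (D x S x M x K x T) column to
// the flat index of (d, k, m, s) in the shuffled (D x K x M x S x T) layout. For example,
// with D=2, S=3, M=1, K=4 and na = 10:
//     d = 10 % 2 = 0,  s = (10/2) % 3 = 2,  m = 0,  k = (10/6) % 4 = 1
//     nc = ((s*M + m)*K + k)*D + d = ((2*1 + 0)*4 + 1)*2 + 0 = 18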

template<class ElemType>

@ -2688,25 +2777,82 @@ __global__ void _sparseCSRElemMulDense(
    }
}

template<class ElemType>
__global__ void _isValid(
    const GPUSPARSE_INDEX_TYPE* rowIndex,
    const GPUSPARSE_INDEX_TYPE* colCSCIndex,
    const int rows,
    const int cols,
    const int nz,
    long* d_res
    )
{
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= cols)
        return;

    int start = colCSCIndex[id];
    int end = colCSCIndex[id + 1];
    d_res[0] = 1;

    if (start > end)
    {
        d_res[0] = -1;
        d_res[1] = start;
        d_res[2] = end;
    }
    else if (end > nz)
    {
        d_res[0] = -2;
        d_res[1] = end;
        d_res[2] = nz;
    }
    else
    {
        for (int j = start; j < end; j++) //j points to the value
        {
            if (rowIndex[j] > rows)
            {
                d_res[0] = -3;
                d_res[1] = rowIndex[j];
                d_res[2] = rows;
                break;
            }
        }
    }
}
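// [Editor's note -- illustrative data, not part of the original commit.] For a valid 3 x 2
// CSC matrix with nz = 3, e.g.
//     colCSCIndex = { 0, 2, 3 }   // monotonically non-decreasing, final entry == nz
//     rowIndex    = { 0, 2, 1 }   // row ids within bounds
// every column passes the start <= end, end <= nz, and row-bound checks above, so d_res[0]
// stays 1; any violation records an error code in d_res[0] and the offending pair in d_res[1..2].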

template<class ElemType>
__global__ void _shiftColCSCIndexFromSliceViewToAbsolute(
    GPUSPARSE_INDEX_TYPE* colCSCIndex,
    const int cols
    )
{
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= cols)
        return;

    colCSCIndex[id] = colCSCIndex[id] - colCSCIndex[0];
}
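// [Editor's note -- not part of the original commit; semantics inferred from the code above.]
// Subtracting colCSCIndex[0] rebases a column-index array so that it starts at zero: a slice
// view whose columns span absolute nz positions { 5, 8, 9 } becomes { 0, 3, 4 }, matching nz
// values that are copied out starting at the slice's offset.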

//c = alpha * op(a) * op(b) + beta*c
// TODO: This function can be further improved by loading the kernel in shared memory
template<class ElemType>
__global__ void _dense1DConvMultSparseCSCAndWeightedAddToDense(
    int m,                  // rowDense
    int k,                  // colDense
    int n,                  // colSparse
    int numChannels,        // input num channels
    int numSteps,           // convolution num steps
    int horizontalSubsample,// convolution step size
    bool channelwise,       // pixelwise for normal multiplication and channelwise for convolution operation
    ElemType alpha,
    const int m,                  // rowDense
    const int k,                  // colDense
    const int n,                  // colSparse
    const int numChannels,        // input num channels
    const int numSteps,           // convolution num steps
    const int horizontalSubsample,// convolution step size
    const bool channelwise,       // pixelwise for normal multiplication and channelwise for convolution operation
    const ElemType alpha,
    const ElemType* a,            //dense
    bool transposeA,
    const bool transposeA,
    const ElemType* bnzValues,    //sparse nz values
    const GPUSPARSE_INDEX_TYPE* rowIndex,
    const GPUSPARSE_INDEX_TYPE* colCSCIndex,
    ElemType beta,
    const ElemType beta,
    ElemType* c                   //dense target
    )
{

@ -2828,15 +2974,15 @@ __global__ void _reshape(

    int currentCol = id;
    int oldColLower = (newNumRows * currentCol) / oldNumRows;
    int oldColUpper = (newNumRows * (currentCol + 1)) / oldNumRows;

    // initialize to the end and then scan in the right direction in the for-loop
    int currentColStart = oldColumnIndex[oldNumCols];

    for (int oldCol = oldColLower; oldCol <= min(oldColUpper, oldNumCols); oldCol++)
    for (int oldCol = oldColLower; oldCol <= oldNumCols; oldCol++)
    {
        int start = oldColumnIndex[oldCol];
        int end = (oldCol < oldNumCols) ? oldColumnIndex[oldCol + 1] : oldColumnIndex[oldNumCols] + 1;
        bool done = false;

        for (int j = start; j < end; j++) //j points to the value
        {

@ -2845,11 +2991,21 @@ __global__ void _reshape(
            int newCol = index / newNumRows;
            int newRow = index % newNumRows;

            newRowIndex[j] = newRow;
            if (newCol == currentCol)
                newRowIndex[j] = newRow;

            if (newCol >= currentCol && currentColStart > j)
                currentColStart = j;

            if (newCol > currentCol)
            {
                done = true;
                break;
            }
        }

        if (done)
            break;
    }

    newColumnIndex[currentCol] = currentColStart;

@ -3423,7 +3579,7 @@ __global__ void _assignNoiseContrastiveEstimation(
    if (positive)
        prob = -prob;
    ElemType score_noise = log_num_noise_samples + prob;
    ElemType z = logadd(tmp[i], score_noise);
    ElemType z = logaddk(tmp[i], score_noise);
    ElemType logprob = tmp[i] - z;
    ElemType logprob_noise = score_noise - z;
    tmp[i] = -exp(logprob);

@ -3715,40 +3871,6 @@ __global__ void _normalGradForSparseBlock(
    lhsValues[index] = rhs[IDX2C(row, col, numRows)];
}

static __inline__ __device__ double atomicAdd(double* address, double val)
{
    unsigned long long int* address_as_ull = (unsigned long long int*)address;
    unsigned long long int old = *address_as_ull, assumed;

    do {
        assumed = old;
        old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
    } while (assumed != old);

    return __longlong_as_double(old);
}

template<class ElemType>
static __inline__ __device__ ElemType logadd(ElemType x, ElemType y)
{
    ElemType temp, diff, z;

    if (x < y)
    {
        temp = x; x = y; y = temp;
    }
    diff = y - x;
    if (diff < MINLOGEXP)
    {
        return (x < LSMALL) ? LZERO : x;
    }
    else
    {
        z = exp(diff);
        return x + log(1.0 + z);
    }
}

//This function should be called with 1024 threads per block and 1 block
//THIS IS NOT THE MOST EFFICIENT IMPLEMENTATION!!!
template<class ElemType>

@ -4513,7 +4635,7 @@ __global__ void _rcrfBackwardCompute(
    fSum = LZERO;
    for (int j = 0; j < iNumLab; j++)
    {
        fSum = logadd(fSum, alpha[IDX2C(j, t, iNumLab)]);
        fSum = logaddk(fSum, alpha[IDX2C(j, t, iNumLab)]);
    }

    fTmp = alpha[IDX2C(id, t, iNumLab)] - fSum;

@ -4525,10 +4647,10 @@ __global__ void _rcrfBackwardCompute(
    fSum = LZERO;
    for (int m = 0; m < iNumLab; m++)
    {
        fSum = logadd(fSum, alpha[IDX2C(m, t, iNumLab)] + pair_scores[IDX2C(j, m, iNumLab)]);
        fSum = logaddk(fSum, alpha[IDX2C(m, t, iNumLab)] + pair_scores[IDX2C(j, m, iNumLab)]);
    }

    fTmp = logadd(fTmp, beta[IDX2C(j, t + 1, iNumLab)] + alpha[IDX2C(id, t, iNumLab)] + pair_scores[IDX2C(j, id, iNumLab)] - fSum);
    fTmp = logaddk(fTmp, beta[IDX2C(j, t + 1, iNumLab)] + alpha[IDX2C(id, t, iNumLab)] + pair_scores[IDX2C(j, id, iNumLab)] - fSum);
}
}

@ -4589,7 +4711,7 @@ __global__ void _rcrfBackwardCompute(
{
    for (int j = 0; j < iNumLab; j++)
    {
        fTmp = logadd(fTmp, beta_t1[j] + alpha[id] + pair_scores[j] - zeta[j]);
        fTmp = logaddk(fTmp, beta_t1[j] + alpha[id] + pair_scores[j] - zeta[j]);
    }
}

@ -4630,9 +4752,9 @@ __global__ void _rcrfBackwardComputeZeta(
    for (int m = 0; m < iNumLab; m++)
    {
        if (t == iNumPos - 1)
            fSum = logadd(fSum, alpha[IDX2C(m, 0, iNumLab)]);
            fSum = logaddk(fSum, alpha[IDX2C(m, 0, iNumLab)]);
        else
            fSum = logadd(fSum, alpha[IDX2C(m, 0, iNumLab)] + pair_scores[m]);
            fSum = logaddk(fSum, alpha[IDX2C(m, 0, iNumLab)] + pair_scores[m]);
    }

    gzeta[id] = fSum;

@ -4684,7 +4806,7 @@ __global__ void _rcrfTransGrdComputeZeta(
    else
        fTmp = alpha[m];

    fSum = logadd(fSum, pair_scores[m] + fTmp);
    fSum = logaddk(fSum, pair_scores[m] + fTmp);
}

    gzeta[id] = fSum;

@ -4787,7 +4909,7 @@ __global__ void _reductionLogAddSum(
{
    ElemType lSum = LZERO;
    if (tid < s){
        lSum = logadd(partialLogAddSum[tid], partialLogAddSum[tid + s]);
        lSum = logaddk(partialLogAddSum[tid], partialLogAddSum[tid + s]);
        partialLogAddSum[tid] = lSum;
    }
}

@ -4912,4 +5034,6 @@ __global__ void _maskColumnsValue(ElemType *a, const char *columnsMask, CUDA_LON
}
}

}}}

#endif // !CPUONLY

@ -34,11 +34,7 @@ static
#endif
cudaStream_t t_stream;

// support for CudaCall() function template
static const char * CudaErrString(cudaError_t x)    { cudaDeviceSynchronize(); return cudaGetErrorString(x); }
static const char * CudaErrString(cublasStatus_t)   { cudaDeviceSynchronize(); return "(see cublas_api.h & look for cublasStatus_t or CUBLAS_STATUS_xxx)"; }
static const char * CudaErrString(cusparseStatus_t) { cudaDeviceSynchronize(); return "(see cusparse.h & look for cusparseStatus_t or CUSPARSE_STATUS_xxx)"; }
template<> const char * CudaErrString<cusparseStatus_t>(cusparseStatus_t) { cudaDeviceSynchronize(); return "(see cusparse.h & look for cusparseStatus_t or CUSPARSE_STATUS_xxx)"; }

namespace Microsoft { namespace MSR { namespace CNTK {

@ -137,14 +133,29 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    ChangeDeviceTo(deepCopy.m_computeDevice);
    deepCopy.PrepareDevice();

    Resize(deepCopy.m_numRows, deepCopy.m_numCols, deepCopy.m_elemSizeAllocated, deepCopy.m_format, true, false);
    Resize(deepCopy.m_numRows, deepCopy.m_numCols, deepCopy.GetNumNZElements(), deepCopy.m_format, true, false);
    m_nz = deepCopy.m_nz;
    m_sliceViewOffset = 0; // reset to zero as we only start copying starting from the offset in the source matrix
    m_sliceViewOffset = 0; // reset to zero as we only start copying the indices starting from the offset in the source matrix

    CUDA_CALL(cudaMemcpy(BufferPointer(), deepCopy.BufferPointer(), GetSizeElemAllocated(), cudaMemcpyDeviceToDevice));
    CUDA_CALL(cudaMemcpy(MajorIndexLocation(), deepCopy.MajorIndexLocation(), MajorIndexSize(), cudaMemcpyDeviceToDevice));
    CUDA_CALL(cudaMemcpy(BufferPointer(), deepCopy.NzValues(), NzSize(), cudaMemcpyDeviceToDevice));
    CUDA_CALL(cudaMemcpy(MajorIndexLocation(), deepCopy.MajorIndexLocationWithSliceViewOffset(), MajorIndexSize(), cudaMemcpyDeviceToDevice));
    CUDA_CALL(cudaMemcpy(SecondaryIndexLocation(), deepCopy.SecondaryIndexLocation(), SecondaryIndexSize(), cudaMemcpyDeviceToDevice));

    if (deepCopy.m_sliceViewOffset > 0)
    {
        int blocksPerGrid = (int)ceil(1.0*SecondaryIndexCount() / GridDim::maxThreadsPerBlock);
        cudaEvent_t done = nullptr;
        if (do_sync) CUDA_CALL(cudaEventCreate(&done));
        _shiftColCSCIndexFromSliceViewToAbsolute<ElemType><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(
            SecondaryIndexLocation(),
            SecondaryIndexCount()
            );

        if (do_sync) CUDA_CALL(cudaEventRecord(done));
        if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
        if (do_sync) CUDA_CALL(cudaEventDestroy(done));
    }

    m_externalBuffer = false;
    SetMatrixName(deepCopy.m_matrixName);

@ -1002,7 +1013,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {

    template<class ElemType>
    void GPUSparseMatrix<ElemType>::ConvolveAndWeightedAdd(ElemType alpha, const GPUMatrix<ElemType>& lhs, const bool transposeA,
        const GPUSparseMatrix<ElemType>& rhs, const bool transposeB, ElemType beta, GPUMatrix<ElemType>& c, int numChannels, size_t horizontalSubsample, bool padding, bool channelwise)
        const GPUSparseMatrix<ElemType>& rhs, const bool transposeB, ElemType beta, GPUMatrix<ElemType>& c, size_t numChannels, size_t horizontalSubsample, bool padding, bool channelwise)
    {
        if (lhs.GetComputeDeviceId() != rhs.GetComputeDeviceId() || (lhs.GetComputeDeviceId() != c.GetComputeDeviceId()))
            RuntimeError("GPUSparseMatrix<ElemType>::ConvolveAndWeightedAdd: All matrices must be on the same GPU");

@ -1133,7 +1144,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        c.PrepareDevice();
        cudaEvent_t done = nullptr;
        if (do_sync) CUDA_CALL(cudaEventCreate(&done));
        CUDA_LONG N = (CUDA_LONG)c.GetNumCols();
        CUDA_LONG N = (CUDA_LONG)c.GetNumNZElements();
        int blocksPerGrid = (int)ceil(1.0*N / GridDim::maxThreadsPerBlock);
        _tensorShuffleScaleAndAddRowSparse<ElemType><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(
            reinterpret_cast<const ElemType*>(a.BufferPointer()),  // source nz values

@ -1142,7 +1153,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            reinterpret_cast<ElemType*>(c.BufferPointer()),  // target nz values
            c.RowLocation(),
            c.ColLocation(),
            D, S, M, K, T);
            D, S, M, K, T,
            c.GetNumNZElements());
        if (do_sync) CUDA_CALL(cudaEventRecord(done));
        if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
        if (do_sync) CUDA_CALL(cudaEventDestroy(done));

@ -1936,6 +1948,37 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        return GPUSparseMatrix<ElemType>::InnerProductOfMatrices(b,a);
    }

    template<class ElemType>
    bool GPUSparseMatrix<ElemType>::IsValid() const
    {
        if (m_format != MatrixFormat::matrixFormatSparseCSC)
            NOT_IMPLEMENTED;

        PrepareDevice();
        long *res = new long[3];
        res[0] = 1;
        res[1] = 0;
        res[2] = 0;
        long *d_res = nullptr;
        CUDA_CALL(cudaMalloc((void**)&d_res, sizeof(long) * 3));
        CUDA_CALL(cudaMemcpy(d_res, res, sizeof(long) * 3, cudaMemcpyHostToDevice));

        cudaEvent_t done = nullptr;
        if (do_sync) CUDA_CALL(cudaEventCreate(&done));
        int blocksPerGrid = (int)ceil((1.0*SecondaryIndexSize()) / GridDim::maxThreadsPerBlock);
        _isValid<ElemType><<<blocksPerGrid, GridDim::maxThreadsPerBlock>>>(MajorIndexLocation(), SecondaryIndexLocation(), GetNumRows(), GetNumCols(), GetNumElemAllocated(), d_res);
        if (do_sync) CUDA_CALL(cudaEventRecord(done));
        if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
        if (do_sync) CUDA_CALL(cudaEventDestroy(done));

        CUDA_CALL(cudaMemcpy(res, d_res, sizeof(long) * 3, cudaMemcpyDeviceToHost));

        if (res[0] == 1)
            return true;
        else
            return false;
    }
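    // [Editor's note -- usage sketch, not part of the original commit.] IsValid() is meant as
    // a debug-time structural check after operations that rewrite the CSC arrays, e.g.:
    //     GPUSparseMatrix<float> c(/*...device, CSC format...*/);
    //     // ... kernel that fills c's values and row/column indices ...
    //     assert(c.IsValid());
    // The error triple comes back through d_res: d_res[0] is 1 on success or a negative code,
    // with the offending values in d_res[1] and d_res[2].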

    template<class ElemType>
    bool GPUSparseMatrix<ElemType>::AreEqual(const GPUSparseMatrix<ElemType>& a, const GPUSparseMatrix<ElemType>& b,
        const ElemType threshold)

@ -73,18 +73,23 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    // Special Note: the matrix may be a read-only column slice view of another
    // matrix (only supported for CSC format today) and hence the NzValues needs
    // to be offset accordingly.
    inline const ElemType* NzValues() const { return m_format != matrixFormatSparseCSC ? m_pArray : m_pArray + SecondaryIndexValueAt(m_sliceViewOffset); }
    inline ElemType* NzValues() { return m_format != matrixFormatSparseCSC ? m_pArray : m_pArray + SecondaryIndexValueAt(m_sliceViewOffset); }
    inline const ElemType* NzValues() const { return m_format != matrixFormatSparseCSC ? m_pArray : m_pArray + SecondaryIndexValueAt(0); }
    inline ElemType* NzValues() { return m_format != matrixFormatSparseCSC ? m_pArray : m_pArray + SecondaryIndexValueAt(0); }
    inline size_t NzSize() const { return sizeof(ElemType)*m_nz; } // actual number of element bytes in use

    GPUSPARSE_INDEX_TYPE* MajorIndexLocation() const //row/col ids in CSC/CSR format, blockId2col/blockId2row in BlockCol/BlockRow format
    {
        return (GPUSPARSE_INDEX_TYPE*)(m_pArray + m_elemSizeAllocated);
    }
    }

    GPUSPARSE_INDEX_TYPE* MajorIndexLocationWithSliceViewOffset() const
    {
        return (MajorIndexLocation() + (m_format == matrixFormatSparseCSC ? SecondaryIndexValueAt(0) : 0));
    }

    size_t MajorIndexCount() const
    {
        return MajorIndexCount(m_numRows, m_numCols, m_nz, m_format);
        return MajorIndexCount(m_numRows, m_numCols, m_elemSizeAllocated, m_format);
    }
    size_t MajorIndexCount(const size_t numRows, const size_t numCols, const size_t numNZ, const MatrixFormat format) const
    {

@ -98,7 +103,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    size_t MajorIndexSize() const // actual number of major index bytes in use
    {
        return sizeof(GPUSPARSE_INDEX_TYPE)*MajorIndexCount();
    }
    }

    GPUSPARSE_INDEX_TYPE* SecondaryIndexLocation() const //compressed index, col/row in CSC/CSR format, col2blockId/row2blockId in BlockCol/BlockRow format
    {

@ -239,6 +244,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    void ConvertToSparseFormat(MatrixFormat newFormat);
    void ConvertToSparseFormat(MatrixFormat newFormat, GPUSparseMatrix<ElemType>& outMatrix) const;

    bool IsValid() const;

public:
    GPUSparseMatrix<ElemType>& ElementInverse ();
    GPUSparseMatrix<ElemType>& AssignElementInverseOf (const GPUSparseMatrix<ElemType>& a);

@ -290,7 +297,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        const bool transposeB, GPUSparseMatrix<ElemType>& c);
    static void ScaleAndAdd(const ElemType alpha, const GPUSparseMatrix<ElemType>& lhs, GPUMatrix<ElemType>& c);
    static void ConvolveAndWeightedAdd(ElemType alpha, const GPUMatrix<ElemType>& lhs, const bool transposeA, const GPUSparseMatrix<ElemType>& rhs,
        const bool transposeB, ElemType beta, GPUMatrix<ElemType>& c, int numChannels, size_t horizontalSubsample, bool padding, bool channelwise);
        const bool transposeB, ElemType beta, GPUMatrix<ElemType>& c, size_t numChannels, size_t horizontalSubsample, bool padding, bool channelwise);
    static void TensorShuffleScaleAndAdd(ElemType keepWeight, const GPUSparseMatrix<ElemType>& a, size_t D, size_t S, size_t M, size_t K, size_t T, ElemType scaleFactor, const GPUSparseMatrix<ElemType>& b, GPUSparseMatrix<ElemType>& c);

    void NormalGrad(GPUMatrix<ElemType>& c, const ElemType momentum);

@ -0,0 +1,693 @@
//
// <copyright file="GPUMatrix.cu" company="Microsoft">
//     Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//

#include "stdafx.h"
#include "Basics.h"
#include "BestGpu.h"

#ifndef CPUONLY

#include "GPUTensor.h"
#include "GPUMatrix.h"
#include "GPUMatrixCUDAKernels.cuh"
#include "CommonMatrix.h"
#define TENSOR_OPS_DECL __device__ __host__
#include "TensorOps.h"
#include <cuda.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"
#include <assert.h>

#ifndef let
#define let const auto
#endif

#pragma comment (lib, "cudart.lib")     // instruct linker to reference these libs
#pragma comment (lib, "cublas.lib")

#pragma warning (disable: 4267) // conversion from 'size_t' to 'unsigned int'; happens in CUDA <<<a,b>>> syntax if a and b are size_t
#pragma warning (disable: 4127) // conditional expression is constant; "if (sizeof(ElemType)==sizeof(float))" triggers this
#pragma warning (disable: 4702) // unreachable code; triggered for unknown reasons

extern bool do_sync;

#ifdef _WIN32
// thread local storage to access the current stream, initialize to default stream
__declspec (thread)
#endif
extern cudaStream_t t_stream;

namespace Microsoft { namespace MSR { namespace CNTK {

// =======================================================================
// TensorView support
// =======================================================================

// To save time, this makes extensive use of templates and macros.

// -----------------------------------------------------------------------
// simple fixed-size arrays for passing dimension information by value
// since CUDA can't just take our std::array and std::vector
// -----------------------------------------------------------------------

template<typename T, size_t N>
struct FixedArray
{
    T m_data[N];
    __device__ __host__ size_t size() const { return N; }
    __device__ __host__ T & operator[](size_t n)       { return m_data[n]; }
    __device__ __host__ T   operator[](size_t n) const { return m_data[n]; }
    template<class VEC> FixedArray(const VEC & data)    // construct from CPU-side STL array or vector
    {
        assert(data.size() == N);
        for (size_t n = 0; n < N; n++)
        {
            m_data[n] = (T)data[n];
            if (m_data[n] != data[n])   // overflow check
                InvalidArgument("FixedArray: Dimensions out of range, too few bits.");
        }
    }
};
template<typename T>    // specialized version for 0 elements
struct FixedArray<T, 0>
{
    __device__ __host__ size_t size() const { return 0; }
    template<class VEC> FixedArray(const VEC & data) { assert(data.size() == 0); UNUSED(data); }
    FixedArray() { }
};

template<typename T, size_t N, size_t K>    // N = which input/output; K = index depth
struct FixedMatrix
{
    T m_data[N][K];
    __device__ __host__ size_t getNumRows() const { return N; }
    __device__ __host__ size_t getNumCols() const { return K; }
    __device__ __host__ T & operator()(size_t n, size_t k)       { return m_data[n][k]; }
    __device__ __host__ T   operator()(size_t n, size_t k) const { return m_data[n][k]; }
    template<typename U> FixedMatrix(const array<SmallVector<U>, N> & data)    // construct from CPU-side array of vectors
    {
        assert(data.size() == N);
        for (size_t n = 0; n < N; n++)
        {
            assert(data[n].size() == K);
            for (size_t k = 0; k < K; k++)
            {
                m_data[n][k] = (T)data[n][k];
                if (m_data[n][k] != data[n][k])   // overflow check
                    InvalidArgument("FixedArray: Dimensions out of range, too few bits.");
            }
        }
    }
};
template<typename T, size_t N>    // specialized version for 0 elements
struct FixedMatrix<T, N, 0>
{
    __device__ __host__ size_t getNumRows() const { return N; }
    __device__ __host__ size_t getNumCols() const { return 0; }
    template<typename U> FixedMatrix(const array<SmallVector<U>, N> & data) { assert(data.size() == N); for (size_t n = 0; n < N; n++) assert(data[n].size() == 0); UNUSED(data); }
    FixedMatrix() { }
};
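// [Editor's note -- usage sketch, not part of the original commit.] These types exist so that
// shape/stride data can be passed to kernels by value in the kernel-argument buffer, e.g.:
//     std::array<size_t, 2> dims = { 13, 42 };
//     FixedArray<C_unsigned_int, 2> fdims(dims);   // copies and range-checks on the host
//     someKernel<<<grid, block>>>(fdims);          // no separate cudaMemcpy of a descriptor
// The 0-element specializations are needed because a zero-length array member would be
// ill-formed C++.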

// -----------------------------------------------------------------------
// function to actually compute a function of (N-1) inputs based on the opcode
// -----------------------------------------------------------------------

template<class ElemType>
struct TensorOps
{
    static __device__ ElemType Compute(const FixedArray<ElemType*, 1> & pointers, ElementWiseOperator op)
    {
#define CaseNullaryTensorOp(oper) case ElementWiseOperator::op ## oper: return Op ## oper<ElemType>()
        switch (op)
        {
        ForAllNullaryOps(CaseNullaryTensorOp);
        default: return OpConstOne<ElemType>();   // (failure--we only have one nullary op, so use the same, maybe it will eliminate the switch altogether)
        }
    }
    static __device__ ElemType Compute(const FixedArray<ElemType*, 2> & pointers, ElementWiseOperator op)
    {
        ElemType a = *(pointers[0]);
#define CaseUnaryTensorOp(oper) case ElementWiseOperator::op ## oper: return Op ## oper(a)
        switch (op)
        {
        ForAllUnaryOps(CaseUnaryTensorOp);
        default: return 0;  // (failure)
        }
    }
    static __device__ ElemType Compute(const FixedArray<ElemType*, 3> & pointers, ElementWiseOperator op)
    {
        //const ElemType & a = *(pointers[0]);    // const & for opIndex--costs quite some code bloat
        ElemType a = *(pointers[0]);
        ElemType b = *(pointers[1]);
#define CaseBinaryTensorOp(oper) case ElementWiseOperator::op ## oper: return Op ## oper(a,b)
        switch (op)
        {
        ForAllBinaryOps(CaseBinaryTensorOp);   // note: this costs about 6% compared to having only a single case
        default: return 0;  // (failure)
        }
    }
    static __device__ ElemType Compute(const FixedArray<ElemType*, 4> & pointers, ElementWiseOperator op)
    {
        ElemType a = *(pointers[0]);
        ElemType b = *(pointers[1]);
        ElemType c = *(pointers[2]);
#define CaseTernaryTensorOp(oper) case ElementWiseOperator::op ## oper: return Op ## oper(a,b,c)
        switch (op)
        {
        ForAllTernaryOps(CaseTernaryTensorOp);
        default: return 0;  // (failure)
        }
    }
};

// -----------------------------------------------------------------------
// function to compute the value for a given output location (this version performs reduction if needed)
// -----------------------------------------------------------------------

//#define ReduceElemType double
#define ReduceElemType ElemType

template<class ElemType, C_size_t N, C_int M, C_int m>
struct TensorOpReduce
{
    // this version for m >= 0
    static __device__ ElemType Compute(FixedArray<ElemType*, N> pointers, ElementWiseOperator op,
                                       const FixedArray<C_unsigned_int, M> & reducingOpDims, const FixedMatrix<C_int, N, M> & reducingStrides)
    {
        // start with index 0
        // We may use 'double' since we are memory-bound anyway.
        ReduceElemType aggregate = TensorOpReduce<ElemType, N, M, m - 1>::Compute(pointers, op, reducingOpDims, reducingStrides);
        // apply this index to the pointers
        C_size_t dim = reducingOpDims[m];
        for (C_size_t k = 1/*done with k=0 already*/; k < dim; k++)
        {
            // bump the pointers
            for (C_size_t i = 0; i < N - 1; i++)    // N-1 because output is not used here
                pointers[i] += reducingStrides(i,(C_size_t)m);
            ElemType val = TensorOpReduce<ElemType, N, M, m - 1>::Compute(pointers, op, reducingOpDims, reducingStrides);
            aggregate += val;
        }
        return (ElemType)aggregate;
    }
};

// this one terminates the template recursion over reduction dimensions
// The pointers are pointing to the input element.
template<class ElemType, C_size_t N, C_int M>
struct TensorOpReduce<ElemType, N, M, /*m=*/-1>
{
    // this version for m = -1
    // the pointers are pointing to the right location(s) to take the operation over
    static __device__ ElemType Compute(FixedArray<ElemType*, N> pointers, ElementWiseOperator op,
                                       const FixedArray<C_unsigned_int, M> & /*reducingOpDims*/, const FixedMatrix<C_int, N, M> & /*reducingStrides*/)
    {
        return TensorOps<ElemType>::Compute(pointers, op);  // finally computing something!
    }
};
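// [Editor's note -- expansion sketch, not part of the original commit.] For M = 2 the
// recursion above unrolls at compile time into two nested loops; conceptually:
//     ReduceElemType aggregate = <element at reduction index (0, 0)>;
//     for (k1 = 0; k1 < reducingOpDims[1]; k1++)
//         for (k0 = 0; k0 < reducingOpDims[0]; k0++)
//             aggregate += op(inputs at pointers bumped by k0/k1 times reducingStrides);
// with the m = -1 specialization supplying the innermost per-element computation.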
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// function to compute one constituent of the value for a given output location (this version has reduction done outside)
|
||||
// -----------------------------------------------------------------------
|
||||
|
||||
template<class ElemType, C_size_t N, C_int M, C_int m>
|
||||
struct TensorOpParallelReduce
|
||||
{
|
||||
// this version for m >= 0
|
||||
static __device__ ElemType Compute(CUDA_LONG id, FixedArray<ElemType*, N> pointers, ElementWiseOperator op,
|
||||
const FixedArray<C_unsigned_int, M> & reducingOpDims, const FixedMatrix<C_int, N, M> & reducingStrides)
|
||||
{
|
||||
// map id (location on grid) to index[k]
|
||||
C_size_t stride = 1; // compute the stride. This seems expensive, but since we we only currently support M <= 2, this is just compile-time selection between 1 and reducingOpDims[0].
|
||||
for (int i = 0; i < m; i++)
|
||||
stride *= reducingOpDims[(C_size_t)i];
|
||||
C_size_t index = id / stride; // this dimension. For m=0, the stride is 1 and hence the division will be removed at compile time.
|
||||
id = id % stride; // remaining dimensions inside this. For m=0 this value is ignored and hence not even computed.
|
||||
// apply this index to the pointers
|
||||
for (C_size_t i = 0; i < N - 1; i++)
|
||||
pointers[i] += index * reducingStrides(i, (C_size_t)m); // now this dimension is taken care of
|
||||
return TensorOpParallelReduce<ElemType, N, M, m - 1>::Compute(id, pointers, op, reducingOpDims, reducingStrides);
|
||||
}
|
||||
};
|
||||
|
||||
// this one terminates the template recursion over reduction dimensions
|
||||
// The pointers are pointing to the input element.
|
||||
template<class ElemType, C_size_t N, C_int M>
|
||||
struct TensorOpParallelReduce<ElemType, N, M, /*m=*/-1>
|
||||
{
|
||||
// this version for m = -1
|
||||
// the pointers are pointing to the right location(s) to take the operation over
|
||||
static __device__ ElemType Compute(CUDA_LONG /*id*/, FixedArray<ElemType*, N> pointers, ElementWiseOperator op,
|
||||
const FixedArray<C_unsigned_int, M> & /*reducingOpDims*/, const FixedMatrix<C_int, N, M> & /*reducingStrides*/)
|
||||
{
|
||||
return TensorOps<ElemType>::Compute(pointers, op); // finally computing something!
|
||||
}
|
||||
};
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// perform loop over regular index k for N-nary operations (N counting the output)
|
||||
// -----------------------------------------------------------------------
|
||||
|
||||
// The canonical case, vector op without reduction, is this PTX function:
|
||||
// _ZN9Microsoft3MSR4CNTK15_launchTensorOpIfLi3ELi0ELi1EEEvT_NS1_10FixedArrayIPS3_XT0_EEES3_NS1_19ElementWiseOperatorENS4_IiXT2_EEENS1_11FixedMatrixIiXT0_EXT2_EEENS4_IiXT1_EEENS9_IiXT0_EXT1_EEEi
|
||||
// float ^ ^ aggregate loop
|
||||
// args? ^ ^ input dims
|
||||
// _ZN9Microsoft3MSR4CNTK15_launchTensorOpIfLi2ELi0ELi1EEEvT_NS1_10FixedArrayIPS3_XT0_EEES3_NS1_19ElementWiseOperatorENS4_IiXT2_EEENS1_11FixedMatrixIiXT0_EXT2_EEENS4_IiXT1_EEENS9_IiXT0_EXT1_EEEi
|
||||
|
||||
// The 'pointers' only refer to a single element, so we will bump them in-place to perform indexing.
|
||||
template<class ElemType, C_size_t N, C_int M, C_int K, bool parallelReduce, C_int k>
|
||||
struct TensorOpElement
|
||||
{
|
||||
// template-recursive version loops over indices
|
||||
static __device__ void Compute(CUDA_LONG id, ElemType beta, FixedArray<ElemType*, N> & pointers, ElemType alpha, ElementWiseOperator op,
|
||||
const FixedArray<C_unsigned_int, K> & regularOpStrides, const FixedMatrix<C_int, N, K> & regularStrides,
|
||||
const FixedArray<C_unsigned_int, M> & reducingOpDims, const FixedMatrix<C_int, N, M> & reducingStrides,
|
||||
CUDA_LONG reductionBegin, CUDA_LONG reductionChunkSize)
|
||||
{
|
||||
// map id (location on grid) to index[k]
|
||||
C_size_t stride = regularOpStrides[(C_size_t)k];
|
||||
C_size_t index = id / stride; // this dimension
|
||||
id = id % stride; // remaining dimensions inside this
|
||||
// apply this index to the pointers
|
||||
for (C_size_t i = 0; i < N; i++)
|
||||
pointers[i] += index * regularStrides(i,(C_size_t)k); // now this dimension is taken care of
|
||||
// process the previous index
|
||||
TensorOpElement<ElemType, N, M, K, parallelReduce, k - 1>::Compute(id, beta, pointers, alpha, op, regularOpStrides, regularStrides, reducingOpDims, reducingStrides, reductionBegin, reductionChunkSize);
|
||||
}
|
||||
};
|
||||
|
||||
// specialization for k=0 where op stride is guaranteed to be 1
|
||||
template<class ElemType, C_size_t N, C_int M, C_int K, bool parallelReduce>
|
||||
struct TensorOpElement<ElemType, N, M, K, parallelReduce, /*k=*/0>
|
||||
{
|
||||
// template-recursive version loops over indices
|
||||
static __device__ void Compute(CUDA_LONG id, ElemType beta, FixedArray<ElemType*, N> & pointers, ElemType alpha, ElementWiseOperator op,
|
||||
const FixedArray<C_unsigned_int, K> & regularOpStrides, const FixedMatrix<C_int, N, K> & regularStrides,
|
||||
const FixedArray<C_unsigned_int, M> & reducingOpDims, const FixedMatrix<C_int, N, M> & reducingStrides,
|
||||
CUDA_LONG reductionBegin, CUDA_LONG reductionChunkSize)
|
||||
{
|
||||
// map id (location on grid) to index[k]
|
||||
C_size_t index = id; // this dimension
|
||||
// apply this index to the pointers
|
||||
for (C_size_t i = 0; i < N; i++)
|
||||
pointers[i] += index * regularStrides(i,0); // now this dimension is taken care of
|
||||
// process the previous index
|
||||
TensorOpElement<ElemType, N, M, K, parallelReduce, -1>::Compute(/*id*/0, beta, pointers, alpha, op, regularOpStrides, regularStrides, reducingOpDims, reducingStrides, reductionBegin, reductionChunkSize);
|
||||
}
|
||||
};

//// apply beta and alpha and save
//template<class ElemType, class PointersType>
//static __device__ void SetFinalValue(ElemType val, ElemType beta, const PointersType & pointers, ElemType alpha)
//{
//    // scale
//    val *= alpha;
//    // combine with previous value in target matrix, then write it out
//    auto * pout = pointers[pointers.size() - 1];
//    if (beta != 0)
//        val += beta * *pout;
//    // save
//    *pout = val;
//}

// specialization for k = -1 terminates the template recursion, and computes reductions in a for loop
template<class ElemType, C_size_t N, C_int M, C_int K>
struct TensorOpElement<ElemType, N, M, K, /*parallelReduce=*/false, /*k=*/-1>
{
    // template-recursion-terminating version computes the actual value for this output location
    // now the output pointers point to the right element (input pointers may still iterate for reduction)
    static __device__ void Compute(CUDA_LONG /*id*/, ElemType beta, FixedArray<ElemType*, N> & pointers, ElemType alpha, ElementWiseOperator op,
                                   const FixedArray<C_unsigned_int, K> & /*regularOpStrides*/, const FixedMatrix<C_int, N, K> & /*regularStrides*/,
                                   const FixedArray<C_unsigned_int, M> & reducingOpDims, const FixedMatrix<C_int, N, M> & reducingStrides, CUDA_LONG /*reductionBegin*/, CUDA_LONG /*reductionChunkSize*/)
    {
        // compute the operation for this output coordinate
        // This may still involve a reduction over inverse-broadcasting dimensions.
        ElemType val = TensorOpReduce<ElemType, N, M, M - 1>::Compute(pointers, op, reducingOpDims, reducingStrides);
        // scale
        val *= alpha;
        // combine with previous value in target matrix, then write it out
        auto * pout = pointers[pointers.size() - 1];
        if (beta != 0)
            val += beta * *pout;
        // save
        *pout = val;
    }
};

// specialization for k = -1 terminates the template recursion, and computes reductions in parallel
template<class ElemType, C_size_t N, C_int M, C_int K>
struct TensorOpElement<ElemType, N, M, K, /*parallelReduce=*/true, /*k=*/-1>
{
    // template-recursion-terminating version computes the actual value for this output location
    // now the output pointers point to the right element (input pointers may still iterate for reduction)
    static __device__ void Compute(CUDA_LONG /*id*/, ElemType beta, FixedArray<ElemType*, N> & pointers, ElemType alpha, ElementWiseOperator op,
                                   const FixedArray<C_unsigned_int, K> & /*regularOpStrides*/, const FixedMatrix<C_int, N, K> & /*regularStrides*/,
                                   const FixedArray<C_unsigned_int, M> & reducingOpDims, const FixedMatrix<C_int, N, M> & reducingStrides, CUDA_LONG reductionBegin, CUDA_LONG reductionChunkSize)
    {
        CUDA_LONG reductionBlock = blockIdx.z;  // block index --larger reductions are split into blocks
        CUDA_LONG reductionBlocks = gridDim.z;  // number of blocks
        CUDA_LONG tid = threadIdx.x;            // thread index
        CUDA_LONG tids = blockDim.x;            // out of how many threads --note: last block is partial

        // determine our range --this is a single int mul, we can stomach it (we could alternatively pass in yet another parameter)
        CUDA_LONG reductionDim = (CUDA_LONG)reducingOpDims[0];
        for (C_size_t i = 1; i < reducingOpDims.size(); i++)
            reductionDim *= reducingOpDims[i];

        // determine the redId range that we operate on
        // Each thread takes a stride tid + (multiples of tids) within this range.
        reductionBegin += reductionChunkSize * reductionBlock;
        CUDA_LONG reductionEnd = min(reductionBegin + reductionChunkSize, reductionDim);

        // compute the operation for this input coordinate
        ReduceElemType sum = 0;
        for (CUDA_LONG redId = reductionBegin + tid; redId < reductionEnd; redId += tids)
        {
            auto val = TensorOpParallelReduce<ElemType, N, M, M - 1>::Compute(redId, pointers, op, reducingOpDims, reducingStrides);
            sum += val;
        }

        // reduce --cf https://docs.nvidia.com/cuda/samples/6_Advanced/reduction/doc/reduction.pdf
        __shared__ ReduceElemType accumulators[GridDim::maxThreadsPerBlock/*tids*/];
        accumulators[tid] = sum;
        __syncthreads();
        static_assert(GridDim::maxThreadsPerBlock <= 512, "GridDim::maxThreadsPerBlock too large, need to add manually unrolled steps");
        for (CUDA_LONG i = 256; i; i >>= 1)
        {
            if (tid < i && tid + i < tids) accumulators[tid] += accumulators[tid + i];
            if (0 + i < tids) __syncthreads();  // sync if the condition is true for at least one thread
            // TODO: use volatile* and then we can skip the __syncthreads() for the last 32 values
        }

        // now set the final value at the output coordinate
        if (tid == 0)
        {
            ElemType val = (ElemType)accumulators[0];
            // scale
            val *= alpha;
            // combine with previous value in target matrix, then write it out
            auto * pout = pointers[pointers.size() - 1];
            if (reductionBlocks > 1)    // multiple blocks: need to use atomicAdd()
            {
                // in this case, the outer calling code must pass beta = 1
                val = atomicAdd(pout, val);
            }
            else
            {
                if (beta != 0)
                    val += beta * *pout;
                // save
                *pout = val;
            }
        }
    }
};
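
// Illustration (not part of the source): the loop above is a standard shared-memory
// tree reduction. A plain C++ model of the same halving schedule, assuming tids = 6
// active threads (so the last block is partial, as the comment above notes):
//
//     float acc[6] = { 1, 2, 3, 4, 5, 6 };
//     int tids = 6;
//     for (int i = 256; i; i >>= 1)            // 256 suffices for up to 512 threads
//         for (int tid = 0; tid < tids; tid++) // on the GPU, all tids run in parallel
//             if (tid < i && tid + i < tids)
//                 acc[tid] += acc[tid + i];
//     // acc[0] == 21, the sum of all six partial sums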

// -----------------------------------------------------------------------
// kernel and launch --no reduction
// -----------------------------------------------------------------------

// launch tensor op with CUDA
template<class ElemType, C_size_t N, C_int M, C_int K>
__global__ void _launchTensorOp(ElemType beta, FixedArray<ElemType*, N> pointers, ElemType alpha, ElementWiseOperator op,
                                FixedArray<C_unsigned_int, K> regularOpStrides, FixedMatrix<C_int, N, K> regularStrides, CUDA_LONG numElements,
                                FixedArray<C_unsigned_int, M> reducingOpDims, FixedMatrix<C_int, N, M> reducingStrides)
{
    CUDA_LONG id = GridDim::GetLinearThreadId();
    if (id < numElements)   // note: there are no __syncthreads() calls inside
        TensorOpElement<ElemType, N, M, K, false, K - 1>::Compute(id, beta, pointers, alpha, op, regularOpStrides, regularStrides, reducingOpDims, reducingStrides, 0, 0);
}

template<class ElemType, C_size_t N, C_int K>
static void LaunchTensorOp(ElemType beta, array<ElemType*, N> pointerVector, ElemType alpha, ElementWiseOperator op,
                           const SmallVector<size_t> & regularOpDims, const array<SmallVector<ptrdiff_t>, N> & regularStrideVectors)
{
    // copy all parameters to CUDA-compatible data structures
    FixedArray<ElemType*, N> pointers(pointerVector);
    SmallVector<C_size_t> regularOpStrideVector;    // kernel needs the strides for converting a thread index back to a multi-dimensional tensor index
    C_size_t numElements = 1;
    for (C_size_t k = 0; k < regularOpDims.size(); k++)
    {
        regularOpStrideVector.push_back(numElements);
        numElements *= (C_size_t)regularOpDims[k];
    }
    FixedArray<C_unsigned_int, K> regularOpStrides(regularOpStrideVector);
    FixedMatrix<C_int, N, K> regularStrides(regularStrideVectors);
    FixedArray<C_unsigned_int, /*M=*/0> reducingOpDims;     // empty reduction dimensions
    FixedMatrix<C_int, N, /*M=*/0> reducingStrides;

    // launch the kernel
    CUDA_LONG NN = (CUDA_LONG)numElements;  // linear space identifying each individual output element
    cudaEvent_t done = nullptr;
    if (do_sync) CUDA_CALL(cudaEventCreate(&done));
    GridDim grid(NN);
    _launchTensorOp<ElemType, N, /*M=*/0, K> <<<grid.m_blocksPerGrid, grid.m_threadsPerBlock, 0, t_stream>>>(beta, pointers, alpha, op, regularOpStrides, regularStrides, grid.m_N, reducingOpDims, reducingStrides);
    if (do_sync) CUDA_CALL(cudaEventRecord(done));
    if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
    if (do_sync) CUDA_CALL(cudaEventDestroy(done));
}
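
// Illustration (not part of the source): the stride vector built above is the cumulative
// product of the op dims. E.g. regularOpDims = (4, 3, 2) yields
// regularOpStrideVector = (1, 4, 12) and numElements = 24 -- exactly the strides that
// TensorOpElement::Compute() divides by when mapping a thread id back to a tensor index.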

// -----------------------------------------------------------------------
// kernel and launch --with reduction
// -----------------------------------------------------------------------

template<class ElemType, C_size_t N, C_int M, C_int K>
__global__ void _launchTensorOpWithReduction(ElemType beta, FixedArray<ElemType*, N> pointers, ElemType alpha, ElementWiseOperator op,
                                             FixedArray<C_unsigned_int, K> regularOpStrides, FixedMatrix<C_int, N, K> regularStrides, CUDA_LONG numElements,
                                             FixedArray<C_unsigned_int, M> reducingOpDims, FixedMatrix<C_int, N, M> reducingStrides, CUDA_LONG reductionBegin, CUDA_LONG reductionChunkSize)
{
    CUDA_LONG id = gridDim.x * blockIdx.y + blockIdx.x;     // the regular (output) dimensions are mapped to the grid's X/Y block indices here, so the thread dimension remains free for the shared-memory parallel reduction
    if (id < numElements)   // note: we have __syncthreads() calls, but only entire blocks are in sync, so this is OK
        TensorOpElement<ElemType, N, M, K, true, K - 1>::Compute(id, beta, pointers, alpha, op, regularOpStrides, regularStrides, reducingOpDims, reducingStrides, reductionBegin, reductionChunkSize);
}

// All dimensions (N-ariness, number of input dimensions K, and number of reduction dimensions M) are bound to template parameters now.
template<class ElemType, C_size_t N, C_int M, C_int K>
static void LaunchTensorOpWithReduction(ElemType beta, array<ElemType*, N> pointerVector, ElemType alpha, ElementWiseOperator op,
                                        const SmallVector<size_t> & regularOpDims, const array<SmallVector<ptrdiff_t>, N> & regularStrideVectors,
                                        const SmallVector<size_t> & reducingOpDimVector, const array<SmallVector<ptrdiff_t>, N> & reducingStrideVectors)
{
    // copy all parameters to CUDA-compatible data structures
    FixedArray<ElemType*, N> pointers(pointerVector);
    SmallVector<C_size_t> regularOpStrideVector;    // kernel needs the strides for converting a thread index back to a multi-dimensional tensor index
    C_size_t numElements = 1;
    for (C_size_t k = 0; k < regularOpDims.size(); k++)
    {
        regularOpStrideVector.push_back(numElements);
        numElements *= (C_size_t)regularOpDims[k];
    }
    FixedArray<C_unsigned_int, K> regularOpStrides(regularOpStrideVector);
    FixedMatrix<C_int, N, K> regularStrides(regularStrideVectors);
    FixedArray<C_unsigned_int, M> reducingOpDims(reducingOpDimVector);
    FixedMatrix<C_int, N, M> reducingStrides(reducingStrideVectors);

    // launch the kernel
    CUDA_LONG NN = (CUDA_LONG)numElements;  // linear space identifying each individual output element
    cudaEvent_t done = nullptr;
    if (do_sync) CUDA_CALL(cudaEventCreate(&done));

    // do some optimization for reductions
    // Cases:
    //  - #output elements >= #GPU procs --> use one proc per element, do the reduction in an inner loop
    //  - reduction dimension fits into a single kernel --> launch it that way
    //  - reduction dimension requires multiple kernels --> use atomicAdd(), to avoid a temp memory allocation
    //     - PlusNode: reducing to a bias for small matrices
    //     - ScaleNode: big elementwise product reduced to a scalar (dot product)
    //     - E.g. 3072 GPU procs:
    //       If >= 3072 reduced output values must be computed, just loop inside.
    //       If fewer, and the reduction per value does not fit into a single proc,
    //       then we break it into chunks, say, 24.
    //       This way we will need 24 atomicAdd()s of 3072/24 = 128 values.
    //       If the reduction is along stride=1, then we'd have 24 atomicAdd()s of 32 coalesced writes.
    //       Does not sound scary at all.
    // Precondition: a matrix cannot at the same time participate in the reduction and in the regular operation.
    C_size_t reductionDim = 1;  // number of elements to reduce over
    for (C_size_t k = 0; k < reducingOpDimVector.size(); k++)
        reductionDim *= (C_size_t)reducingOpDimVector[k];
    let & props = GridDim::GetDeviceProps();
    GridDim grid(NN);
    if (reductionDim > 1 && grid.m_blocksPerGrid < props.multiProcessorCount)
    {
        // we are reducing and are underutilizing the multiprocs we have: get more parallelism by doing the reduction in parallel
        // Change of strategy: all NN output elements get their own block, and the reduction gets split over blocks as well.

        // By how much do we underutilize?
        // We increase #blocks by that factor, by breaking the reduction into that many chunks.
        let numReductionChunks = CeilDiv(props.multiProcessorCount, NN);

        // NN may be too large for a single grid dimension
        let blockXOverBy = CeilDiv(NN, props.maxGridSize[0]);
        let numBlocksX = CeilDiv(NN, blockXOverBy);
        let numBlocksY = CeilDiv(NN, numBlocksX);
        let numBlocksZ = numReductionChunks;
        // Grid dims are now:
        //  - X, Y: such that X*Y covers NN
        //  - Z: reduction chunks

        // the reduction goes into thread dim X
        let reductionChunkSize = CeilDiv(reductionDim, numReductionChunks);
        let numThreadsX = min(reductionChunkSize, GridDim::maxThreadsPerBlock);     // any overhang is handled by looping inside the kernel

        if (beta == 1 || numBlocksZ == 1)
        {
            _launchTensorOpWithReduction<ElemType, N, M, K> <<<dim3(numBlocksX, numBlocksY, numBlocksZ), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream>>>(/*beta=*/1, pointers, alpha, op, regularOpStrides, regularStrides, NN, reducingOpDims, reducingStrides, 0, reductionChunkSize);
        }
        else
        {
            // We need more than one chunk, so we will use atomicAdd().
            // First reset/pre-multiply the output with the first chunk; then do the remaining chunks using atomicAdd().
            _launchTensorOpWithReduction<ElemType, N, M, K> <<<dim3(numBlocksX, numBlocksY, 1), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream>>>(beta, pointers, alpha, op, regularOpStrides, regularStrides, NN, reducingOpDims, reducingStrides, 0, reductionChunkSize);
            _launchTensorOpWithReduction<ElemType, N, M, K> <<<dim3(numBlocksX, numBlocksY, numBlocksZ - 1), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream>>>(/*beta=*/1, pointers, alpha, op, regularOpStrides, regularStrides, NN, reducingOpDims, reducingStrides, reductionChunkSize, reductionChunkSize);
        }
    }
    else
    {
        // we have enough output elements to fill the device: use one thread per output element, with the reduction done inside each thread
        _launchTensorOp<ElemType, N, M, K> <<<grid.m_blocksPerGrid, grid.m_threadsPerBlock, 0, t_stream>>>(beta, pointers, alpha, op, regularOpStrides, regularStrides, grid.m_N, reducingOpDims, reducingStrides);
    }
    if (do_sync) CUDA_CALL(cudaEventRecord(done));
    if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
    if (do_sync) CUDA_CALL(cudaEventDestroy(done));
}
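
// Illustration (not part of the source): the chunking arithmetic above, with
// hypothetical numbers. Say NN = 10 output elements, 24 multiprocessors, and
// reductionDim = 3000 elements to reduce per output:
//
//     numReductionChunks = CeilDiv(24, 10)  = 3      // split each reduction 3-ways
//     reductionChunkSize = CeilDiv(3000, 3) = 1000   // elements per chunk
//     grid               = (10 x 1) blocks in X/Y, 3 blocks in Z
//     numThreadsX        = min(1000, GridDim::maxThreadsPerBlock)
//
// With beta != 1 and 3 chunks, the first launch (Z = 1) applies beta, and the
// second launch (Z = 2) accumulates the remaining two chunks via atomicAdd().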

// -----------------------------------------------------------------------
// kernel and launch --linear unary
// -----------------------------------------------------------------------

// for linear unary ops, we need to define a functor for every function for use as a template parameter (lambda syntax doesn't work in CUDA 7)
#define DefineUnaryTensorFunctor(oper) \
    struct Functor ## oper { template<class ElemType> static __device__ ElemType f(ElemType a) { return Op ## oper(a); } };
ForAllUnaryOps(DefineUnaryTensorFunctor);
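
// Illustration (not part of the source): for the Exp entry of ForAllUnaryOps,
// DefineUnaryTensorFunctor expands to
//     struct FunctorExp { template<class ElemType> static __device__ ElemType f(ElemType a) { return OpExp(a); } };
// which can then be passed as the FN template argument of _launchUnaryTensorOp below.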

// the top-level kernel for linear unary ops
// Note: If we have a beta, we have 2 memory accesses, so this optimization may no longer be needed, as we are memory-bound.
template<class ElemType, class FN>
__global__ void _launchUnaryTensorOp(ElemType beta, const ElemType * pa, ElemType * pb, ElemType alpha, CUDA_LONG numElements)
{
    CUDA_LONG id = GridDim::GetLinearThreadId();
    if (id >= numElements)
        return;
    ElemType a = pa[id];
    ElemType val = FN::f(a);
    val *= alpha;
    if (beta != 0)
        val += beta * pb[id];
    pb[id] = val;
}
// version without beta and alpha
template<class ElemType, class FN>
__global__ void _launchUnaryTensorOp(const ElemType * pa, ElemType * pb, CUDA_LONG numElements)
{
    CUDA_LONG id = GridDim::GetLinearThreadId();
    if (id >= numElements)
        return;
    ElemType a = pa[id];
    ElemType val = FN::f(a);
    pb[id] = val;
}

// special case of a linear unary operation
template<class ElemType>
void LaunchUnaryTensorOp(ElemType beta, const ElemType * pa, ElemType * pb, ElemType alpha, ElementWiseOperator op, size_t regularOpDim)
{
    CUDA_LONG NN = (CUDA_LONG)regularOpDim;

#define CaseLaunchUnaryTensorOp(oper) case ElementWiseOperator::op ## oper: \
    if (beta == 0 && alpha == 1) \
        return _launchUnaryTensorOp<ElemType, Functor ## oper> <<<grid.m_blocksPerGrid, grid.m_threadsPerBlock, 0, t_stream>>>(pa, pb, NN); \
    else \
        return _launchUnaryTensorOp<ElemType, Functor ## oper> <<<grid.m_blocksPerGrid, grid.m_threadsPerBlock, 0, t_stream>>>(beta, pa, pb, alpha, NN);

    cudaEvent_t done = nullptr;
    if (do_sync) CUDA_CALL(cudaEventCreate(&done));
    GridDim grid(NN);
    switch (op)
    {
    ForAllUnaryOps(CaseLaunchUnaryTensorOp);
    default: LogicError("LaunchUnaryTensorOp: Unknown op code %d.", (int)op);
    }
    if (do_sync) CUDA_CALL(cudaEventRecord(done));
    if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
    if (do_sync) CUDA_CALL(cudaEventDestroy(done));
}
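
// Illustration (not part of the source): applying an elementwise exp to n values
// through this dispatcher -- pa/pb are hypothetical device pointers, and opExp is
// assumed to be the enum value generated for the Exp entry of ForAllUnaryOps:
//
//     LaunchUnaryTensorOp<float>(/*beta=*/0, pa, pb, /*alpha=*/1, ElementWiseOperator::opExp, n);
//
// Since beta == 0 and alpha == 1, the switch selects the leaner kernel overload
// without the beta/alpha arithmetic.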

// -----------------------------------------------------------------------
// map runtime parameters to template parameters
// -----------------------------------------------------------------------

// tensor operation with k+1 dimensions (-1 means scalar)
template<class ElemType, C_size_t N, C_int K>
static void TensorOpWithRegularLoop(ElemType beta, const array<ElemType*, N> & pointers, ElemType alpha, ElementWiseOperator op,
                                    const SmallVector<size_t> & regularOpDims, const array<SmallVector<ptrdiff_t>, N> & regularStrides,
                                    const SmallVector<size_t> & reducingOpDims, const array<SmallVector<ptrdiff_t>, N> & reducingStrides)
{
    size_t dims = reducingOpDims.size();
    switch (dims)
    {
    case 2: return LaunchTensorOpWithReduction<ElemType, N, 2, K>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
    case 1: return LaunchTensorOpWithReduction<ElemType, N, 1, K>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
    case 0: return LaunchTensorOp<ElemType, N, K>(beta, pointers, alpha, op, regularOpDims, regularStrides);
    default: LogicError("TensorOp: %d non-flattened reduction dimensions are not supported.", (C_int)dims);
    }
}

// tensor operation, generalized in the number of arguments
// This function dispatches on the number of regular dimensions K. It also eliminates the offsets by adding them to the pointers.
template<class ElemType, C_size_t N>
void TensorOpN(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, ElementWiseOperator op,
               const array<size_t, N> & offsets,
               const SmallVector<size_t> & regularOpDims, const array<SmallVector<ptrdiff_t>, N> & regularStrides,
               const SmallVector<size_t> & reducingOpDims, const array<SmallVector<ptrdiff_t>, N> & reducingStrides)
{
    for (C_size_t i = 0; i < N; i++)    // N = a small constant, so this will be unrolled
        pointers[i] += offsets[i];
    size_t dims = regularOpDims.size();
    switch (dims)
    {
    case 4: return TensorOpWithRegularLoop<ElemType, N, 4>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
    case 3: return TensorOpWithRegularLoop<ElemType, N, 3>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
    case 2: return TensorOpWithRegularLoop<ElemType, N, 2>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
    case 1: return TensorOpWithRegularLoop<ElemType, N, 1>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
    case 0: return TensorOpWithRegularLoop<ElemType, N, 0>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
    default: LogicError("TensorOp: %d non-flattened input dimensions are not supported.", (C_int)dims);
    }
}
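
// Illustration (not part of the source): how a caller might reach this entry point.
// For an elementwise c = a .* b over two 4 x 3 column-major matrices (no broadcasting,
// no reduction), TensorOpN would be invoked roughly like this -- pa/pb/pc are
// hypothetical device pointers, everything else as defined in this file:
//
//     std::array<float*, 3> ptrs = { pa, pb, pc };            // inputs first, output last
//     std::array<size_t, 3> offsets = { 0, 0, 0 };            // no slice offsets
//     SmallVector<size_t> regularOpDims = { 4, 3 };           // flattened op shape
//     std::array<SmallVector<ptrdiff_t>, 3> regularStrides =  // column-major strides
//         { SmallVector<ptrdiff_t>{ 1, 4 }, SmallVector<ptrdiff_t>{ 1, 4 }, SmallVector<ptrdiff_t>{ 1, 4 } };
//     SmallVector<size_t> reducingOpDims;                     // empty: no reduction
//     std::array<SmallVector<ptrdiff_t>, 3> reducingStrides;
//     TensorOpN<float, 3>(/*beta=*/0, ptrs, /*alpha=*/1, ElementWiseOperator::opElementwiseProduct,
//                         offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);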

//------------------------------------------------------------------------
// explicit instantiations--these are being called from GPUMatrix.cu
//------------------------------------------------------------------------

template void TensorOpN<float, 2>(float beta, array<float*, 2> pointers, float alpha, ElementWiseOperator op,
                                  const array<size_t, 2> & offsets,
                                  const SmallVector<size_t> & regularOpDims, const array<SmallVector<ptrdiff_t>, 2> & regularStrides,
                                  const SmallVector<size_t> & reducingOpDims, const array<SmallVector<ptrdiff_t>, 2> & reducingStrides);
template void TensorOpN<float, 3>(float beta, array<float*, 3> pointers, float alpha, ElementWiseOperator op,
                                  const array<size_t, 3> & offsets,
                                  const SmallVector<size_t> & regularOpDims, const array<SmallVector<ptrdiff_t>, 3> & regularStrides,
                                  const SmallVector<size_t> & reducingOpDims, const array<SmallVector<ptrdiff_t>, 3> & reducingStrides);
template void TensorOpN<float, 4>(float beta, array<float*, 4> pointers, float alpha, ElementWiseOperator op,
                                  const array<size_t, 4> & offsets,
                                  const SmallVector<size_t> & regularOpDims, const array<SmallVector<ptrdiff_t>, 4> & regularStrides,
                                  const SmallVector<size_t> & reducingOpDims, const array<SmallVector<ptrdiff_t>, 4> & reducingStrides);
template void TensorOpN<double, 2>(double beta, array<double*, 2> pointers, double alpha, ElementWiseOperator op,
                                   const array<size_t, 2> & offsets,
                                   const SmallVector<size_t> & regularOpDims, const array<SmallVector<ptrdiff_t>, 2> & regularStrides,
                                   const SmallVector<size_t> & reducingOpDims, const array<SmallVector<ptrdiff_t>, 2> & reducingStrides);
template void TensorOpN<double, 3>(double beta, array<double*, 3> pointers, double alpha, ElementWiseOperator op,
                                   const array<size_t, 3> & offsets,
                                   const SmallVector<size_t> & regularOpDims, const array<SmallVector<ptrdiff_t>, 3> & regularStrides,
                                   const SmallVector<size_t> & reducingOpDims, const array<SmallVector<ptrdiff_t>, 3> & reducingStrides);
template void TensorOpN<double, 4>(double beta, array<double*, 4> pointers, double alpha, ElementWiseOperator op,
                                   const array<size_t, 4> & offsets,
                                   const SmallVector<size_t> & regularOpDims, const array<SmallVector<ptrdiff_t>, 4> & regularStrides,
                                   const SmallVector<size_t> & reducingOpDims, const array<SmallVector<ptrdiff_t>, 4> & reducingStrides);

template void LaunchUnaryTensorOp(float beta, const float * pa, float * pb, float alpha, ElementWiseOperator op, size_t regularOpDim);
template void LaunchUnaryTensorOp(double beta, const double * pa, double * pb, double alpha, ElementWiseOperator op, size_t regularOpDim);

}}}

#endif // CPUONLY

@ -0,0 +1,30 @@
//
// <copyright file="GPUTensor.h" company="Microsoft">
//     Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//

#pragma once
#include "CommonMatrix.h"
#include "TensorShape.h" // only for SmallVector; I was hoping to keep this out
#include "GPUMatrixCUDAKernels.cuh"
#include <array>

namespace Microsoft { namespace MSR { namespace CNTK {

// GPUMatrix::TensorOp() interfaces with the actual tensor code through these two functions, which are independent of the GPUMatrix class

#define C_size_t CUDA_LONG
#define C_int CUDA_LONG
#define C_unsigned_int CUDA_LONG

template<class ElemType, C_size_t N>
void TensorOpN(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, ElementWiseOperator op,
               const array<size_t, N> & offsets,
               const SmallVector<size_t> & regularOpDims, const array<SmallVector<ptrdiff_t>, N> & regularStrides,
               const SmallVector<size_t> & reducingOpDims, const array<SmallVector<ptrdiff_t>, N> & reducingStrides);

template<class ElemType>
void LaunchUnaryTensorOp(ElemType beta, const ElemType * pa, ElemType * pb, ElemType alpha, ElementWiseOperator op, size_t regularOpDim);

}}}

@ -156,7 +156,7 @@
    </ProjectReference>
  </ItemDefinitionGroup>
  <ItemGroup>
    <ClInclude Include="..\Common\Include\DataTensor.h" />
    <ClInclude Include="..\Common\Include\TensorShape.h" />
    <ClInclude Include="..\Common\Include\File.h" />
    <ClInclude Include="..\Common\Include\fileutil.h" />
    <ClInclude Include="..\Common\Include\DebugUtil.h" />

@ -1,9 +1,7 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup>
    <ClCompile Include="dllmain.cpp" />
    <ClCompile Include="Matrix.cpp" />
    <ClCompile Include="stdafx.cpp" />
    <ClCompile Include="..\Common\File.cpp">
      <Filter>Common</Filter>
    </ClCompile>

@ -25,22 +23,31 @@
    <ClCompile Include="MatrixQuantizerCPU.cpp">
      <Filter>CPU\1bitSGD</Filter>
    </ClCompile>
    <ClCompile Include="MatrixQuantizer.cpp" />
    <ClCompile Include="QuantizedMatrix.cpp" />
    <ClCompile Include="CUDAPageLockedMemAllocator.cpp">
      <Filter>GPU\1bitSGD</Filter>
    </ClCompile>
    <ClCompile Include="ConvolutionEngine.cpp" />
    <ClCompile Include="TensorView.cpp">
      <Filter>Tensors</Filter>
    </ClCompile>
    <ClCompile Include="dllmain.cpp">
      <Filter>Misc</Filter>
    </ClCompile>
    <ClCompile Include="ConvolutionEngine.cpp">
      <Filter>Convolution</Filter>
    </ClCompile>
    <ClCompile Include="stdafx.cpp">
      <Filter>Misc</Filter>
    </ClCompile>
    <ClCompile Include="QuantizedMatrix.cpp">
      <Filter>1bitSGD</Filter>
    </ClCompile>
    <ClCompile Include="MatrixQuantizer.cpp">
      <Filter>1bitSGD</Filter>
    </ClCompile>
  </ItemGroup>
  <ItemGroup>
    <ClInclude Include="CommonMatrix.h" />
    <ClInclude Include="Helpers.h" />
    <ClInclude Include="Matrix.h" />
    <ClInclude Include="stdafx.h" />
    <ClInclude Include="targetver.h" />
    <ClInclude Include="..\Common\Include\File.h">
      <Filter>Common\Include</Filter>
    </ClInclude>

@ -59,23 +66,40 @@
    <ClInclude Include="MatrixQuantizerCPU.h">
      <Filter>CPU\1bitSGD</Filter>
    </ClInclude>
    <ClInclude Include="MatrixQuantizer.h" />
    <ClInclude Include="QuantizedMatrix.h" />
    <ClInclude Include="MemAllocator.h" />
    <ClInclude Include="CUDAPageLockedMemAllocator.h">
      <Filter>GPU\1bitSGD</Filter>
    </ClInclude>
    <ClInclude Include="..\Common\Include\DebugUtil.h" />
    <ClInclude Include="ConvolutionEngine.h" />
    <ClInclude Include="TensorView.h">
      <Filter>Tensors</Filter>
    </ClInclude>
    <ClInclude Include="TensorOps.h">
      <Filter>Tensors</Filter>
    </ClInclude>
    <ClInclude Include="..\Common\Include\DataTensor.h">
    <ClInclude Include="..\Common\Include\TensorShape.h">
      <Filter>Common\Include</Filter>
    </ClInclude>
    <ClInclude Include="Helpers.h">
      <Filter>Misc</Filter>
    </ClInclude>
    <ClInclude Include="..\Common\Include\DebugUtil.h">
      <Filter>Common\Include</Filter>
    </ClInclude>
    <ClInclude Include="ConvolutionEngine.h">
      <Filter>Convolution</Filter>
    </ClInclude>
    <ClInclude Include="stdafx.h">
      <Filter>Misc</Filter>
    </ClInclude>
    <ClInclude Include="targetver.h">
      <Filter>Misc</Filter>
    </ClInclude>
    <ClInclude Include="QuantizedMatrix.h">
      <Filter>1bitSGD</Filter>
    </ClInclude>
    <ClInclude Include="MatrixQuantizer.h">
      <Filter>1bitSGD</Filter>
    </ClInclude>
  </ItemGroup>
  <ItemGroup>
    <None Include="GPUMatrix.h">

@ -113,5 +137,14 @@
    <Filter Include="Tensors">
      <UniqueIdentifier>{70fb07cf-603e-4444-bc10-f0add4920fd2}</UniqueIdentifier>
    </Filter>
    <Filter Include="Misc">
      <UniqueIdentifier>{62b92193-92d0-4e5b-8c3e-67ffd01a98c0}</UniqueIdentifier>
    </Filter>
    <Filter Include="Convolution">
      <UniqueIdentifier>{3a49e94d-14ee-4ca1-a56e-a1472206a076}</UniqueIdentifier>
    </Filter>
    <Filter Include="1bitSGD">
      <UniqueIdentifier>{546cacbd-253e-485b-8c8c-8b9ee0e2f631}</UniqueIdentifier>
    </Filter>
  </ItemGroup>
</Project>

@ -157,7 +157,9 @@ if exist "$(CuDnnDll)" (xcopy /Y "$(CuDnnDll)" $(OutputPath))
    <ClInclude Include="cudalatticeops.h" />
    <ClInclude Include="cudalib.h" />
    <ClInclude Include="CuDnnConvolutionEngine.h" />
    <ClInclude Include="GPUTensor.h" />
    <ClInclude Include="latticefunctionskernels.h" />
    <ClInclude Include="TensorOps.h" />
    <ClInclude Include="ValueQuantizer.h" />
    <None Include="GPUWatcher.h">
      <FileType>CppHeader</FileType>

@ -171,6 +173,10 @@ if exist "$(CuDnnDll)" (xcopy /Y "$(CuDnnDll)" $(OutputPath))
    <ClInclude Include="targetver.h" />
  </ItemGroup>
  <ItemGroup>
    <CudaCompile Include="GPUTensor.cu">
      <InterleaveSourceInPTX Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</InterleaveSourceInPTX>
      <Keep Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</Keep>
    </CudaCompile>
    <CudaCompile Include="cudalatticeops.cu">
      <FileType>CppCode</FileType>
    </CudaCompile>

@ -202,7 +208,7 @@ if exist "$(CuDnnDll)" (xcopy /Y "$(CuDnnDll)" $(OutputPath))
    <CudaCompile Include="GPUMatrix.cu">
      <FileType>CppCode</FileType>
      <Keep Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</Keep>
      <InterleaveSourceInPTX Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</InterleaveSourceInPTX>
      <InterleaveSourceInPTX Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</InterleaveSourceInPTX>
    </CudaCompile>
    <CudaCompile Include="GPUMatrixCUDAKernels.cuh">
      <ExcludedFromBuild>true</ExcludedFromBuild>

@ -22,25 +22,28 @@
    <CudaCompile Include="GPUMatrixCUDAKernels.cuh">
      <Filter>GPU</Filter>
    </CudaCompile>
    <CudaCompile Include="GPUTensor.cu">
      <Filter>GPU\Tensors</Filter>
    </CudaCompile>
  </ItemGroup>
  <ItemGroup>
    <ClCompile Include="stdafx.cpp" />
    <ClCompile Include="cudalattice.cpp">
      <Filter>GPU\SequenceTraining</Filter>
    </ClCompile>
    <ClCompile Include="cudalib.cpp">
      <Filter>GPU\SequenceTraining</Filter>
    </ClCompile>
    <ClCompile Include="..\Common\DebugUtil.cpp" />
    <ClCompile Include="..\Common\DebugUtil.cpp">
      <Filter>Misc</Filter>
    </ClCompile>
    <ClCompile Include="stdafx.cpp">
      <Filter>Misc</Filter>
    </ClCompile>
    <ClCompile Include="CuDnnConvolutionEngine.cpp">
      <Filter>GPU</Filter>
      <Filter>GPU\Convolution</Filter>
    </ClCompile>
  </ItemGroup>
  <ItemGroup>
    <ClInclude Include="CommonMatrix.h" />
    <ClInclude Include="Helpers.h" />
    <ClInclude Include="stdafx.h" />
    <ClInclude Include="targetver.h" />
    <ClInclude Include="..\Common\Include\File.h">
      <Filter>Common\Include</Filter>
    </ClInclude>

@ -80,8 +83,26 @@
    <ClInclude Include="latticefunctionskernels.h">
      <Filter>GPU\SequenceTraining</Filter>
    </ClInclude>
    <ClInclude Include="GPUTensor.h">
      <Filter>GPU\Tensors</Filter>
    </ClInclude>
    <ClInclude Include="Helpers.h">
      <Filter>Misc</Filter>
    </ClInclude>
    <ClInclude Include="stdafx.h">
      <Filter>Misc</Filter>
    </ClInclude>
    <ClInclude Include="targetver.h">
      <Filter>Misc</Filter>
    </ClInclude>
    <ClInclude Include="CommonMatrix.h">
      <Filter>from Math</Filter>
    </ClInclude>
    <ClInclude Include="CuDnnConvolutionEngine.h">
      <Filter>GPU</Filter>
      <Filter>GPU\Convolution</Filter>
    </ClInclude>
    <ClInclude Include="TensorOps.h">
      <Filter>from Math</Filter>
    </ClInclude>
  </ItemGroup>
  <ItemGroup>

@ -105,14 +126,23 @@
    <Filter Include="GPU">
      <UniqueIdentifier>{cc9a219d-d8ab-484a-b253-fd2a29ad7c7c}</UniqueIdentifier>
    </Filter>
    <Filter Include="Include">
      <UniqueIdentifier>{3c982109-64b1-469a-8d85-2abdf12d636a}</UniqueIdentifier>
    </Filter>
    <Filter Include="GPU\1bitSGD">
      <UniqueIdentifier>{3415233d-9ef7-41c6-abbb-cec1b4f8d14c}</UniqueIdentifier>
    </Filter>
    <Filter Include="GPU\SequenceTraining">
      <UniqueIdentifier>{6a3569b1-6c9e-47b3-870f-bb581349e75e}</UniqueIdentifier>
    </Filter>
    <Filter Include="Misc">
      <UniqueIdentifier>{3c982109-64b1-469a-8d85-2abdf12d636a}</UniqueIdentifier>
    </Filter>
    <Filter Include="GPU\Tensors">
      <UniqueIdentifier>{16214e65-2d24-4e4c-a0dd-c37e505bda32}</UniqueIdentifier>
    </Filter>
    <Filter Include="from Math">
      <UniqueIdentifier>{b1b59e2e-5c54-4e40-ad0a-1523ddeb63ba}</UniqueIdentifier>
    </Filter>
    <Filter Include="GPU\Convolution">
      <UniqueIdentifier>{3155488f-128f-494e-858d-459b4cc9fab7}</UniqueIdentifier>
    </Filter>
  </ItemGroup>
</Project>

@ -3152,6 +3152,21 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            );
    }

    template<class ElemType>
    bool Matrix<ElemType>::IsValid() const
    {
        if (m_currentDataLocation == CurrentDataLocation::GPU && GetMatrixType() == MatrixType::SPARSE)
        {
            return this->m_GPUSparseMatrix->IsValid();
        }
        else
        {
            NOT_IMPLEMENTED;
        }

        return false;
    }

    template<class ElemType>
    bool Matrix<ElemType>::IsEqualTo(const Matrix<ElemType>& a, const ElemType threshold /*= 1e-8*/) const
    {

@ -4321,7 +4336,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    /// <param name="c">Resulting matrix, user is responsible for allocating this</param>
    template<class ElemType>
    void Matrix<ElemType>::ConvolveAndWeightedAdd(ElemType alpha, const Matrix<ElemType>& a, const bool transposeA, const Matrix<ElemType>& b, const bool transposeB,
        ElemType beta, Matrix<ElemType>& c, int numChannels, size_t horizontalSubsample, bool padding, bool channelwise)
        ElemType beta, Matrix<ElemType>& c, size_t numChannels, size_t horizontalSubsample, bool padding, bool channelwise)
    {
        DecideAndMoveToRightDevice(a, b, c);

@ -13,7 +13,7 @@
#include "Basics.h"
#include "File.h"
#include "CommonMatrix.h"
#include "DataTensor.h" // only for SmallVector; I was hoping to keep this out
#include "TensorShape.h" // only for SmallVector; I was hoping to keep this out
#include <limits.h>
#include <memory> // for shared_ptr
#include <array>

@ -348,7 +348,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {

        Matrix<ElemType>& AssignPositiveAndShiftedNegSample(const Matrix<ElemType>& a, const size_t posNumber, const size_t negNumber, const size_t shiftNumber);
        Matrix<ElemType>& AddFoldedPositiveAndShiftedNegSample(const Matrix<ElemType>& a, const size_t posNumber, const size_t negNumber, const size_t shiftNumber);

        bool IsValid() const;
        bool IsEqualTo(const Matrix<ElemType>& a, const ElemType threshold = 1e-8) const;

        static void VectorSum(const Matrix<ElemType>& a, Matrix<ElemType>& c, const bool isColWise);

@ -437,7 +438,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        static void Multiply(const Matrix<ElemType>& a, const bool transposeA, const Matrix<ElemType>& b, const bool transposeB, Matrix<ElemType>& c);
        static void Multiply(const Matrix<ElemType>& a, const Matrix<ElemType>& b, Matrix<ElemType>& c);
        static void Multiply1x1AndWeightedAdd(ElemType alpha, const Matrix<ElemType>& a, const Matrix<ElemType>& b, ElemType beta, Matrix<ElemType>& c);
        static void ConvolveAndWeightedAdd(ElemType alpha, const Matrix<ElemType>& a, const bool transposeA, const Matrix<ElemType>& b, const bool transposeB, ElemType beta, Matrix<ElemType>& c, int numChannels, size_t horizontalSubsample, bool padding, bool channelwise);
        static void ConvolveAndWeightedAdd(ElemType alpha, const Matrix<ElemType>& a, const bool transposeA, const Matrix<ElemType>& b, const bool transposeB, ElemType beta, Matrix<ElemType>& c, size_t numChannels, size_t horizontalSubsample, bool padding, bool channelwise);

        static void ScaleAndAdd(ElemType alpha, const Matrix<ElemType>& a, Matrix<ElemType>& c);
        static void ScaleAndAdd(ElemType alpha, const Matrix<ElemType>& a, ElemType beta, Matrix<ElemType>& c);

@ -13,7 +13,7 @@
#include "GPUSparseMatrix.h"
#include "MatrixQuantizerGPU.h"
#include "CuDnnConvolutionEngine.h"
#include "DataTensor.h"
#include "TensorShape.h"

#pragma warning (disable: 4100) // unreferenced formal parameter, which is OK since all functions in here are dummies; disabling this allows copy-pasting prototypes here when we add new functions
#pragma warning (disable: 4702) // unreachable code, which we get from the NOT_IMPLEMENTED macro, which is OK

@ -368,10 +368,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    template<class ElemType> void GPUSparseMatrix<ElemType>::ConvertToSparseFormat(MatrixFormat newFormat) {}
    template<class ElemType> void GPUSparseMatrix<ElemType>::ConvertToSparseFormat(MatrixFormat newFormat, GPUSparseMatrix<ElemType>& outMatrix) const {}

    template<class ElemType> void GPUSparseMatrix<ElemType>::ConvolveAndWeightedAdd(ElemType alpha, const GPUMatrix<ElemType>& lhs, const bool transposeA, const GPUSparseMatrix<ElemType>& rhs, const bool transposeB, ElemType beta, GPUMatrix<ElemType>& c, int numChannels, size_t horizontalSubsample, bool padding, bool channelwise) { };
    template<class ElemType> void GPUSparseMatrix<ElemType>::ConvolveAndWeightedAdd(ElemType alpha, const GPUMatrix<ElemType>& lhs, const bool transposeA, const GPUSparseMatrix<ElemType>& rhs, const bool transposeB, ElemType beta, GPUMatrix<ElemType>& c, size_t numChannels, size_t horizontalSubsample, bool padding, bool channelwise) { };
    template<class ElemType> void GPUSparseMatrix<ElemType>::TensorShuffleScaleAndAdd(ElemType keepWeight, const GPUSparseMatrix<ElemType>& a, size_t D, size_t S, size_t M, size_t K, size_t T, ElemType scaleFactor, const GPUSparseMatrix<ElemType>& b, GPUSparseMatrix<ElemType>& c) { }
    template<class ElemType> void GPUSparseMatrix<ElemType>::Reshape(const size_t numRows, const size_t numCols) { }

    template<class ElemType> bool GPUSparseMatrix<ElemType>::IsValid() const { return true; }

    template<class ElemType> template <class OutType, class InType>
    void GPUSparseMatrix<ElemType>::CopyBuffer(OutType * outBuffer, const InType * inBuffer, const size_t size) {}

@ -25,18 +25,22 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    // -----------------------------------------------------------------------
    // unified overloads for float/double math functions
    //
    // Declare float and double versions of the functions f we need as f_(),
    // e.g. exp_ -> exp(double), expf(float).
    // Declare float and double versions of the functions x we need as x_().
    // This macro overloads x_() with float and double arguments and inlines the correct library function,
    // e.g. exp_ -> exp(double), expf(float). This simplifies templated kernel code.
    // -----------------------------------------------------------------------

#pragma push_macro("OverloadUnaryMathFns")
#define OverloadUnaryMathFns(func) \
    DECL float func ## _(float arg) { return func ## f(arg); } \
    DECL double func ## _(double arg) { return func(arg); }
#define OverloadUnaryMathFns(x) DECL float x ## _(float f) { return x ## f(f); } DECL double x ## _(double f) { return x(f); }

    OverloadUnaryMathFns(exp);
    OverloadUnaryMathFns(log);
    OverloadUnaryMathFns(tanh);
    OverloadUnaryMathFns(sqrt);
    OverloadUnaryMathFns(fabs);
    OverloadUnaryMathFns(cos);
    OverloadUnaryMathFns(sin);

    OverloadUnaryMathFns(fabs); OverloadUnaryMathFns(sqrt);
    OverloadUnaryMathFns(exp); OverloadUnaryMathFns(log);
    OverloadUnaryMathFns(tanh); OverloadUnaryMathFns(cos); OverloadUnaryMathFns(sin);
#pragma pop_macro("OverloadUnaryMathFns")

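    // Illustration (not part of the source): OverloadUnaryMathFns(exp) expands to
    //     DECL float exp_(float f) { return expf(f); }
    //     DECL double exp_(double f) { return exp(f); }
    // so templated kernel code can simply call exp_(x) for either element type.
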
    // -----------------------------------------------------------------------

@ -46,6 +50,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    template<class ElemType>
    DECL ElemType Sigmoid(ElemType z)
    {
#if 1   // BUGBUG: Numerically bad. But if I don't use this, results change.
        ElemType negElem = -z;
        ElemType e = exp_(negElem);

        return 1 / (e + 1);
#else
#if 1   // Efficient implementation that avoids two divergent CUDA code paths that both compute exp() [jdroppo]. This version compiles to PTX without branches.
        ElemType q = exp_(-fabs_(z));
        ElemType numer;

@ -62,6 +72,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        ElemType v = exp_(z);
        return v / (1 + v);
    }
#endif
#endif
    }

@ -85,7 +96,25 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        return sqrt_(z > 0 ? z : 0);
    }

    // TODO: call this LogAdd() for consistency
    template<class ElemType>
    DECL ElemType ClippedLog(ElemType z)
    {
        return z < EPS_IN_LOG ? LOG_OF_EPS_IN_LOG : log_(z);
    }

    template<class ElemType>
    DECL ElemType ClippedQuotient(ElemType a, ElemType b)
    {
        if (fabs(b) < EPS_IN_INVERSE)   // clip the denominator
        {
            if (b > 0)
                b = EPS_IN_INVERSE;
            else
                b = -EPS_IN_INVERSE;
        }
        return a / b;
    }

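    // Illustration (not part of the source): ClippedLog and ClippedQuotient trade a
    // little accuracy near the singularity for kernels that never produce inf/NaN:
    //     ClippedLog(0)         == LOG_OF_EPS_IN_LOG   (instead of -inf)
    //     ClippedQuotient(1, 0) == 1 / EPS_IN_INVERSE  (instead of inf)
    //     ClippedQuotient(1, -0.5f * EPS_IN_INVERSE) == 1 / -EPS_IN_INVERSE
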
    template<typename ElemType>
    DECL ElemType LogAdd(ElemType x, ElemType y)
    {

@ -105,37 +134,59 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        }
    }

    template<class ElemType> DECL ElemType Sqr(ElemType z) { return z * z; }

    // IndexElement reindexes a tensor along one dimension.
    // For the indexed dimension, the tensor op is prepared by setting 'a' to be broadcasting along the indexed dimension.
    // I.e. pa = &a points to the first element (as if index == 0).
    // This function then must adjust the address:
    //     pa <- pa + stride * index
    // The stride is passed in as the third parameter.
    //template<class ElemType> DECL ElemType IndexElement(const ElemType & a, ElemType b, int stride) { const ElemType * pa = &a; return pa[stride * (ptrdiff_t)b]; }

    // -----------------------------------------------------------------------
    // ElementWiseOperator implementations
    //
    // Define a static function for every ElementWiseOperator (CommonMatrix.h).
    // -----------------------------------------------------------------------

#pragma push_macro("DefNullaryOp")
#define DefNullaryOp(op, expr) template<class ElemType> DECL ElemType Op ## op() { return expr; }

    DefNullaryOp(ConstOne, 1);
#pragma pop_macro("DefNullaryOp")

#pragma push_macro("DefUnaryOp")
#define DefUnaryOp(op, expr) template<class ElemType> DECL ElemType Op ## op(ElemType a) { return expr; }

    DefUnaryOp(Copy, a);
    DefUnaryOp(Negate, -a); DefUnaryOp(Not, !a);
    DefUnaryOp(Abs, fabs_(a));
    DefUnaryOp(Sigmoid, Sigmoid(a)); DefUnaryOp(SigmoidDerivative, SigmoidDerivative(a)); DefUnaryOp(Tanh, tanh_(a)); DefUnaryOp(Sqrt, Sqrt(a)); DefUnaryOp(Exp, exp_(a)); DefUnaryOp(Log, log_(a)); DefUnaryOp(LinearRectifierDerivative, LinearRectifierDerivative(a)); DefUnaryOp(Cosine, cos_(a)); DefUnaryOp(NegativeSine, -sin_(a));
    DefUnaryOp(Sigmoid, Sigmoid(a)); DefUnaryOp(Tanh, tanh_(a)); DefUnaryOp(Sqrt, Sqrt(a)); DefUnaryOp(Exp, exp_(a)); DefUnaryOp(Log, ClippedLog(a)); DefUnaryOp(LinearRectifier, a > 0 ? a : 0); DefUnaryOp(Cosine, cos_(a));
#pragma pop_macro("DefUnaryOp")

    // parameterized unary ops
    //DefUnaryOp(SaturateBetaAlpha); DefUnaryOp(SumAlpha); DefUnaryOp(SubDifferenceToAlpha); DefUnaryOp(SubDifferenceFromAlpha);

#pragma push_macro("DefBinaryOp")
#define DefBinaryOp(op, expr) template<class ElemType> DECL ElemType Op ## op(ElemType a, ElemType b) { return expr; }
//#define DefBinaryOp(op, expr) template<class ElemType> DECL ElemType Op ## op(const ElemType & a, ElemType b, int i = 0) { UNUSED(i); return expr; }

    DefBinaryOp(Sum, a + b); DefBinaryOp(Difference, a - b); DefBinaryOp(ElementwiseProduct, a * b); DefBinaryOp(ElementwiseQuotient, a / b);
    DefBinaryOp(Sum, a + b); DefBinaryOp(Difference, a - b); DefBinaryOp(ElementwiseProduct, a * b); DefBinaryOp(ElementwiseQuotient, ClippedQuotient(a, b));
    DefBinaryOp(LogSum, LogAdd(a, b)); DefBinaryOp(Max, a > b ? a : b); DefBinaryOp(Min, a < b ? a : b);
    DefBinaryOp(EQ, a == b); DefBinaryOp(NE, a != b); DefBinaryOp(GT, a > b); DefBinaryOp(LT, a < b); DefBinaryOp(GE, a >= b); DefBinaryOp(LE, a <= b);
    DefBinaryOp(And, (float)((!!a) && (!!b))); DefBinaryOp(Or, (float)((!!a) || (!!b))); DefBinaryOp(Xor, (float)((!!a) ^ (!!b)));
    DefBinaryOp(MaskNegative, b >= 0 ? a : 0);
    DefBinaryOp(ElementwiseProductWithSigmoidDerivativeFromOutput, a * (b * (1 - b)));  // b = output
    DefBinaryOp(ElementwiseProductWithTanhDerivativeFromOutput, a * (1 - b * b));
    DefBinaryOp(ElementwiseProductWithLinearRectifierDerivativeFromOutput, b > 0 ? a : 0);
    DefBinaryOp(ElementwiseProductWithLogDerivativeFromOutput, a * exp_(-b));
    DefBinaryOp(ElementwiseProductWithCosDerivative, a * -sin_(b));                     // note: b = the input of cos()
    //DefBinaryOp(Index, IndexElement(a, b, i)); // note: this one uses the third argument

#pragma pop_macro("DefBinaryOp")

#pragma push_macro("DefTernaryOp")
#define DefTernaryOp(op, expr) template<class ElemType> DECL ElemType Op ## op(ElemType a, ElemType b, ElemType c) { return expr; }

    DefTernaryOp(Cond, a ? b : c);
    DefTernaryOp(Cond, a ? b : c); DefTernaryOp(Clip, a < b ? b : (a > c ? c : a));
#pragma pop_macro("DefTernaryOp")
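
    // Illustration (not part of the source): OpClip(a, b, c) above clamps a into the
    // interval [b, c], e.g. OpClip(5, 0, 1) == 1, OpClip(-2, 0, 1) == 0, and
    // OpClip(0.5, 0, 1) == 0.5.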

}}}

@ -223,6 +223,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            offsets[i] = shapes[i].GetOffset();
    }

    // enforce that in case of inverse broadcasting (reduction), the output must not also be an input
    template<class ElemType>
    static bool CheckDifferentObject(const TensorView<ElemType> & a, const TensorView<ElemType> & b)
    {
        if (&a == &b)
            LogicError("Do{U,Bi,Ter}naryOpOf: When inverse broadcasting, the output must not be an input.");
        return true;
    }

    template<class ElemType>
    void TensorView<ElemType>::DoUnaryOpOf(ElemType beta, const TensorView & a, ElemType alpha, ElementWiseOperator op)
    {

@ -235,6 +244,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        SmallVector<size_t> regularOpDims, reducingOpDims;
        PrepareTensorOperands<ElemType, 2>(array<TensorShape, 2> { a.GetShape(), GetShape() }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);

        // the output cannot be an input when reducing
        if (reducingOpDims.size() > 0)
            CheckDifferentObject(a, *this);

        // now perform the operation
        GetSOB().TensorOp(beta, a.GetSOB(), alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
    }

@ -250,6 +263,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        SmallVector<size_t> regularOpDims, reducingOpDims;
        PrepareTensorOperands<ElemType, 3>(array<TensorShape, 3> { a.GetShape(), b.GetShape(), GetShape() }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);

        // the output cannot be an input when reducing
        if (reducingOpDims.size() > 0)
            CheckDifferentObject(a, *this) && CheckDifferentObject(b, *this);

        GetSOB().TensorOp(beta, a.GetSOB(), b.GetSOB(), alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
    }

@ -264,6 +281,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        SmallVector<size_t> regularOpDims, reducingOpDims;
        PrepareTensorOperands<ElemType, 4>(array<TensorShape, 4> { a.GetShape(), b.GetShape(), c.GetShape(), GetShape() }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);

        // the output cannot be an input when reducing
        if (reducingOpDims.size() > 0)
            CheckDifferentObject(a, *this) && CheckDifferentObject(b, *this) && CheckDifferentObject(c, *this);

        GetSOB().TensorOp(beta, a.GetSOB(), b.GetSOB(), c.GetSOB(), alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
    }

@ -10,7 +10,7 @@

#include "Basics.h"
#include "Matrix.h"
#include "DataTensor.h"
#include "TensorShape.h"

#pragma warning (push)
#pragma warning (disable: 4251) // needs to have dll-interface to be used by clients of... caused by TensorView::m_shape, which is private. We use the same compiler everywhere.

@ -48,7 +48,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    // c.AssignDiffOf(c,a) means c -= a,
    // and c.AddElementwiseProductOf(a, b, 1) means c += a .* b.
    // All operators support elementwise in-place operations, i.e. a, b, and c
    // may all reference the same underlying SOB.
    // may all reference the same underlying SOB, with one exception:
    // The output cannot be in-place and inverse-broadcasting (reducing) at the same time.
    // E.g. with c=[10] and a=[10 x 20], c.AssignDiffOf(c,a) will fail.
    // In that case, you can use c.AddCopyOf(a,-1).
    // Aliasing is not detected, so don't pass distinct TensorView objects that
    // reference overlapping but not identical slices.
    // If beta == 0, c is not read out, i.e. it can be uninitialized or contain NaNs.
    // -------------------------------------------------------------------

@ -59,7 +64,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    void Add ## oper ## Of(const TensorView & a, ElemType alpha = 1.0f) { DoUnaryOpOf(1.0f, a, alpha, ElementWiseOperator::op ## oper); }

    ForAllUnaryOps(DeclareUnaryTensorOp);
    ForAllParameterizedUnaryOps(DeclareUnaryTensorOp);
#pragma pop_macro("DeclareUnaryTensorOp")

#pragma push_macro("DeclareBinaryTensorOp")

@ -82,12 +86,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {

    static void Test();

private:

    void DoUnaryOpOf(ElemType beta, const TensorView & a, ElemType alpha, ElementWiseOperator op);
    void DoBinaryOpOf(ElemType beta, const TensorView & a, const TensorView & b, ElemType alpha, ElementWiseOperator op);
    void DoTernaryOpOf(ElemType beta, const TensorView & a, const TensorView & b, const TensorView & c, ElemType alpha, ElementWiseOperator op);

private:

    // -------------------------------------------------------------------
    // accessors
    // -------------------------------------------------------------------

@ -2593,6 +2593,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        // the total number of epochs to run.
        m_maxEpochs = configSGD(L"maxEpochs");

        // Note: Momentum is best specified in an MB-size-agnostic fashion.
        // Because momentum per sample is a number very close to 1, it is handier to use a logarithmic specification.
        // We use 'momentumAsTimeConstant' to specify the time constant of the low-pass filter that momentum really is.
        // To convert a typical per-MB momentum value 'm' used with an MB size of 'N', use momentumAsTimeConstant = -N/ln(m).
        // For the common configuration of momentum 0.9 at an MB size of 256, that is momentumAsTimeConstant = 2429.8.
        floatargvector momentumPerMB = configSGD(L"momentumPerMB", ConfigRecordType::Array(floatargvector()));
        floatargvector momentumPerSample = configSGD(L"momentumPerSample", ConfigRecordType::Array(floatargvector()));
        floatargvector momentumAsTimeConstant = configSGD(L"momentumAsTimeConstant", ConfigRecordType::Array(floatargvector()));

@ -156,7 +156,7 @@
    <ClInclude Include="..\Common\Include\BestGpu.h" />
    <ClInclude Include="..\Common\Include\Config.h" />
    <ClInclude Include="..\Common\Include\DataReader.h" />
    <ClInclude Include="..\Common\Include\DataTensor.h" />
    <ClInclude Include="..\Common\Include\TensorShape.h" />
    <ClInclude Include="..\Common\Include\DataWriter.h" />
    <ClInclude Include="..\Common\Include\File.h" />
    <ClInclude Include="..\Common\Include\fileutil.h" />

@ -141,7 +141,7 @@
    <ClInclude Include="..\Common\Include\Sequences.h">
      <Filter>Common\Include</Filter>
    </ClInclude>
    <ClInclude Include="..\Common\Include\DataTensor.h">
    <ClInclude Include="..\Common\Include\TensorShape.h">
      <Filter>Common\Include</Filter>
    </ClInclude>
    <ClInclude Include="..\Common\Include\Config.h">

@ -195,4 +195,4 @@
      <UniqueIdentifier>{ae1eea3c-d77f-46ec-bf4f-1cd093a295e8}</UniqueIdentifier>
    </Filter>
  </ItemGroup>
</Project>
</Project>

@ -6,7 +6,7 @@ ndlMnistMacros = [
    ImageH = 28
    LabelDim = 10

    features = ImageInput(ImageW, ImageH, 1, tag="feature")
    features = ImageInput(ImageW, ImageH, 1, imageLayout="legacy", tag="feature")
    featScale = Const(0.00390625)
    featScaled = Scale(featScale, features)
    labels = Input(LabelDim, tag="label")

@ -28,7 +28,7 @@ DNN=[
    pool1H = 2
    pool1hStride = 2
    pool1vStride = 2
    pool1 = MaxPooling(conv1_act, pool1W, pool1H, pool1hStride, pool1vStride)
    pool1 = MaxPooling(conv1_act, pool1W, pool1H, pool1hStride, pool1vStride, imageLayout="legacy")

    # conv2
    kW2 = 5

@ -45,7 +45,7 @@ DNN=[
    pool2H = 2
    pool2hStride = 2
    pool2vStride = 2
    pool2 = AveragePooling(conv2_act, pool2W, pool2H, pool2hStride, pool2vStride)
    pool2 = AveragePooling(conv2_act, pool2W, pool2H, pool2hStride, pool2vStride, imageLayout="legacy")

    h1Dim = 128
    # DNNSigmoidLayer and DNNLayer are defined in Macros.ndl

@ -1,3 +1,4 @@
|
|||
# Sigmoid non-linearity
|
||||
DNNSigmoidLayer(inDim, outDim, x, parmScale) = [
|
||||
W = Parameter(outDim, inDim, init="uniform", initValueScale=parmScale)
|
||||
b = Parameter(outDim, 1, init="uniform", initValueScale=parmScale)
|
||||
|

@@ -6,6 +7,7 @@ DNNSigmoidLayer(inDim, outDim, x, parmScale) = [
y = Sigmoid(z)
]

# no non-linearity, as input for SoftMax
DNNLayer(inDim, outDim, x, parmScale) = [
W = Parameter(outDim, inDim, init="uniform", initValueScale=parmScale)
b = Parameter(outDim, 1, init="uniform", initValueScale=parmScale)

@@ -13,10 +15,11 @@ DNNLayer(inDim, outDim, x, parmScale) = [
z = Plus(t, b)
]

# ReLU non-linearity
ConvReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue) = [
convW = Parameter(outMap, inWCount, init="uniform", initValueScale=wScale)
conv = Convolution(convW, inp, kW, kH, outMap, hStride, vStride, zeroPadding=false)
convB = Parameter(outMap, 1, init="fixedValue", value=bValue)
convB = ImageParameter(1, 1, outMap, imageLayout="legacy", init="fixedValue", value=bValue)
convPlusB = Plus(conv, convB);
act = RectifiedLinear(convPlusB);
]

@@ -1,7 +1,10 @@
#precision = "double"
precision = "float"
command = train:test
deviceId = $DeviceId$

useCuDnn = true # can be overridden by the command line

ndlMacros = "$ConfigDir$/Macros.ndl"

parallelTrain = false

@@ -13,8 +16,94 @@ train = [
#deviceId = $DeviceId$
traceLevel = 1

NDLNetworkBuilder = [
networkDescription = "$ConfigDir$/Convolution.ndl"
#NDLNetworkBuilder = [
# networkDescription = "$ConfigDir$/Convolution.ndl"
#]

BrainScriptNetworkBuilder = [

useCuDnn = $useCuDnn$

// HACK to enforce the same evaluation order of LearnableParameters as for NDL, so as to get the same randomization.
// Nodes are evaluated in sorting order.
A1 = conv1_act; A2 = conv2_act; A3 = h1; A5 = ol

// macros
ConvReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue) = [ // ReLU non-linearity
convW = Parameter(outMap, inWCount, init="uniform", initValueScale=wScale, initOnCPUOnly=false)
conv = Convolution(convW, inp, kW, kH, outMap, hStride, vStride, zeroPadding=false, imageLayout=if useCuDnn then "cudnn" else "legacy")
convB = if useCuDnn
then ParameterTensor((1 : 1 : outMap : 1/*col dim*/), init="fixedValue", value=bValue)
else Parameter(outMap, 1, init="fixedValue", value=bValue)
convPlusB = Plus(conv, convB);
out = RectifiedLinear(convPlusB);
]

DNNSigmoidLayer(inDim, outDim, x, parmScale) = [ // Sigmoid non-linearity
W = Parameter(outDim, inDim, init="uniform", initValueScale=parmScale, initOnCPUOnly=false)
b = Parameter(outDim, 1, init="uniform", initValueScale=parmScale, initOnCPUOnly=false)
t = Times(W, x)
z = Plus(t, b)
out = Sigmoid(z)
]

DNNLayer(inDim, outDim, x, parmScale) = [ // no non-linearity, as input for SoftMax
W = Parameter(outDim, inDim, init="uniform", initValueScale=parmScale, initOnCPUOnly=false)
b = Parameter(outDim, 1, init="uniform", initValueScale=parmScale, initOnCPUOnly=false)
t = Times(W, x)
out = Plus(t, b)
]

imageW = 28
imageH = 28
labelDim = 10

features = ImageInput(imageW, imageH, 1, imageLayout=if useCuDnn then "cudnn" else "legacy", tag="feature")
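# featScale = 1/256 (0.00390625); presumably rescales 8-bit pixel values into [0, 1)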
featScale = Constant(0.00390625)
featScaled = Scale(featScale, features)
labels = Input(labelDim, tag="label")

# conv1
kW1 = 5
kH1 = 5
cMap1 = 16
hStride1 = 1
vStride1 = 1
# weight[cMap1, kW1 * kH1 * inputChannels]
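# here inWCount = kW1 * kH1 * 1 input channel = 25, the third argument below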
conv1_act = ConvReLULayer(featScaled, cMap1, 25, kW1, kH1, hStride1, vStride1, 10, 1).out

# pool1
pool1W = 2
pool1H = 2
pool1hStride = 2
pool1vStride = 2
pool1 = MaxPooling(conv1_act, pool1W, pool1H, pool1hStride, pool1vStride, imageLayout=if useCuDnn then "cudnn" else "legacy")

# conv2
kW2 = 5
kH2 = 5
cMap2 = 32
hStride2 = 1
vStride2 = 1
# weight[cMap2, kW2 * kH2 * cMap1]
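# here inWCount = kW2 * kH2 * cMap1 = 5 * 5 * 16 = 400, the third argument below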
# ConvReLULayer is defined in Macros.ndl
conv2_act = ConvReLULayer(pool1, cMap2, 400, kW2, kH2, hStride2, vStride2, 10, 1).out

# pool2
pool2W = 2
pool2H = 2
pool2hStride = 2
pool2vStride = 2
pool2 = AveragePooling(conv2_act, pool2W, pool2H, pool2hStride, pool2vStride, imageLayout=if useCuDnn then "cudnn" else "legacy")

h1Dim = 128
# DNNSigmoidLayer and DNNLayer are defined in Macros.ndl
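# pool2 output: 28 -> 24 -> 12 -> 8 -> 4 spatially (no padding), so 4 * 4 * cMap2 = 512 inputs to h1 below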
h1 = DNNSigmoidLayer(512, h1Dim, pool2, 1).out
ol = DNNLayer(h1Dim, labelDim, h1, 1).out

ce = CrossEntropyWithSoftmax(labels, ol, tag="criterion")
err = ErrorPrediction(labels, ol, tag="eval")
outputNodes = ol
]

SGD = [

@@ -66,8 +66,8 @@ speechTrain = [
C(c) = DiagTimes(WeightParam(cellDim, 1), Stabilize(c)) // cell-to-hidden

// LSTM cell
dh = PastValue(outputDim, 1, output); // hidden state(t-1)
dc = PastValue(cellDim, 1, ct); // cell(t-1)
dh = PastValue(outputDim, output); // hidden state(t-1)
dc = PastValue(cellDim, ct); // cell(t-1)

// note: the W(inputx) here are all different; they each come with their own set of weights. Same for H(dh), C(dc), and B().
it = Sigmoid(W(inputx) + B() + H(dh) + C(dc)) // input gate(t)

@@ -95,8 +95,8 @@ speechTrain = [
numLSTMs = 3 // number of hidden LSTM model layers

// features
features = Input(featDim, 1, tag='feature')
labels = Input(labelDim, 1, tag='label')
features = Input(featDim, tag='feature')
labels = Input(labelDim, tag='label')
feashift = RowSlice(featDim - baseFeatDim, baseFeatDim, features); # shift 5 frames right (x_{t+5} -> x_{t}) // TODO why 5? Where do I see this?

featNorm = MeanVarNorm(feashift)

@@ -74,8 +74,8 @@ speechTrain = new TrainAction [
C(c) = DiagTimes(WeightParam(cellDim, 1), Stabilize(c)) // cell-to-hidden

// LSTM cell
dh = PastValue(outputDim, 1, output); // hidden state(t-1)
dc = PastValue(cellDim, 1, ct); // cell(t-1)
dh = PastValue(outputDim, output); // hidden state(t-1)
dc = PastValue(cellDim, ct); // cell(t-1)

// note: the W(inputx) here are all different; they each come with their own set of weights. Same for H(dh), C(dc), and B().
it = Sigmoid(W(inputx) + B() + H(dh) + C(dc)) // input gate(t)

@@ -27,6 +27,8 @@ Using parallel sequences (difference to above: nbruttsineachrecurrentiter=4). No

COMMAND: currentDirectory=$(SolutionDir)Tests\EndToEndTests\Speech\Data configFile=$(SolutionDir)Tests\EndToEndTests\Speech\LSTM\cntk.config stderr=$(SolutionDir)Tests\EndToEndTests\Speech\RunDir\LSTM\FullUtterance\models\cntkSpeech.dnn.log RunDir=$(SolutionDir)Tests\EndToEndTests\Speech\RunDir\LSTM\FullUtterance NdlDir=$(SolutionDir)Tests\EndToEndTests\Speech\LSTM DataDir=. DeviceId=auto Truncated=false speechTrain=[reader=[nbruttsineachrecurrentiter=4]] speechTrain=[SGD=[epochSize=2560]] speechTrain=[SGD=[learningRatesPerMB=0.125]] speechTrain=[SGD=[maxEpochs=2]] speechTrain=[SGD=[numMBsToShowResult=1]] makeMode=false

Linux: bin/cntk currentDirectory=Tests/EndToEndTests/Speech/Data configFile=../LSTM/cntk.config stderr=../RunDir/LSTM/Truncated/models/cntkSpeech.dnn.log RunDir=../RunDir/LSTM/Truncated NdlDir=../LSTM DataDir=. DeviceId=auto Truncated=false 'speechTrain=[reader=[nbruttsineachrecurrentiter=4]]' 'speechTrain=[SGD=[epochSize=2560]]' 'speechTrain=[SGD=[learningRatesPerMB=0.125]]' 'speechTrain=[SGD=[maxEpochs=2]]' 'speechTrain=[SGD=[numMBsToShowResult=1]]' makeMode=false

Using full BrainScript configuration

COMMAND: --cd $(SolutionDir)Tests\EndToEndTests\Speech\Data -f $(SolutionDir)Tests\EndToEndTests\Speech\LSTM\lstm.bs -D stderr='$(SolutionDir)Tests\EndToEndTests\Speech\RunDir\LSTM\FullUtterance\models\cntkSpeech.dnn.log' -D RunDir='$(SolutionDir)Tests\EndToEndTests\Speech\RunDir\LSTM\FullUtterance' -D NdlDir='$(SolutionDir)Tests\EndToEndTests\Speech\LSTM' -D DataDir='.' -D DeviceId='Auto' -D Truncated=false -D speechTrain=[reader=[nbruttsineachrecurrentiter=1];SGD=[epochSize=2560;maxEpochs=2;numMBsToShowResult=1]] -D makeMode=false

@@ -46,7 +48,7 @@ COMMAND: currentDirectory=$(SolutionDir)ExampleSetups\Image\MNIST configFil

--- Image/QuickE2E:

COMMAND: configFile=$(SolutionDir)Tests\EndToEndTests\Image\QuickE2E\cntk.config RunDir=$(SolutionDir)Tests\EndToEndTests\Image\_run DataDir=$(SolutionDir)Tests\EndToEndTests\Image\Data ConfigDir=$(SolutionDir)Tests\EndToEndTests\Image\QuickE2E stderr=$(SolutionDir)Tests\EndToEndTests\RunDir\Image\QuickE2E\models\cntkImage.dnn.log DeviceId=0 makeMode=false
COMMAND: configFile=$(SolutionDir)Tests\EndToEndTests\Image\QuickE2E\cntk.config RunDir=$(SolutionDir)Tests\EndToEndTests\Image\_run DataDir=$(SolutionDir)Tests\EndToEndTests\Image\Data ConfigDir=$(SolutionDir)Tests\EndToEndTests\Image\QuickE2E stderr=$(SolutionDir)Tests\EndToEndTests\RunDir\Image\QuickE2E\models\cntkImage.dnn.log DeviceId=0 useCuDnn=false makeMode=false

Simple test
-----------

@@ -24,14 +24,18 @@ namespace Microsoft { namespace MSR { namespace CNTK { namespace Test

static bool IsCuDnnSupported()
{
fprintf(stderr, "ConvolutionEngineTests.cpp %d\n", __LINE__);
try
{
return ConvFact::Create(0, ConvFact::EngineType::CuDnn) != nullptr;
// TODO: Will this ever return nullptr?
return ConvFact::Create(0, ConvFact::EngineType::CuDnn, ImageLayoutKind::CHW) != nullptr;
}
catch (const std::runtime_error&)
{
fprintf(stderr, "ConvolutionEngineTests.cpp %d\n", __LINE__);
return false;
}
fprintf(stderr, "ConvolutionEngineTests.cpp %d\n", __LINE__);
}

BOOST_AUTO_TEST_SUITE(ConvolutionSuite)

@@ -55,7 +59,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { namespace Test

for (int deviceId : { 0 })
{
auto fact = ConvFact::Create(deviceId);
// BUGBUG: These will fail depending on whether we built with cuDNN or not. Without cuDNN we should use HWC
auto fact = ConvFact::Create(deviceId, ConvFact::EngineType::Auto, ImageLayoutKind::CHW);
auto tt = typeid(fact).name();
UNUSED(tt);
auto eng = fact->CreateConvEngine(deviceId, 0);

@@ -128,14 +133,22 @@ namespace Microsoft { namespace MSR { namespace CNTK { namespace Test

for (int deviceId : { -1, 0 })
{
auto fact = ConvFact::Create(deviceId);
fprintf(stderr, "ConvolutionEngineTests.cpp %d\n", __LINE__);
auto fact = ConvFact::Create(deviceId, ConvFact::EngineType::Auto, deviceId >= 0 ? ImageLayoutKind::CHW : ImageLayoutKind::HWC);
fprintf(stderr, "ConvolutionEngineTests.cpp %d\n", __LINE__);
auto eng = fact->CreateConvEngine(deviceId, 0);
fprintf(stderr, "ConvolutionEngineTests.cpp %d\n", __LINE__);
auto inT = fact->CreateTensor(inW, inH, cmapIn, n);
fprintf(stderr, "ConvolutionEngineTests.cpp %d\n", __LINE__);
auto filtT = fact->CreateFilter(kW, kH, cmapIn, cmapOut);
fprintf(stderr, "ConvolutionEngineTests.cpp %d\n", __LINE__);
auto outT = fact->CreateTensor(outW, outH, cmapOut, n);
fprintf(stderr, "ConvolutionEngineTests.cpp %d\n", __LINE__);
auto convT = fact->CreateConvDescriptor(*inT, *filtT, sW, sH, pad);
fprintf(stderr, "ConvolutionEngineTests.cpp %d\n", __LINE__);

// Input in NCHW format.
fprintf(stderr, "ConvolutionEngineTests.cpp %d\n", __LINE__);
SingleMatrix in(inW * inH * cmapIn, n, vec(inW * inH * cmapIn * n, 1.0f).data(), matrixFlagNormal, deviceId);
// Create cmapOut filters, each kW x kH x cmapIn (NCHW format).
SingleMatrix filt(cmapOut, kW * kH * cmapIn, vec(kW * kH * cmapIn * cmapOut, 1.0f).data(), matrixFlagNormal, deviceId);

@@ -143,7 +156,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { namespace Test
SingleMatrix out(outW * outH * cmapOut, n, deviceId);
SingleMatrix temp(deviceId);

fprintf(stderr, "ConvolutionEngineTests.cpp %d\n", __LINE__);
eng->Forward(*inT, in, *filtT, filt, *convT, *outT, out, temp);
fprintf(stderr, "ConvolutionEngineTests.cpp %d\n", __LINE__);

// Output is in NCHW format.
float expBuf[] = {

@@ -175,7 +190,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { namespace Test

for (int deviceId : { 0 })
{
auto fact = ConvFact::Create(deviceId);
auto fact = ConvFact::Create(deviceId, ConvFact::EngineType::Auto, ImageLayoutKind::CHW);
auto eng = fact->CreateConvEngine(deviceId, 0);
auto srcGradT = fact->CreateTensor(outW, outH, cmapOut, n);
auto filtT = fact->CreateFilter(kW, kH, cmapIn, cmapOut);

@@ -231,7 +246,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { namespace Test

for (int deviceId : { 0 })
{
auto fact = ConvFact::Create(deviceId);
auto fact = ConvFact::Create(deviceId, ConvFact::EngineType::Auto, ImageLayoutKind::CHW);
auto eng = fact->CreateConvEngine(deviceId, 0);
auto srcGradT = fact->CreateTensor(outW, outH, cmapOut, n);
auto filtT = fact->CreateFilter(kW, kH, cmapIn, cmapOut);

@@ -296,7 +311,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { namespace Test

for (int deviceId : { 0 })
{
auto fact = ConvFact::Create(deviceId);
auto fact = ConvFact::Create(deviceId, ConvFact::EngineType::Auto, ImageLayoutKind::CHW);
auto eng = fact->CreatePoolEngine(deviceId);
auto inT = fact->CreateTensor(inW, inH, cmap, n);
auto outT = fact->CreateTensor(outW, outH, cmap, n);

@@ -346,7 +361,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { namespace Test

for (int deviceId : { 0 })
{
auto fact = ConvFact::Create(deviceId);
auto fact = ConvFact::Create(deviceId, ConvFact::EngineType::Auto, ImageLayoutKind::CHW);
auto eng = fact->CreatePoolEngine(deviceId);
auto inT = fact->CreateTensor(inW, inH, cmap, n);
auto outT = fact->CreateTensor(outW, outH, cmap, n);

@@ -406,7 +421,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { namespace Test

for (int deviceId : { 0 })
{
auto fact = ConvFact::Create(deviceId);
auto fact = ConvFact::Create(deviceId, ConvFact::EngineType::Auto, ImageLayoutKind::CHW);
auto eng = fact->CreatePoolEngine(deviceId);
auto inT = fact->CreateTensor(inW, inH, cmap, n);
auto outT = fact->CreateTensor(outW, outH, cmap, n);

@@ -456,7 +471,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { namespace Test

for (int deviceId : { 0 })
{
auto fact = ConvFact::Create(deviceId);
auto fact = ConvFact::Create(deviceId, ConvFact::EngineType::Auto, ImageLayoutKind::CHW);
auto eng = fact->CreatePoolEngine(deviceId);
auto inT = fact->CreateTensor(inW, inH, cmap, n);
auto outT = fact->CreateTensor(outW, outH, cmap, n);

@@ -535,6 +535,19 @@ namespace Microsoft
BOOST_CHECK(m1.IsEqualTo(m2));
}

#if 0 // Temporarily disabling
BOOST_FIXTURE_TEST_CASE(GPUMatrixLargeInequality, RandomSeedFixture)
{
const int rows = 33553921;
const int cols = 1;

auto m0 = GPUMatrix<float>::Zeros(rows, cols, c_deviceIdZero);
auto m1 = GPUMatrix<float>::Ones(rows, cols, c_deviceIdZero);

BOOST_CHECK(!m1.IsEqualTo(m0, c_epsilonFloatE5));
}
#endif

BOOST_AUTO_TEST_SUITE_END()
}
}

@@ -493,34 +493,22 @@ BOOST_FIXTURE_TEST_CASE(GPUSSparseMatrix1DConvolutionRandomInit, RandomSeedFixture
}
}

#if 0 // Temporarily disabling
BOOST_FIXTURE_TEST_CASE(GPUSSparseMatrixLargeIsEqual, RandomSeedFixture)
{
const int rows = 33553921;
const int cols = 1;

Matrix<float> m0 = Matrix<float>::Zeros(rows, cols, c_deviceIdZero);
Matrix<float> m1 = Matrix<float>::Ones(rows, cols, c_deviceIdZero);

BOOST_CHECK(!m1.IsEqualTo(m0, c_epsilonFloatE5));
}

BOOST_FIXTURE_TEST_CASE(GPUSSparseMatrix1DConvolutionBackprop, RandomSeedFixture)
{
const int inChannels = 2; // 50;
const int inWidth = 4; // 10;
const int inChannels = 50;
const int inWidth = 10;
const int inHeight = 1;
const int batchSize = 3; // 20;
const int kernelWidth = 2; // 3;
const int batchSize = 20;
const int kernelWidth = 3;
const int kernelHeight = inHeight;
const int horizontalSubsample = 1;
const int verticalSubsample = 1;
const bool zeroPadding = false;
const int outChannels = 2; // 3;
const int outWidth = zeroPadding ? inWidth : (inWidth >= kernelWidth ? 1 + (inWidth - kernelWidth) / horizontalSubsample : 0);
const int outChannels = 3;
const int outWidth = zeroPadding ? (inWidth / horizontalSubsample) : (inWidth >= kernelWidth ? 1 + (inWidth - kernelWidth) / horizontalSubsample : 0);
const int outHeight = inHeight;
const float randomInitLowerBound = 1.0f;
const float randomInitUpperBound = 5.0f;
const float randomInitLowerBound = -1.0f;
const float randomInitUpperBound = 1.0f;
Matrix<float> outputGradientSubBatch = Matrix<float>::RandomUniform(outChannels, batchSize*outWidth, randomInitLowerBound, randomInitUpperBound, IncrementCounter(), c_deviceIdZero);
Matrix<float> inputSubBatch = Matrix<float>::RandomUniform(inChannels*inWidth, batchSize, randomInitLowerBound, randomInitUpperBound, IncrementCounter(), c_deviceIdZero);
Matrix<float> tempMatrix(1, 1, c_deviceIdZero);
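// Sanity check of the outWidth formula above with these constants (zeroPadding == false):
// outWidth = 1 + (inWidth - kernelWidth) / horizontalSubsample = 1 + (10 - 3) / 1 = 8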

@@ -550,30 +538,8 @@ BOOST_FIXTURE_TEST_CASE(GPUSSparseMatrix1DConvolutionBackprop, RandomSeedFixture
Matrix<float>::ConvolveAndWeightedAdd(1, outputGradientSubBatchReordered, true, inputSubBatchSparseReordered, false, 1, inputGradientValues2, batchSize, horizontalSubsample, zeroPadding, false);
inputGradientValues2.Reshape(outChannels, inChannels*kernelWidth);

const int dim = outChannels*inChannels*kernelWidth;
float* base = inputGradientValues1.CopyToArray();
float baseA[dim];
fprintf(stderr, "[BASE]");
for (int i = 0; i < dim; i++)
{
baseA[i] = base[i];
fprintf(stderr, "%f ", baseA[i]);
}
fprintf(stderr, "\n");

float* exp = inputGradientValues2.CopyToArray();
float expA[dim];
fprintf(stderr, "[EXP]");
for (int i = 0; i < dim; i++)
{
expA[i] = exp[i];
fprintf(stderr, "%f ", expA[i]);
}
fprintf(stderr, "\n");

BOOST_CHECK(inputGradientValues2.IsEqualTo(inputGradientValues1, c_epsilonFloatE5));
BOOST_CHECK(inputGradientValues2.IsEqualTo(inputGradientValues1, c_epsilonFloatE2));
}
#endif

BOOST_FIXTURE_TEST_CASE(GPUSSparseMatrixReshape, RandomSeedFixture)
{

@@ -595,10 +561,10 @@ BOOST_FIXTURE_TEST_CASE(GPUSSparseMatrixReshape, RandomSeedFixture)
BOOST_CHECK(denseMatrixC.IsEqualTo(denseMatrixB, c_epsilonFloatE5));
BOOST_CHECK(!denseMatrixC.IsEqualTo(denseMatrixA, c_epsilonFloatE5));
}
#if 0

BOOST_FIXTURE_TEST_CASE(GPUSSparseTensorShuffleScaleAndAdd, RandomSeedFixture)
{
size_t D = 10, S = 10, M = 10, K = 10, T = 10;
size_t D = 13, S = 11, M = 7, K = 15, T = 8;
GPUMatrix<float> denseMatrixA = GPUMatrix<float>::RandomUniform(D * S * M * K, T, c_deviceIdZero, -1, 1, IncrementCounter());
GPUMatrix<float> denseMatrixB(D*S*M*K, T, c_deviceIdZero);
GPUMatrix<float> denseMatrixC(D*S*M*K, T, c_deviceIdZero);

@@ -612,7 +578,7 @@ BOOST_FIXTURE_TEST_CASE(GPUSSparseTensorShuffleScaleAndAdd, RandomSeedFixture)

BOOST_CHECK(denseMatrixC.IsEqualTo(denseMatrixB, c_epsilonFloatE5));
}
#endif

BOOST_AUTO_TEST_SUITE_END()
} } } }