CNTK v2 library: a) Convolution b) Mean-variance normalization c) Added CIFAR ResNet test

Amit Agarwal 2016-07-30 16:16:41 -07:00
Parent 965999b6de
Commit 0854708400
22 changed files with 1062 additions and 224 deletions

View file

@ -417,6 +417,7 @@ CNTKLIBRARY_TESTS_SRC =\
Tests/UnitTests/V2LibraryTests/RecurrentFunctionTests.cpp \
Tests/UnitTests/V2LibraryTests/TensorTests.cpp \
Tests/UnitTests/V2LibraryTests/TrainerTests.cpp \
Tests/UnitTests/V2LibraryTests/CifarResNet.cpp \
CNTKLIBRARY_TESTS:=$(BINDIR)/v2librarytests
CNTKLIBRARY_TESTS_OBJ := $(patsubst %.cu, $(OBJDIR)/%.o, $(patsubst %.cpp, $(OBJDIR)/%.o, $(CNTKLIBRARY_TESTS_SRC)))

View file

@ -784,14 +784,21 @@ namespace CNTK
///
/// Create an 'Input' Variable.
///
Variable(const NDShape& shape, CNTK::DataType dataType, const wchar_t* name = L"")
Variable(const NDShape& shape, CNTK::DataType dataType)
: Variable(shape, dataType, L"")
{}
///
/// Create an 'Input' Variable.
///
Variable(const NDShape& shape, CNTK::DataType dataType, const wchar_t* name)
: Variable(shape, dataType, std::wstring(name))
{}
///
/// Create an 'Input' Variable.
///
Variable(const NDShape& shape, CNTK::DataType dataType, const std::wstring& name = L"")
Variable(const NDShape& shape, CNTK::DataType dataType, const std::wstring& name)
: Variable(shape, VariableKind::Input, dataType, nullptr, nullptr, false, { Axis::DefaultDynamicAxis() }, false, name)
{}
@ -1488,7 +1495,7 @@ namespace CNTK
/// Create an instance of the CNTK built-in matrix multiplication operation with the specified input operands.
/// TODO: Specify the constraints on the shapes of the operands.
///
CNTK_API FunctionPtr Times(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name = L"");
CNTK_API FunctionPtr Times(const Variable& leftOperand, const Variable& rightOperand, size_t numOutputAxes = 1, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in operation to compute squared-error for specified input operands.
@ -1525,6 +1532,61 @@ namespace CNTK
///
CNTK_API FunctionPtr ReduceSum(const Variable& operand, const std::wstring& name = L"");
///
/// Per-dimension mean-variance normalization of the specified input operand.
///
CNTK_API FunctionPtr PerDimMeanVarianceNormalize(const Variable& operand, const NDArrayViewPtr& mean, const NDArrayViewPtr& invStdDev, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in convolution operation on the specified input operand, using the specified convolution map (filter weights).
///
CNTK_API FunctionPtr Convolution(const Variable& convolutionMap,
const Variable& operand,
const NDShape& strides = {1},
const std::vector<bool>& sharing = {true},
const std::vector<bool>& autoPadding = {true},
const NDShape& lowerPad = {0},
const NDShape& upperPad = {0},
bool transpose = false,
size_t maxTempMemSizeInSamples = 0,
const std::wstring& name = L"");
///
/// Enumeration type denoting the supported kinds of pooling.
///
enum class PoolingType
{
Max,
Average,
};
///
/// Create an instance of the CNTK built-in pooling operation on the specified input operand, using the specified pooling type and pooling window shape.
///
CNTK_API FunctionPtr Pooling(const Variable& operand,
PoolingType poolingType,
const NDShape& poolingWindowShape,
const NDShape& strides = {1},
const std::vector<bool>& autoPadding = {false},
const NDShape& lowerPad = {0},
const NDShape& upperPad = {0},
const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in batch normalization operation on the specified input operand, using the specified scale, bias and running mean/inverse-standard-deviation parameters.
///
CNTK_API FunctionPtr BatchNormalization(const Variable& operand,
const Variable& scale,
const Variable& bias,
const Variable& runningMean,
const Variable& runningInvStd,
bool spacial,
double normalizationTimeConstant = 0,
double blendTimeConstant = 0,
double epsilon = 0.00001,
bool useCuDNNEngine = false,
const std::wstring& name = L"");
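For orientation, here is a minimal usage sketch (not part of this commit) that combines the new ops into a convolution + batch-normalization + ReLU block. It assumes the Parameter/Constant constructors taking an NDArrayViewPtr, the ReLU op declared alongside the other unary ops in this header, the implicit FunctionPtr-to-Variable conversion for single-output Functions, and the convention from GetConvolutionOutputMapCountAndKernelShape further down that the convolution map's leading axis carries the output feature map count; shapes, strides and random initializers are purely illustrative.
// Sketch only, assuming 'using namespace CNTK;'
FunctionPtr ConvBatchNormReLULayer(Variable input, size_t kernelWidth, size_t kernelHeight,
                                   size_t inputChannels, size_t outFeatureMaps,
                                   const DeviceDescriptor& device)
{
    // Convolution map: leading axis = #output feature maps, trailing axes = kernel shape
    auto convParam = Parameter(NDArrayView::RandomNormal<float>({ outFeatureMaps, kernelWidth, kernelHeight, inputChannels }, 0.0, 0.05, 1, device));
    // Stride equal to 'inputChannels' along the channel axis so the channel dimension is fully reduced (assumed convention)
    auto conv = Convolution(convParam, input, { 1, 1, inputChannels });
    // Per-feature-map scale/bias and running statistics for batch normalization (random init for illustration only)
    auto scale = Parameter(NDArrayView::RandomNormal<float>({ outFeatureMaps }, 0.0, 0.05, 1, device));
    auto bias = Parameter(NDArrayView::RandomNormal<float>({ outFeatureMaps }, 0.0, 0.05, 1, device));
    auto runningMean = Constant(NDArrayView::RandomNormal<float>({ outFeatureMaps }, 0.0, 0.05, 1, device));
    auto runningInvStd = Constant(NDArrayView::RandomNormal<float>({ outFeatureMaps }, 0.0, 0.05, 1, device));
    // FunctionPtr -> Variable conversion for the single-output 'conv' is assumed here
    auto bn = BatchNormalization(conv, scale, bias, runningMean, runningInvStd, /*spacial =*/ true);
    return ReLU(bn); // ReLU is assumed to be declared elsewhere in this header
}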
///
/// Create a new Function instance which just combines the outputs of the specified list of 'operands' Functions such that the 'Outputs' of the
/// new 'Function' are the union of the 'Outputs' of each of the specified 'operands' Functions.
@ -1629,9 +1691,9 @@ namespace CNTK
DictionaryValue(const T& value) : m_valueType(GetValueType<T>())
{
static_assert(std::is_same<T, NDShape>::value ||
std::is_same<T, std::wstring>::value ||
std::is_same<T, std::vector<DictionaryValue>>::value ||
std::is_same<T, Dictionary>::value,
std::is_same<T, std::wstring>::value ||
std::is_same<T, std::vector<DictionaryValue>>::value ||
std::is_same<T, Dictionary>::value,
"Unsupported ValueType");
AllocateDataPtr(value);
@ -1730,10 +1792,10 @@ namespace CNTK
std::is_same<T, size_t>::value ||
std::is_same<T, float>::value ||
std::is_same<T, double>::value ||
std::is_same<T, std::wstring>::value ||
std::is_same<T, std::wstring>::value ||
std::is_same<T, NDShape>::value ||
std::is_same<T, std::vector<DictionaryValue>>::value ||
std::is_same<T, Dictionary>::value,
std::is_same<T, std::vector<DictionaryValue>>::value ||
std::is_same<T, Dictionary>::value,
"Unsupported ValueType");
if (std::is_same<T, bool>::value) return Type::Bool;
@ -1973,7 +2035,11 @@ namespace CNTK
inline bool operator==(const StreamInfo& left, const StreamInfo& right)
{
return (left.m_id == right.m_id);
return ((left.m_id == right.m_id) &&
(left.m_name == right.m_name) &&
(left.m_storageFormat == right.m_storageFormat) &&
(left.m_elementType == right.m_elementType) &&
(left.m_sampleLayout == right.m_sampleLayout));
}
}
@ -1989,6 +2055,13 @@ namespace std {
namespace CNTK
{
struct MinibatchData
{
size_t m_numSequences;
size_t m_numSamples;
ValuePtr m_data;
};
///
/// Abstraction for generating minibatches of samples for training/evaluation.
///
@ -2002,10 +2075,14 @@ namespace CNTK
///
/// Reads a minibatch that contains data across all input streams.
/// The minibatchData argument specifies the desired minibatch size for each stream of the reader and the actual returned size is the min across all streams.
/// A return value of false indicates that the reader will not return any further data.
/// The perStreamMBSizeLimits argument specifies the desired minibatch size for each stream of the reader, either in terms of #sequences or
/// #samples or both. If the size is specified in terms of both #sequences and #samples, the smaller of the two is used. The actual
/// returned size of the minibatch is the min across all streams. The returned map contains, for each requested stream, the actual
/// #sequences and #samples in the returned minibatch along with the minibatch data itself.
/// An empty return value indicates that the MinibatchSource will not return any further data in subsequent calls of this function.
///
virtual bool GetNextMinibatch(std::unordered_map<StreamInfo, std::pair<size_t, ValuePtr>>& minibatchData) = 0;
virtual std::unordered_map<StreamInfo, MinibatchData> GetNextMinibatch(const std::unordered_map<StreamInfo, std::pair<size_t, size_t>>& perStreamMBSizeLimits,
const DeviceDescriptor& device = DeviceDescriptor::DefaultDevice()) = 0;
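A hedged usage sketch of the new reading API follows; the 256-sample limit and the surrounding loop are illustrative only, and 'minibatchSource' is assumed to be an already-created MinibatchSourcePtr.
// Sketch only: request up to 256 samples per stream and per call (0 = no limit on #sequences)
std::unordered_map<StreamInfo, std::pair<size_t, size_t>> perStreamMBSizeLimits;
for (const auto& streamInfo : minibatchSource->StreamInfos())
    perStreamMBSizeLimits.insert({ streamInfo, std::make_pair((size_t)0, (size_t)256) });
for (;;)
{
    auto minibatchData = minibatchSource->GetNextMinibatch(perStreamMBSizeLimits);
    if (minibatchData.empty())
        break; // the source has no further data to return
    for (const auto& streamDataPair : minibatchData)
    {
        const MinibatchData& data = streamDataPair.second;
        // data.m_numSequences, data.m_numSamples and data.m_data (a ValuePtr) are available per stream here
    }
}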
// TODO: Methods to save and restore from checkpoints
@ -2020,4 +2097,11 @@ namespace CNTK
/// Instantiate the CNTK built-in composite minibatch source.
///
CNTK_API MinibatchSourcePtr CreateCompositeMinibatchSource(const Dictionary& configuration);
///
/// Compute the per-dimension means and inverse standard deviations for each of the specified streams using data from the specified minibatchSource.
///
CNTK_API void ComputeInputPerDimMeansAndInvStdDevs(const MinibatchSourcePtr& minibatchSource,
std::unordered_map<StreamInfo, std::pair<NDArrayViewPtr, NDArrayViewPtr>>& computedMeanAndInvStdDevs,
const DeviceDescriptor& device = DeviceDescriptor::CPUDevice());
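And a companion sketch showing how the computed statistics can feed PerDimMeanVarianceNormalize declared above; the stream name L"features" and the 'featuresVariable' input Variable are assumptions for illustration.
// Sketch only: compute mean/invStdDev for one stream of an existing minibatchSource
std::unordered_map<StreamInfo, std::pair<NDArrayViewPtr, NDArrayViewPtr>> meansAndInvStdDevs;
for (const auto& streamInfo : minibatchSource->StreamInfos())
    if (streamInfo.m_name == L"features")
        meansAndInvStdDevs.insert({ streamInfo, { nullptr, nullptr } });
ComputeInputPerDimMeansAndInvStdDevs(minibatchSource, meansAndInvStdDevs);
auto& stats = meansAndInvStdDevs.begin()->second;
auto normalizedFeatures = PerDimMeanVarianceNormalize(featuresVariable, stats.first /*mean*/, stats.second /*invStdDev*/);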
}

View file

@ -71,10 +71,14 @@ namespace CNTK
}
else if (node->Is<LearnableParameter<ElementType>>())
{
bool isConstant = (node->GetLearningRateMultiplier() == 0);
auto& matrix = node->As<ComputationNode<ElementType>>()->Value();
auto tensorView = new TensorView<ElementType>(std::make_shared<Matrix<ElementType>>(matrix.AsReference()), node->GetSampleLayout());
NDArrayViewPtr parameterValue = MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), AsDeviceDescriptor(matrix.GetDeviceId()), AsStorageFormat(matrix.GetFormat()), varShape, false, tensorView);
var = Parameter(parameterValue, node->GetName());
if (isConstant)
var = Constant(parameterValue, node->GetName());
else
var = Parameter(parameterValue, node->GetName());
}
else
LogicError("CNTK::LoadLegacyModel: Unsupported legacy CNTK node named '%S'", node->NodeName().c_str());
@ -95,16 +99,51 @@ namespace CNTK
PrimitiveOpType opType;
Dictionary primitiveFunctionConfigParameters;
if (node->OperationName() == OperationNameOf(TanhNode))
opType = PrimitiveOpType::Tanh;
if (node->OperationName() == OperationNameOf(NegateNode))
opType = PrimitiveOpType::Negate;
else if (node->OperationName() == OperationNameOf(SigmoidNode))
opType = PrimitiveOpType::Sigmoid;
else if (node->OperationName() == OperationNameOf(TanhNode))
opType = PrimitiveOpType::Tanh;
else if (node->OperationName() == OperationNameOf(RectifiedLinearNode))
opType = PrimitiveOpType::ReLU;
else if (node->OperationName() == OperationNameOf(ExpNode))
opType = PrimitiveOpType::Exp;
else if (node->OperationName() == OperationNameOf(TimesNode))
opType = PrimitiveOpType::Times;
else if (node->OperationName() == OperationNameOf(LogNode))
opType = PrimitiveOpType::Log;
else if (node->OperationName() == OperationNameOf(SqrtNode))
opType = PrimitiveOpType::Sqrt;
else if (node->OperationName() == OperationNameOf(FloorNode))
opType = PrimitiveOpType::Floor;
else if (node->OperationName() == OperationNameOf(AbsNode))
opType = PrimitiveOpType::Abs;
else if (node->OperationName() == OperationNameOf(ReciprocalNode))
opType = PrimitiveOpType::Reciprocal;
else if (node->OperationName() == OperationNameOf(SoftmaxNode))
opType = PrimitiveOpType::Softmax;
else if (node->OperationName() == OperationNameOf(PlusNode))
opType = PrimitiveOpType::Plus;
else if (node->OperationName() == OperationNameOf(MinusNode))
opType = PrimitiveOpType::Minus;
else if (node->OperationName() == OperationNameOf(ElementTimesNode))
opType = PrimitiveOpType::ElementTimes;
else if (node->OperationName() == OperationNameOf(EqualNode))
opType = PrimitiveOpType::Equal;
else if (node->OperationName() == OperationNameOf(NotEqualNode))
opType = PrimitiveOpType::NotEqual;
else if (node->OperationName() == OperationNameOf(LessNode))
opType = PrimitiveOpType::Less;
else if (node->OperationName() == OperationNameOf(LessEqualNode))
opType = PrimitiveOpType::LessEqual;
else if (node->OperationName() == OperationNameOf(GreaterNode))
opType = PrimitiveOpType::Greater;
else if (node->OperationName() == OperationNameOf(GreaterEqualNode))
opType = PrimitiveOpType::GreaterEqual;
else if (node->OperationName() == OperationNameOf(TimesNode))
{
primitiveFunctionConfigParameters[L"numOutputAxes"] = DictionaryValue((size_t)node->As<TimesNode<ElementType>>()->OutputRank());
opType = PrimitiveOpType::Times;
}
else if (node->OperationName() == OperationNameOf(PastValueNode))
{
if (inputVars.size() == 1)
@ -125,6 +164,8 @@ namespace CNTK
primitiveFunctionConfigParameters[L"stepSize"] = DictionaryValue((size_t)node->As<FutureValueNode<ElementType>>()->TimeStep());
opType = PrimitiveOpType::FutureValue;
}
else if (node->OperationName() == OperationNameOf(SquareErrorNode))
opType = PrimitiveOpType::SquaredError;
else if (node->OperationName() == OperationNameOf(CrossEntropyWithSoftmaxNode))
{
std::swap(inputVars[0], inputVars[1]);
@ -135,10 +176,44 @@ namespace CNTK
std::swap(inputVars[0], inputVars[1]);
opType = PrimitiveOpType::ClassificationError;
}
else if (node->OperationName() == OperationNameOf(ElementTimesNode))
opType = PrimitiveOpType::ElementTimes;
else if (node->OperationName() == OperationNameOf(SumElementsNode))
opType = PrimitiveOpType::ReduceSum;
else if (node->OperationName() == OperationNameOf(ConvolutionNode))
{
auto convolutionNode = node->As<ConvolutionNode<ElementType>>();
primitiveFunctionConfigParameters[L"strides"] = AsNDShape(convolutionNode->Strides());
primitiveFunctionConfigParameters[L"sharing"] = AsDictionaryValueVector(convolutionNode->Sharing());
primitiveFunctionConfigParameters[L"autoPadding"] = AsDictionaryValueVector(convolutionNode->AutoPad());
primitiveFunctionConfigParameters[L"lowerPad"] = AsNDShape(convolutionNode->LowerPad());
primitiveFunctionConfigParameters[L"upperPad"] = AsNDShape(convolutionNode->UpperPad());
primitiveFunctionConfigParameters[L"transpose"] = convolutionNode->Transpose();
primitiveFunctionConfigParameters[L"maxTempMemSizeInSamples"] = convolutionNode->MaxTempMemSizeInSamples();
opType = PrimitiveOpType::Convolution;
}
else if (node->OperationName() == OperationNameOf(PoolingNode))
{
auto poolingNode = node->As<PoolingNode<ElementType>>();
primitiveFunctionConfigParameters[L"poolingType"] = (size_t)(AsPoolingType(poolingNode->PoolingKind()));
primitiveFunctionConfigParameters[L"poolingWindowShape"] = AsNDShape(poolingNode->KernelShape());
primitiveFunctionConfigParameters[L"strides"] = AsNDShape(poolingNode->Strides());
primitiveFunctionConfigParameters[L"autoPadding"] = AsDictionaryValueVector(poolingNode->AutoPad());
primitiveFunctionConfigParameters[L"lowerPad"] = AsNDShape(poolingNode->LowerPad());
primitiveFunctionConfigParameters[L"upperPad"] = AsNDShape(poolingNode->UpperPad());
opType = PrimitiveOpType::Pooling;
}
else if (node->OperationName() == OperationNameOf(BatchNormalizationNode))
{
auto batchNormalizationNode = node->As<BatchNormalizationNode<ElementType>>();
primitiveFunctionConfigParameters[L"spacial"] = batchNormalizationNode->Spatial();
primitiveFunctionConfigParameters[L"normalizationTimeConstant"] = batchNormalizationNode->NormalizationTimeConstant();
primitiveFunctionConfigParameters[L"blendTimeConstant"] = batchNormalizationNode->BlendTimeConstant();
primitiveFunctionConfigParameters[L"epsilon"] = batchNormalizationNode->Epsilon();
primitiveFunctionConfigParameters[L"useCuDNNEngine"] = !batchNormalizationNode->UseCNTKEngine();
opType = PrimitiveOpType::BatchNormalization;
}
else
LogicError("Unsupported ComputationNode with OperationName='%S' found when loading legacy CNTK model", node->OperationName().c_str());

View file

@ -170,6 +170,7 @@ namespace CNTK
if (dynamic_cast<PrimitiveFunction*>(function))
{
PrimitiveFunction* primitiveFunction = dynamic_cast<PrimitiveFunction*>(function);
auto functionConfig = primitiveFunction->FunctionConfig();
// Create the nodes corresponding to the inputs
auto functionInputs = primitiveFunction->Inputs();
@ -222,6 +223,17 @@ namespace CNTK
computationNodePtr = builder.Softmax(input0Node, function->Name());
break;
case PrimitiveOpType::Pooling:
{
PoolingType poolingType = (PoolingType)(functionConfig[L"poolingType"].GetValue<size_t>());
auto poolingWindowsShape = functionConfig[L"poolingWindowShape"].GetValue<NDShape>();
auto strides = functionConfig[L"strides"].GetValue<NDShape>();
auto lowerPad = functionConfig[L"lowerPad"].GetValue<NDShape>();
auto upperPad = functionConfig[L"upperPad"].GetValue<NDShape>();
auto autoPadding = AsBasicElementTypeVector<bool>(functionConfig[L"autoPadding"].GetValue<std::vector<DictionaryValue>>());
computationNodePtr = builder.Pooling(input0Node, AsCNTKPoolKind(poolingType), AsTensorShape(poolingWindowsShape, true), AsTensorShape(strides, true), autoPadding, AsTensorShape(lowerPad, true), AsTensorShape(upperPad, true), ImageLayoutKind::CHW, function->Name());
break;
}
case PrimitiveOpType::Plus:
computationNodePtr = builder.Plus(input0Node, input1Node, function->Name());
break;
@ -250,9 +262,25 @@ namespace CNTK
computationNodePtr = builder.GreaterEqual(input0Node, input1Node, function->Name());
break;
case PrimitiveOpType::Times:
// TODO: The output rank of the times operation is currently hardcoded to 1
computationNodePtr = builder.Times(input0Node, input1Node, 1, function->Name());
{
size_t numOutputAxes = functionConfig[L"numOutputAxes"].GetValue<size_t>();
computationNodePtr = builder.Times(input0Node, input1Node, numOutputAxes, function->Name());
break;
}
case PrimitiveOpType::Convolution:
{
NDShape outputMapCount, kernelShape;
std::tie(outputMapCount, kernelShape) = GetConvolutionOutputMapCountAndKernelShape(functionInputs[0].Shape(), functionInputs[1].Shape());
auto strides = functionConfig[L"strides"].GetValue<NDShape>();
auto lowerPad = functionConfig[L"lowerPad"].GetValue<NDShape>();
auto upperPad = functionConfig[L"upperPad"].GetValue<NDShape>();
auto sharing = AsBasicElementTypeVector<bool>(functionConfig[L"sharing"].GetValue<std::vector<DictionaryValue>>());
auto autoPadding = AsBasicElementTypeVector<bool>(functionConfig[L"autoPadding"].GetValue<std::vector<DictionaryValue>>());
auto transpose = functionConfig[L"transpose"].GetValue<bool>();
auto maxTempMemSizeInSamples = functionConfig[L"maxTempMemSizeInSamples"].GetValue<size_t>();
computationNodePtr = builder.Convolution(input0Node, input1Node, AsTensorShape(kernelShape, true), AsTensorShape(outputMapCount, true), AsTensorShape(strides, true), sharing, autoPadding, AsTensorShape(lowerPad, true), AsTensorShape(upperPad, true), transpose, ImageLayoutKind::CHW, maxTempMemSizeInSamples, function->Name());
break;
}
case PrimitiveOpType::SquaredError:
computationNodePtr = builder.SquareError(input0Node, input1Node, function->Name());
break;
@ -298,6 +326,23 @@ namespace CNTK
computationNodePtr = builder.Sum(input0Node, function->Name());
break;
}
case PrimitiveOpType::BatchNormalization:
{
auto spacial = functionConfig[L"spacial"].GetValue<bool>();
auto normalizationTimeConstant = functionConfig[L"normalizationTimeConstant"].GetValue<double>();
auto blendTimeConstant = functionConfig[L"blendTimeConstant"].GetValue<double>();
auto epsilon = functionConfig[L"epsilon"].GetValue<double>();
auto useCuDNNEngine = functionConfig[L"useCuDNNEngine"].GetValue<bool>();
std::vector<std::shared_ptr<ComputationNode<ElementType>>> inputNodes;
for (auto inputVar : functionInputs)
{
auto baseNodePtr = GetNode(inputVar, network, builder, variableToNodeMap, isVariableRootMap);
inputNodes.push_back((baseNodePtr != nullptr) ? baseNodePtr->template As<ComputationNode<ElementType>>()->shared_from_this() : nullptr);
}
computationNodePtr = builder.BatchNormalization(inputNodes[0], inputNodes[1], inputNodes[2], inputNodes[3], inputNodes[4], spacial, normalizationTimeConstant, blendTimeConstant, epsilon, !useCuDNNEngine, ImageLayoutKind::CHW, function->Name());
break;
}
case PrimitiveOpType::Combine:
// This operation is just a no-op and is a means to combine multiple functions to create a single Function
// whose outputs are the union of the outputs of the Functions being combined.
@ -408,7 +453,7 @@ namespace CNTK
auto outputShape = outputVar.Shape();
auto computationNodeSampleLayout = computationNodePtr->GetSampleLayout();
if (((outputShape.NumAxes() == 0) && (computationNodeSampleLayout[0] != 1)) ||
((outputShape.NumAxes() != 0) && (computationNodeSampleLayout != AsTensorShape(outputShape))))
((outputShape.NumAxes() != 0) && (computationNodeSampleLayout != AsTensorShape(outputShape)) && (computationNodeSampleLayout != AsTensorShape(outputShape, true))))
{
LogicError("The output Variable shape %s does not match the SampleLayout shape %s of the corresponding ComputationNode in the network", AsString(outputShape).c_str(), ((std::string)computationNodeSampleLayout).c_str());
}
@ -739,45 +784,48 @@ namespace CNTK
return NDShape(outputShapeDims);
}
/*static*/ void CompositeFunction::GetNodeOutputOrGradient(Variable var, ValuePtr& varValue, Microsoft::MSR::CNTK::ComputationNodeBasePtr& computationNode, bool getGradient)
{
auto valueShape = GetValueShape(var, computationNode);
if (varValue != nullptr)
{
// TODO: The shape of the specified output Value object must match the actual output shape
if (varValue->Data()->Shape() != valueShape)
InvalidArgument("The shape %s of the specified Value object for %s does not match the actual shape %s", AsString(varValue->Data()->Shape()).c_str(), getGradient ? "gradient" : "output", AsString(valueShape).c_str());
}
ValuePtr nodeValue;
switch (var.GetDataType())
{
case DataType::Float:
nodeValue = GetValueObjectFromCNTKImplMatrixAndMBLayout<float>(var,
getGradient ? computationNode->As<ComputationNode<float>>()->Gradient() : computationNode->As<ComputationNode<float>>()->Value(),
computationNode->GetMBLayout());
break;
case DataType::Double:
nodeValue = GetValueObjectFromCNTKImplMatrixAndMBLayout<double>(var,
getGradient ? computationNode->As<ComputationNode<double>>()->Gradient() : computationNode->As<ComputationNode<double>>()->Value(),
computationNode->GetMBLayout());
break;
default:
LogicError("Unsupported DataType %s", DataTypeName(var.GetDataType()));
break;
}
if (varValue == nullptr)
{
auto data = MakeSharedObject<NDArrayView>(var.GetDataType(), valueShape, AsDeviceDescriptor(computationNode->ValuePtr()->GetDeviceId()));
auto mask = (nodeValue->Mask() != nullptr) ? MakeSharedObject<NDMask>(nodeValue->Mask()->Shape(), nodeValue->Mask()->Device()) : nullptr;
varValue = MakeSharedObject<Value>(data, mask);
}
varValue->CopyFrom(*nodeValue);
}
void CompositeFunction::GetNetworkOutputs(std::unordered_map<Variable, ValuePtr>& outputs)
{
// Now copy the Forward values of output nodes from the network to outputs' Value objects
for (auto outputVarValuePair : outputs)
{
auto computationNodePtr = m_variableToNodeMap[outputVarValuePair.first];
auto outputValuePtr = outputVarValuePair.second;
auto outputShape = GetValueShape(outputVarValuePair.first, computationNodePtr);
if (outputValuePtr != nullptr)
{
// TODO: The shape of the specified output Value object must match the actual output shape
if (outputValuePtr->Data()->Shape() != outputShape)
InvalidArgument("The shape %s of the specified Value object for output does not match the actual output shape %s", AsString(outputValuePtr->Data()->Shape()).c_str(), AsString(outputShape).c_str());
}
ValuePtr nodeValue;
switch (outputVarValuePair.first.GetDataType())
{
case DataType::Float:
nodeValue = GetValueObjectFromCNTKImplMatrixAndMBLayout<float>(outputVarValuePair.first, computationNodePtr->As<ComputationNode<float>>()->Value(), computationNodePtr->GetMBLayout());
break;
case DataType::Double:
nodeValue = GetValueObjectFromCNTKImplMatrixAndMBLayout<double>(outputVarValuePair.first, computationNodePtr->As<ComputationNode<double>>()->Value(), computationNodePtr->GetMBLayout());
break;
default:
LogicError("Unsupported DataType %s", DataTypeName(outputVarValuePair.first.GetDataType()));
break;
}
if (outputValuePtr == nullptr)
{
auto data = MakeSharedObject<NDArrayView>(outputVarValuePair.first.GetDataType(), outputShape, AsDeviceDescriptor(computationNodePtr->ValuePtr()->GetDeviceId()));
auto mask = (nodeValue->Mask() != nullptr) ? MakeSharedObject<NDMask>(nodeValue->Mask()->Shape(), nodeValue->Mask()->Device()) : nullptr;
outputValuePtr = MakeSharedObject<Value>(data, mask);
}
outputValuePtr->CopyFrom(*nodeValue);
outputs[outputVarValuePair.first] = outputValuePtr;
}
GetNodeOutputOrGradient(outputVarValuePair.first, outputs[outputVarValuePair.first], m_variableToNodeMap[outputVarValuePair.first], false /*getGradient*/);
}
void CompositeFunction::GetNetworkGradients(std::unordered_map<Variable, ValuePtr>& gradients)
@ -795,42 +843,11 @@ namespace CNTK
InvalidArgument("Gradient value incorrectly requested for an Output or Constant Variable, or an Input Variable with NeedsGradient setting of false");
auto computationNodePtr = m_variableToNodeMap[gradientVarValuePair.first];
auto gradientValuePtr = gradientVarValuePair.second;
auto gradientShape = GetValueShape(gradientVarValuePair.first, computationNodePtr);
if (gradientValuePtr != nullptr)
{
// TODO: The shape of the specified output Value object must match the actual output shape
if (gradientValuePtr->Data()->Shape() != gradientShape)
InvalidArgument("The shape %s of the specified Value object for gradient does not match the actual gradient shape %s", AsString(gradientValuePtr->Data()->Shape()).c_str(), AsString(gradientShape).c_str());
}
if (!computationNodePtr->NeedsGradient())
LogicError("Backpropagated gradient value cannot be read from a ComputationNode that has NeedsGradient set to false");
ValuePtr nodeValue;
switch (gradientVarValuePair.first.GetDataType())
{
case DataType::Float:
nodeValue = GetValueObjectFromCNTKImplMatrixAndMBLayout<float>(gradientVarValuePair.first, computationNodePtr->As<ComputationNode<float>>()->Gradient(), computationNodePtr->GetMBLayout());
break;
case DataType::Double:
nodeValue = GetValueObjectFromCNTKImplMatrixAndMBLayout<double>(gradientVarValuePair.first, computationNodePtr->As<ComputationNode<double>>()->Gradient(), computationNodePtr->GetMBLayout());
break;
default:
LogicError("Unsupported DataType %s", DataTypeName(gradientVarValuePair.first.GetDataType()));
break;
}
if (gradientValuePtr == nullptr)
{
auto data = MakeSharedObject<NDArrayView>(gradientVarValuePair.first.GetDataType(), gradientShape, AsDeviceDescriptor(computationNodePtr->ValuePtr()->GetDeviceId()));
auto mask = (nodeValue->Mask() != nullptr) ? MakeSharedObject<NDMask>(nodeValue->Mask()->Shape(), nodeValue->Mask()->Device()) : nullptr;
gradientValuePtr = MakeSharedObject<Value>(data, mask);
}
gradientValuePtr->CopyFrom(*nodeValue);
gradients[gradientVarValuePair.first] = gradientValuePtr;
GetNodeOutputOrGradient(gradientVarValuePair.first, gradients[gradientVarValuePair.first], computationNodePtr, true /*getGradient*/);
}
}
@ -872,6 +889,8 @@ namespace CNTK
outputsToEvaluate.push_back(m_variableToNodeMap[rootVarForBackprop]);
}
ScopedNetworkOperationMode modeGuard(m_computationNetwork, outputsToRetainBackwardStateFor.empty() ? NetworkOperationMode::inferring : NetworkOperationMode::training);
m_computationNetwork->ForwardProp(outputsToEvaluate);
GetNetworkOutputs(outputs);
@ -907,6 +926,8 @@ namespace CNTK
PopulateNetworkGradients(rootGradientValues);
// Backpropagate through the network
ScopedNetworkOperationMode modeGuard(m_computationNetwork, NetworkOperationMode::training);
auto rootComputationNodePtr = m_variableToNodeMap[rootGradientValues.begin()->first];
m_computationNetwork->GetNestedNetwork(rootComputationNodePtr)->Backprop(FrameRange(nullptr), true, true);
@ -1045,9 +1066,11 @@ namespace CNTK
return BinaryOp(PrimitiveOpType::GreaterEqual, leftOperand, rightOperand, Dictionary(), name);
}
FunctionPtr Times(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name/* = L""*/)
FunctionPtr Times(const Variable& leftOperand, const Variable& rightOperand, size_t numOutputAxes /*= 1*/, const std::wstring& name/* = L""*/)
{
return BinaryOp(PrimitiveOpType::Times, leftOperand, rightOperand, Dictionary(), name);
auto additionalProperties = Dictionary();
additionalProperties[L"numOutputAxes"] = numOutputAxes;
return BinaryOp(PrimitiveOpType::Times, leftOperand, rightOperand, std::move(additionalProperties), name);
}
FunctionPtr SquaredError(const Variable& prediction, const Variable& targets, const std::wstring& name/* = L""*/)
@ -1090,6 +1113,83 @@ namespace CNTK
return UnaryOp(PrimitiveOpType::ReduceSum, operand, Dictionary(), name);
}
FunctionPtr PerDimMeanVarianceNormalize(const Variable& operand, const NDArrayViewPtr& mean, const NDArrayViewPtr& invStdDev, const std::wstring& name /*= L""*/)
{
Constant meanVar(mean);
Constant invStdDevVar(invStdDev);
return ElementTimes(Minus(operand, meanVar), invStdDevVar);
}
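In other words, PerDimMeanVarianceNormalize is just the composition (operand - mean) elementwise-multiplied by invStdDev, with the statistics wrapped as Constants; e.g. a feature value of 5 in a dimension with mean 3 and invStdDev 0.5 is mapped to (5 - 3) * 0.5 = 1.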
FunctionPtr Convolution(const Variable& convolutionMap,
const Variable& operand,
const NDShape& strides,
const std::vector<bool>& sharing,
const std::vector<bool>& autoPadding,
const NDShape& lowerPad,
const NDShape& upperPad,
bool transpose,
size_t maxTempMemSizeInSamples,
const std::wstring& name)
{
auto additionalProperties = Dictionary();
additionalProperties[L"strides"] = strides;
additionalProperties[L"sharing"] = AsDictionaryValueVector(sharing);
additionalProperties[L"autoPadding"] = AsDictionaryValueVector(autoPadding);
additionalProperties[L"lowerPad"] = lowerPad;
additionalProperties[L"upperPad"] = upperPad;
additionalProperties[L"transpose"] = transpose;
additionalProperties[L"maxTempMemSizeInSamples"] = maxTempMemSizeInSamples;
return BinaryOp(PrimitiveOpType::Convolution, convolutionMap, operand, std::move(additionalProperties), name);
}
FunctionPtr Pooling(const Variable& operand,
PoolingType poolingType,
const NDShape& poolingWindowShape,
const NDShape& strides,
const std::vector<bool>& autoPadding,
const NDShape& lowerPad,
const NDShape& upperPad,
const std::wstring& name)
{
auto additionalProperties = Dictionary();
additionalProperties[L"poolingType"] = (size_t)poolingType;
additionalProperties[L"poolingWindowShape"] = poolingWindowShape;
additionalProperties[L"strides"] = strides;
additionalProperties[L"autoPadding"] = AsDictionaryValueVector(autoPadding);
additionalProperties[L"lowerPad"] = lowerPad;
additionalProperties[L"upperPad"] = upperPad;
return UnaryOp(PrimitiveOpType::Pooling, operand, std::move(additionalProperties), name);
}
FunctionPtr BatchNormalization(const Variable& operand,
const Variable& scale,
const Variable& bias,
const Variable& runningMean,
const Variable& runningInvStd,
bool spacial,
double normalizationTimeConstant,
double blendTimeConstant,
double epsilon,
bool useCuDNNEngine,
const std::wstring& name)
{
auto additionalProperties = Dictionary();
additionalProperties[L"spacial"] = spacial;
additionalProperties[L"normalizationTimeConstant"] = normalizationTimeConstant;
additionalProperties[L"blendTimeConstant"] = blendTimeConstant;
additionalProperties[L"epsilon"] = epsilon;
additionalProperties[L"useCuDNNEngine"] = useCuDNNEngine;
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::BatchNormalization,
std::vector<Variable>({ operand, scale, bias, runningMean, runningInvStd }),
std::move(additionalProperties),
name),
name);
}
FunctionPtr Combine(const std::vector<FunctionPtr>& operands, const std::wstring& name/* = L""*/)
{
std::unordered_set<FunctionPtr> uniqueOperands;

View file

@ -10,6 +10,7 @@
#include <iterator>
#include "ComputationNetwork.h"
#include "Utils.h"
#include "ConvolveGeometry.h"
namespace CNTK
{
@ -26,6 +27,7 @@ namespace CNTK
Abs,
Reciprocal,
Softmax,
Pooling,
Plus,
Minus,
ElementTimes,
@ -36,12 +38,14 @@ namespace CNTK
Greater,
GreaterEqual,
Times,
Convolution,
SquaredError,
CrossEntropyWithSoftmax,
ClassificationError,
PastValue,
FutureValue,
ReduceSum,
BatchNormalization,
Combine,
};
}
@ -73,6 +77,7 @@ namespace CNTK
{ PrimitiveOpType::Abs, "Abs" },
{ PrimitiveOpType::Reciprocal, "Reciprocal" },
{ PrimitiveOpType::Softmax, "Softmax" },
{ PrimitiveOpType::Pooling, "Pooling" },
{ PrimitiveOpType::Plus, "Plus" },
{ PrimitiveOpType::Minus, "Minus" },
{ PrimitiveOpType::ElementTimes, "ElementTimes" },
@ -83,12 +88,14 @@ namespace CNTK
{ PrimitiveOpType::Greater, "Greater" },
{ PrimitiveOpType::GreaterEqual, "GreaterEqual" },
{ PrimitiveOpType::Times, "Times" },
{ PrimitiveOpType::Convolution, "Convolution" },
{ PrimitiveOpType::SquaredError, "SquaredError" },
{ PrimitiveOpType::CrossEntropyWithSoftmax, "CrossEntropyWithSoftmax" },
{ PrimitiveOpType::ClassificationError, "ClassificationError" },
{ PrimitiveOpType::PastValue, "PastValue" },
{ PrimitiveOpType::FutureValue, "FutureValue" },
{ PrimitiveOpType::ReduceSum, "ReduceSum" },
{ PrimitiveOpType::BatchNormalization, "BatchNormalization" },
{ PrimitiveOpType::Combine, "Combine" }
};
@ -102,7 +109,7 @@ namespace CNTK
{
public:
PrimitiveFunction(PrimitiveOpType op, const std::vector<Variable>& inputs, Dictionary&& functionConfig, const std::wstring& functionName = L"")
: Function(inputs, GetOutputVariables(op, inputs, this), nullptr, functionName), m_op(op), m_functionConfig(std::move(functionConfig))
: Function(inputs, GetOutputVariables(op, inputs, this, functionConfig), nullptr, functionName), m_op(op), m_functionConfig(std::move(functionConfig))
{
}
@ -169,25 +176,28 @@ namespace CNTK
return NDShape(std::move(outputDims));
}
static NDShape TimesOpOutputShape(const NDShape& leftOperandShape, const NDShape& rightOperandShape)
static NDShape TimesOpOutputShape(const NDShape& leftOperandShape, const NDShape& rightOperandShape, size_t numOutputAxes)
{
if (rightOperandShape.NumAxes() > 2)
RuntimeError("The right operand of a times operation can have at most 2 axes");
if (numOutputAxes == 0)
InvalidArgument("Output #axes of times operation should be at least one");
size_t numOutputAxes = rightOperandShape.NumAxes();
if (numOutputAxes > leftOperandShape.NumAxes())
InvalidArgument("Output #axes of times operation can at most be the #axes of the left operand");
if (leftOperandShape.NumAxes() != 2)
RuntimeError("The left operand of a times operation must have 2 axes");
size_t numReductionAxes = leftOperandShape.NumAxes() - numOutputAxes;
std::vector<size_t> outputDims(numOutputAxes);
outputDims[0] = leftOperandShape[0];
if (numOutputAxes > 1)
outputDims[1] = rightOperandShape[1];
// The 'numReductionAxes' trailing dimensions of the left operand's shape must match the corresponding leading
// dimensions of the right operand
if (leftOperandShape[1] != rightOperandShape[0])
RuntimeError("Left operand's shape %s is not compatible with right operand's shape %s for the times operation", AsString(leftOperandShape).c_str(), AsString(rightOperandShape).c_str());
if (rightOperandShape.NumAxes() != numReductionAxes)
RuntimeError("The right operand's #axes in a times operation should equal #axes being reduced over!");
return NDShape(std::move(outputDims));
if (leftOperandShape.SubShape(numOutputAxes) != rightOperandShape)
InvalidArgument("The trailing dimensions of the left operand (%s) do not match the right operand's dimensions (%s)",
AsString(leftOperandShape.SubShape(numOutputAxes)).c_str(),
AsString(rightOperandShape).c_str());
return leftOperandShape.SubShape(0, numOutputAxes);
}
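A quick worked example of the generalized shape rule (column-major NDShape dimensions as elsewhere in this file): for a plain dense layer, the left operand W has shape { outputDim, inputDim }, the right operand x has shape { inputDim } and numOutputAxes is 1, so the trailing sub-shape { inputDim } of W matches x and the result shape is the leading sub-shape { outputDim }; with numOutputAxes = 2 and a left operand of shape { d1, d2, inputDim }, the result would be { d1, d2 }.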
static NDShape ReductionOpOutputShape(PrimitiveOpType op, const NDShape& operandShape, const std::vector<size_t>& reductionAxes)
@ -209,8 +219,22 @@ namespace CNTK
return NDShape(std::move(outputDims));
}
static NDShape ConvolutionOpOutputShape(const NDShape& operandShape, const NDShape& kernelShape, const NDShape& outputMapCount, const NDShape& strides,
const std::vector<bool>& sharing,
std::vector<bool>& autoPad, const NDShape& lowerPad, const NDShape& upperPad,
bool transpose)
{
decltype(&Microsoft::MSR::CNTK::ConvolveGeometry::ComputeOutputShape) computeOutputShapeFunc;
if (!transpose)
computeOutputShapeFunc = &Microsoft::MSR::CNTK::ConvolveGeometry::ComputeOutputShape;
else
computeOutputShapeFunc = &Microsoft::MSR::CNTK::ConvolveGeometry::ComputeInputShape;
return AsNDShape(computeOutputShapeFunc(AsTensorShape(operandShape, true), AsTensorShape(kernelShape, true), AsTensorShape(outputMapCount, true), AsTensorShape(strides, true), sharing, autoPad, AsTensorShape(lowerPad, true), AsTensorShape(upperPad, true)));
}
// TODO: Reconcile this with the ComputationNode::Validate functionality in core CNTK to avoid duplication of inference logic
static std::vector<Variable> GetOutputVariables(PrimitiveOpType op, const std::vector<Variable>& inputs, Function* owner)
static std::vector<Variable> GetOutputVariables(PrimitiveOpType op, const std::vector<Variable>& inputs, Function* owner, const Dictionary& functionConfig)
{
std::vector<Variable> outputs;
@ -247,6 +271,17 @@ namespace CNTK
assert(inputs.size() == 1);
outputs.push_back(Variable(UnaryElementwiseOpOutputShape(inputs[0].Shape()), outputDataType, owner, outputDynamicAxes));
break;
case PrimitiveOpType::Pooling:
{
assert(inputs.size() == 1);
auto poolingWindowsShape = functionConfig[L"poolingWindowShape"].GetValue<NDShape>();
auto strides = functionConfig[L"strides"].GetValue<NDShape>();
auto lowerPad = functionConfig[L"lowerPad"].GetValue<NDShape>();
auto upperPad = functionConfig[L"upperPad"].GetValue<NDShape>();
auto autoPadding = AsBasicElementTypeVector<bool>(functionConfig[L"autoPadding"].GetValue<std::vector<DictionaryValue>>());
outputs.push_back(Variable(ConvolutionOpOutputShape(inputs[0].Shape(), poolingWindowsShape, { 1 }, strides, { true }, autoPadding, lowerPad, upperPad, false), outputDataType, owner, outputDynamicAxes));
break;
}
case PrimitiveOpType::Plus:
case PrimitiveOpType::Minus:
case PrimitiveOpType::ElementTimes:
@ -260,9 +295,34 @@ namespace CNTK
outputs.push_back(Variable(BinaryElementwiseOpOutputShape(op, inputs[0].Shape(), inputs[1].Shape()), outputDataType, owner, outputDynamicAxes));
break;
case PrimitiveOpType::Times:
{
assert(inputs.size() == 2);
outputs.push_back(Variable(TimesOpOutputShape(inputs[0].Shape(), inputs[1].Shape()), outputDataType, owner, outputDynamicAxes));
// TODO: Support dynamic axes on the left operand
if (!inputs[0].DynamicAxes().empty())
LogicError("Dynamic axes are currently unsupported for left operand of a Times operation");
size_t numOutputAxes = functionConfig[L"numOutputAxes"].GetValue<size_t>();
outputs.push_back(Variable(TimesOpOutputShape(inputs[0].Shape(), inputs[1].Shape(), numOutputAxes), outputDataType, owner, outputDynamicAxes));
break;
}
case PrimitiveOpType::Convolution:
{
assert(inputs.size() == 2);
auto strides = functionConfig[L"strides"].GetValue<NDShape>();
auto lowerPad = functionConfig[L"lowerPad"].GetValue<NDShape>();
auto upperPad = functionConfig[L"upperPad"].GetValue<NDShape>();
auto sharing = AsBasicElementTypeVector<bool>(functionConfig[L"sharing"].GetValue<std::vector<DictionaryValue>>());
auto autoPadding = AsBasicElementTypeVector<bool>(functionConfig[L"autoPadding"].GetValue<std::vector<DictionaryValue>>());
bool transpose = functionConfig[L"transpose"].GetValue<bool>();
if (inputs[0].Shape().NumAxes() < inputs[1].Shape().NumAxes())
InvalidArgument("The convolution map should have at least as many axes as the shape of the input it operates on!");
NDShape outputMapCount, kernelShape;
std::tie(outputMapCount, kernelShape) = GetConvolutionOutputMapCountAndKernelShape(inputs[0].Shape(), inputs[1].Shape());
outputs.push_back(Variable(ConvolutionOpOutputShape(inputs[1].Shape(), kernelShape, outputMapCount, strides, sharing, autoPadding, lowerPad, upperPad, transpose), outputDataType, owner, outputDynamicAxes));
break;
}
case PrimitiveOpType::SquaredError:
case PrimitiveOpType::CrossEntropyWithSoftmax:
case PrimitiveOpType::ClassificationError:
@ -303,6 +363,9 @@ namespace CNTK
outputs.push_back(Variable(ReductionOpOutputShape(op, inputs[0].Shape(), reductionAxes), outputDataType, owner, reductionOutputDynamicAxes));
break;
}
case PrimitiveOpType::BatchNormalization:
outputs.push_back(Variable(UnaryElementwiseOpOutputShape(inputs[0].Shape()), outputDataType, owner, outputDynamicAxes));
break;
case PrimitiveOpType::Combine:
outputs = inputs;
break;
@ -350,6 +413,10 @@ namespace CNTK
template <typename ElementType>
friend void SaveAsLegacyModel(const FunctionPtr& rootFunction, const std::wstring& modelFile);
friend void ComputeInputPerDimMeansAndInvStdDevs(const MinibatchSourcePtr& minibatchSource,
std::unordered_map<StreamInfo, std::pair<NDArrayViewPtr, NDArrayViewPtr>>& computedMeanAndInvStdDevs,
const DeviceDescriptor& device /*= DeviceDescriptor::CPUDevice()*/);
public:
static CompositeFunctionPtr Create(const FunctionPtr& rootFunction, const std::wstring& name = L"")
{
@ -425,6 +492,7 @@ namespace CNTK
static void PopulateComputationNodeGradient(const std::pair<Variable, ValuePtr>& variableGradient, Microsoft::MSR::CNTK::ComputationNodeBasePtr& computationNode);
void PopulateNetworkGradients(const std::unordered_map<Variable, ValuePtr>& gradients);
static void GetNodeOutputOrGradient(Variable var, ValuePtr& varValue, Microsoft::MSR::CNTK::ComputationNodeBasePtr& computationNode, bool getGradient);
void GetNetworkOutputs(std::unordered_map<Variable, ValuePtr>& outputs);
void GetNetworkGradients(std::unordered_map<Variable, ValuePtr>& gradients);

View file

@ -11,6 +11,8 @@
#include "HeapMemoryProvider.h"
#include "ReaderShim.h"
#include "Function.h"
#include <tuple>
#include "ComputationNetworkBuilder.h"
using namespace Microsoft::MSR::CNTK;
@ -22,21 +24,21 @@ namespace CNTK
}
CompositeMinibatchSource::CompositeMinibatchSource(const Dictionary& configuration)
: m_startNewEpoch(true), m_nextEpochIndex(0), m_prevMinibatchSize(0)
: m_epochEndReached(false), m_prevMinibatchSize(0), m_epochSize(SIZE_MAX)
{
ConfigParameters config;
std::wstringstream s;
for (const auto& keyValuePair : *(configuration.m_dictionaryData))
{
AddConfigString(s, keyValuePair.first, keyValuePair.second, 0);
}
config.Parse(msra::strfun::utf8(s.str()));
const wchar_t* epochSizeConfigurationKey = L"epochSize";
if (!configuration.Contains(epochSizeConfigurationKey))
InvalidArgument("'epochSize' value must be configured when constructing a CNTK built-in composite MinibatchSource!");
if (configuration.Contains(epochSizeConfigurationKey))
m_epochSize = configuration[epochSizeConfigurationKey].GetValue<size_t>();
m_epochSize = configuration[epochSizeConfigurationKey].GetValue<size_t>();
if (m_epochSize == 0)
m_epochSize = Microsoft::MSR::CNTK::requestDataSize;
typedef Reader*(*CreateCompositeDataReaderProc)(const ConfigParameters* parameters);
CreateCompositeDataReaderProc createReaderProc = (CreateCompositeDataReaderProc)Plugin().Load(L"CompositeDataReader", "CreateCompositeDataReader");
@ -47,79 +49,198 @@ namespace CNTK
m_streamInfos.insert({ streamDesc->m_name, streamDesc->m_id, AsStorageFormat(streamDesc->m_storageType), AsDataType(streamDesc->m_elementType), AsNDShape(*(streamDesc->m_sampleLayout)) });
}
/*virtual*/ bool CompositeMinibatchSource::GetNextMinibatch(std::unordered_map<StreamInfo, std::pair<size_t, ValuePtr>>& minibatchData) /*override*/
/*virtual*/ std::unordered_map<StreamInfo, MinibatchData> CompositeMinibatchSource::GetNextMinibatch(const std::unordered_map<StreamInfo, std::pair<size_t, size_t>>& perStreamMBSizeLimits,
const DeviceDescriptor& device /*= DeviceDescriptor::DefaultDevice()*/) /*override*/
{
// TODO: Support different minibatch sizes for different streams
size_t requestedMinibatchSize = 0;
for (const auto& val : minibatchData)
std::unordered_map<StreamInfo, MinibatchData> minibatchData;
if (!m_epochEndReached)
{
if (requestedMinibatchSize == 0)
requestedMinibatchSize = val.second.first;
else
// TODO: Support different minibatch sizes for different streams
size_t requestedMinibatchSizeInSamples = 0;
for (const auto& val : perStreamMBSizeLimits)
{
if (requestedMinibatchSize != val.second.first)
LogicError("Different minibatch sizes across different input streams is currently unsupported!");
}
}
size_t maxNumSequencesRequested = val.second.first;
size_t maxNumSamplesRequested = val.second.second;
if (requestedMinibatchSize == 0)
InvalidArgument("GetNextMinibatch: Requested minibatch sizes must be > 0");
// TODO: Specifying minibatch size in #sequences is currently unsupported
if (maxNumSequencesRequested != 0)
LogicError("Specifying minibatch size in #sequences is currently unsupported");
if (m_startNewEpoch)
{
// TODO: Add support for distributed reading
EpochConfiguration epochConfig = { 1, 0, requestedMinibatchSize, m_epochSize, m_nextEpochIndex, 0 };
m_compositeDataReader->StartEpoch(epochConfig);
m_prevMinibatchSize = requestedMinibatchSize;
}
if (requestedMinibatchSize != m_prevMinibatchSize)
LogicError("GetNextMinibatch: Changing minibatch sizes across calls is currently unsupported");
auto compositeReaderMinibatchData = m_compositeDataReader->ReadMinibatch();
m_startNewEpoch = compositeReaderMinibatchData.m_endOfEpoch;
if (m_startNewEpoch)
m_nextEpochIndex++;
auto compositeDataReaderStreamDescs = m_compositeDataReader->GetStreamDescriptions();
size_t numStreams = compositeDataReaderStreamDescs.size();
for (size_t i = 0; i < numStreams; ++i)
{
auto currentStreamDesc = compositeDataReaderStreamDescs[i];
auto sampleShape = AsNDShape(*(currentStreamDesc->m_sampleLayout));
auto minibatchDataEntryForCurrentStream = std::find_if(minibatchData.begin(), minibatchData.end(), [currentStreamDesc](const std::pair<StreamInfo, std::pair<size_t, ValuePtr>>& entry) {
return entry.first.m_id == currentStreamDesc->m_id;
});
auto minibatchValuePtr = minibatchDataEntryForCurrentStream->second.second;
if (compositeReaderMinibatchData.m_data.empty())
{
minibatchValuePtr = MakeSharedObject<Value>(MakeSharedObject<NDArrayView>(minibatchDataEntryForCurrentStream->first.m_elementType, sampleShape.AppendShape({ 0, 0 }), DeviceDescriptor::CPUDevice()));
continue;
}
auto currentStreamMinibatchData = compositeReaderMinibatchData.m_data[i];
if (currentStreamDesc->m_elementType == ElementType::tfloat)
{
auto dataMatrix = std::make_shared<Matrix<float>>(CPUDEVICE);
size_t sampleSize = currentStreamDesc->m_sampleLayout->GetNumElements();
// TODO: Eliminate the unnecessary CPU to CPU copy
ReaderShim<float>::FillMatrixFromStream(currentStreamDesc->m_storageType, dataMatrix.get(), sampleSize, currentStreamMinibatchData);
auto minibatchValueObject = CompositeFunction::GetValueObjectFromCNTKImplMatrixAndMBLayout<float>(sampleShape, *dataMatrix, currentStreamMinibatchData->m_layout, false);
// TODO: Should slice off the supplied Value object instead of reallocating, in cases the actual minibatch
// size is smaller than the supplied storage in the Value object
if ((minibatchValuePtr == nullptr) || (minibatchValuePtr->Data()->Shape() != minibatchValueObject->Data()->Shape()))
minibatchData[minibatchDataEntryForCurrentStream->first].second = minibatchValueObject;
if (requestedMinibatchSizeInSamples == 0)
requestedMinibatchSizeInSamples = maxNumSamplesRequested;
else
minibatchValuePtr->CopyFrom(*minibatchValueObject);
{
if (requestedMinibatchSizeInSamples != maxNumSamplesRequested)
LogicError("Different minibatch sizes across different input streams is currently unsupported!");
}
}
if (requestedMinibatchSizeInSamples == 0)
InvalidArgument("GetNextMinibatch: Requested minibatch sizes must be > 0");
if (m_prevMinibatchSize == 0)
{
// TODO: Add support for distributed reading
EpochConfiguration epochConfig = { 1, 0, requestedMinibatchSizeInSamples, m_epochSize, 0, 0 };
m_compositeDataReader->StartEpoch(epochConfig);
m_prevMinibatchSize = requestedMinibatchSizeInSamples;
}
if (requestedMinibatchSizeInSamples != m_prevMinibatchSize)
LogicError("GetNextMinibatch: Changing minibatch sizes across calls is currently unsupported");
auto compositeReaderMinibatchData = m_compositeDataReader->ReadMinibatch();
m_epochEndReached = compositeReaderMinibatchData.m_endOfEpoch;
auto compositeDataReaderStreamDescs = m_compositeDataReader->GetStreamDescriptions();
size_t numStreams = compositeDataReaderStreamDescs.size();
for (size_t i = 0; i < numStreams; ++i)
{
auto currentStreamDesc = compositeDataReaderStreamDescs[i];
auto iter = std::find_if(perStreamMBSizeLimits.begin(), perStreamMBSizeLimits.end(), [currentStreamDesc](const std::pair<StreamInfo, std::pair<size_t, size_t>>& entry) {
return entry.first.m_id == currentStreamDesc->m_id;
});
if (iter == perStreamMBSizeLimits.end())
continue;
auto& currentStreamInfo = iter->first;
auto sampleShape = AsNDShape(*(currentStreamDesc->m_sampleLayout));
ValuePtr minibatchValuePtr;
if (compositeReaderMinibatchData.m_data.empty())
{
minibatchValuePtr = MakeSharedObject<Value>(MakeSharedObject<NDArrayView>(currentStreamInfo.m_elementType, sampleShape.AppendShape({ 0, 0 }), DeviceDescriptor::CPUDevice()));
continue;
}
auto currentStreamMinibatchData = compositeReaderMinibatchData.m_data[i];
if (currentStreamDesc->m_elementType == ElementType::tfloat)
{
auto dataMatrix = std::make_shared<Matrix<float>>(CPUDEVICE);
size_t sampleSize = currentStreamDesc->m_sampleLayout->GetNumElements();
// TODO: Eliminate the unnecessary CPU to CPU copy
ReaderShim<float>::FillMatrixFromStream(currentStreamDesc->m_storageType, dataMatrix.get(), sampleSize, currentStreamMinibatchData);
minibatchValuePtr = CompositeFunction::GetValueObjectFromCNTKImplMatrixAndMBLayout<float>(sampleShape, *dataMatrix, currentStreamMinibatchData->m_layout, false);
size_t numSamples = currentStreamMinibatchData->m_layout->GetActualNumSamples();
size_t numSequences = currentStreamMinibatchData->m_layout->GetNumSequences();
minibatchData[currentStreamInfo] = { numSequences, numSamples, minibatchValuePtr };
}
else
LogicError("Input data of type other than DataType::Float is currently unsupported by the CNTK built-in composite MinibatchSource!");
}
else
LogicError("Double precision input data is currently unsupported by the CNTK built-in composite MinibatchSource!");
}
return true;
return minibatchData;
}
void ComputeInputPerDimMeansAndInvStdDevs(const MinibatchSourcePtr& minibatchSource,
std::unordered_map<StreamInfo, std::pair<NDArrayViewPtr, NDArrayViewPtr>>& computedMeanAndInvStdDevs,
const DeviceDescriptor& device /*= DeviceDescriptor::CPUDevice()*/)
{
typedef std::shared_ptr<ComputationNode<float>> ComputationNodePtr;
const auto& minibatchSourceStreams = minibatchSource->StreamInfos();
auto computationNetwork = std::make_shared<ComputationNetwork>(AsCNTKImplDeviceId(device));
ComputationNetworkBuilder<float> builder(*computationNetwork);
std::vector<ComputationNodeBasePtr> allInputNodes;
std::unordered_map<StreamInfo, ComputationNodeBasePtr> streamToInputNodeMap;
std::unordered_map<StreamInfo, Variable> streamToDummyInputVariableMap;
std::unordered_map<StreamInfo, ComputationNodeBasePtr> streamToMeanNodeMap;
std::unordered_map<StreamInfo, ComputationNodeBasePtr> streamToInvStdDevNodeMap;
size_t totalSizePerSample = 0;
for (auto& currentStreamKV : computedMeanAndInvStdDevs)
{
auto currentStreamInfo = currentStreamKV.first;
if (minibatchSourceStreams.find(currentStreamInfo) == minibatchSourceStreams.end())
InvalidArgument("ComputeMeanAndVariance: Stream for which mean and variance is to be computed is not supported by the specified minibatchSource");
if (currentStreamInfo.m_elementType != DataType::Float)
LogicError("Input data of type other than DataType::Float is currently unsupported by the CNTK built-in composite MinibatchSource!");
auto inputVariableShape = currentStreamInfo.m_sampleLayout;
auto inputTensorShape = AsTensorShape(inputVariableShape);
totalSizePerSample += (inputVariableShape.TotalSize() * sizeof(float));
ComputationNodePtr inputNode;
Variable inputVariable;
if (currentStreamInfo.m_storageFormat != StorageFormat::Dense)
{
inputNode = builder.CreateSparseInputNode(currentStreamInfo.m_name, inputTensorShape);
inputVariable = Variable(inputVariableShape, true, DataType::Float, currentStreamInfo.m_name);
}
else
{
inputNode = builder.CreateInputNode(currentStreamInfo.m_name, inputTensorShape);
inputVariable = Variable(inputVariableShape, DataType::Float, currentStreamInfo.m_name);
}
allInputNodes.push_back(inputNode);
streamToInputNodeMap[currentStreamInfo] = inputNode;
streamToDummyInputVariableMap[currentStreamInfo] = inputVariable;
streamToMeanNodeMap[currentStreamInfo] = builder.Mean(inputNode);
streamToInvStdDevNodeMap[currentStreamInfo] = builder.InvStdDev(inputNode);
}
computationNetwork->CompileNetwork();
computationNetwork->AllocateAllMatrices(computationNetwork->RootNodes(), {}, nullptr);
ScopedNetworkOperationMode modeGuard(computationNetwork, NetworkOperationMode::preComputing);
// initialize
auto preComputeNodes = computationNetwork->GetNodesRequiringPreComputation();
for (auto & preComputeNode : preComputeNodes)
dynamic_pointer_cast<IPreComputeNode>(preComputeNode)->MarkComputed(false /*begin accumulating*/);
const size_t maxMinibatchDataSize = (1 << 27); // 128 MB
const size_t minibatchSize = maxMinibatchDataSize / totalSizePerSample;
std::unordered_map<StreamInfo, std::pair<size_t, size_t>> minibatchSizeLimits;
for (auto& currentStreamKV : computedMeanAndInvStdDevs)
minibatchSizeLimits.insert(std::make_pair(currentStreamKV.first, std::make_pair((size_t)0, minibatchSize)));
for (;;)
{
auto minibatchData = minibatchSource->GetNextMinibatch(minibatchSizeLimits, device);
if (minibatchData.empty())
break;
for (auto& currentStreamKV : computedMeanAndInvStdDevs)
CompositeFunction::PopulateComputationNodeValue<float>({ streamToDummyInputVariableMap[currentStreamKV.first], minibatchData[currentStreamKV.first].m_data }, streamToInputNodeMap[currentStreamKV.first]);
ComputationNetwork::BumpEvalTimeStamp(allInputNodes);
computationNetwork->ForwardProp(preComputeNodes);
}
// finalize
for (auto & preComputeNode : preComputeNodes)
dynamic_pointer_cast<IPreComputeNode>(preComputeNode)->MarkComputed(true /*done accumulating*/);
// Copy out the results
for (auto& currentStreamKV : computedMeanAndInvStdDevs)
{
ValuePtr mean, invStdDev;
if (computedMeanAndInvStdDevs[currentStreamKV.first].first != nullptr)
mean = MakeSharedObject<Value>(computedMeanAndInvStdDevs[currentStreamKV.first].first);
if (computedMeanAndInvStdDevs[currentStreamKV.first].second != nullptr)
invStdDev = MakeSharedObject<Value>(computedMeanAndInvStdDevs[currentStreamKV.first].second);
CompositeFunction::GetNodeOutputOrGradient(streamToDummyInputVariableMap[currentStreamKV.first], mean, streamToMeanNodeMap[currentStreamKV.first], false /*getGradient*/);
CompositeFunction::GetNodeOutputOrGradient(streamToDummyInputVariableMap[currentStreamKV.first], invStdDev, streamToInvStdDevNodeMap[currentStreamKV.first], false /*getGradient*/);
if (computedMeanAndInvStdDevs[currentStreamKV.first].first == nullptr)
computedMeanAndInvStdDevs[currentStreamKV.first].first = mean->Data();
if (computedMeanAndInvStdDevs[currentStreamKV.first].second == nullptr)
computedMeanAndInvStdDevs[currentStreamKV.first].second = invStdDev->Data();
}
}
}

View file

@ -19,15 +19,14 @@ namespace CNTK
virtual const std::unordered_set<StreamInfo>& StreamInfos() override { return m_streamInfos; }
virtual bool GetNextMinibatch(std::unordered_map<StreamInfo, std::pair<size_t, ValuePtr>>& minibatchData) override;
virtual std::unordered_map<StreamInfo, MinibatchData> GetNextMinibatch(const std::unordered_map<StreamInfo, std::pair<size_t, size_t>>& perStreamMBSizeLimits,
const DeviceDescriptor& device = DeviceDescriptor::DefaultDevice()) override;
private:
std::unordered_set<StreamInfo> m_streamInfos;
std::shared_ptr<Microsoft::MSR::CNTK::Reader> m_compositeDataReader;
bool m_startNewEpoch;
size_t m_nextEpochIndex;
bool m_epochEndReached;
size_t m_prevMinibatchSize;
size_t m_epochSize;
};
}

View file

@ -316,7 +316,17 @@ namespace CNTK
}
template <typename ElementType>
NDArrayViewPtr NDArrayView::RandomUniform(const NDShape& shape, double rangeBegin, double rangeEnd, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::DefaultDevice()*/)
/*static*/ NDArrayViewPtr NDArrayView::RandomNormal(const NDShape& shape, double mean, double stdDev, unsigned long seed, const DeviceDescriptor& device /*= DeviceDescriptor::DefaultDevice()*/)
{
auto matrixDims = GetMatrixDimensions(shape);
auto randomNormalMatrix = std::make_shared<Matrix<ElementType>>(Matrix<ElementType>::RandomGaussian(matrixDims.first, matrixDims.second, AsCNTKImplDeviceId(device), (ElementType)mean, (ElementType)stdDev, seed));
auto tensorView = new TensorView<ElementType>(randomNormalMatrix, AsTensorShape(shape));
return MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), device, StorageFormat::Dense, shape, false, tensorView);
}
template <typename ElementType>
/*static*/ NDArrayViewPtr NDArrayView::RandomUniform(const NDShape& shape, double rangeBegin, double rangeEnd, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::DefaultDevice()*/)
{
auto matrixDims = GetMatrixDimensions(shape);
auto randomUniformMatrix = std::make_shared<Matrix<ElementType>>(Matrix<ElementType>::RandomUniform(matrixDims.first, matrixDims.second, AsCNTKImplDeviceId(device), (ElementType)rangeBegin, (ElementType)rangeEnd, seed));
@ -329,6 +339,9 @@ namespace CNTK
template CNTK_API NDArrayViewPtr NDArrayView::RandomUniform<float>(const NDShape& shape, double rangeBegin, double rangeEnd, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::DefaultDevice()*/);
template CNTK_API NDArrayViewPtr NDArrayView::RandomUniform<double>(const NDShape& shape, double rangeBegin, double rangeEnd, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::DefaultDevice()*/);
template CNTK_API NDArrayViewPtr NDArrayView::RandomNormal<float>(const NDShape& shape, double mean, double stdDev, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::DefaultDevice()*/);
template CNTK_API NDArrayViewPtr NDArrayView::RandomNormal<double>(const NDShape& shape, double mean, double stdDev, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::DefaultDevice()*/);
template CNTK_API const float* NDArrayView::DataBuffer<float>() const;
template CNTK_API const double* NDArrayView::DataBuffer<double>() const;

View file

@ -11,6 +11,7 @@
#include <string>
#include "Config.h"
#include "Reader.h"
#include "ConvolutionEngine.h"
namespace CNTK
{
@ -118,14 +119,15 @@ namespace CNTK
}
}
inline Microsoft::MSR::CNTK::TensorShape AsTensorShape(const NDShape& viewShape)
inline Microsoft::MSR::CNTK::TensorShape AsTensorShape(const NDShape& viewShape, bool preserveRank = false)
{
const size_t maxNumAxesSupportedByTensorView = 12;
if (viewShape.NumAxes() > maxNumAxesSupportedByTensorView)
LogicError("The number of requested axes exceeds the currently supported limit");
// TensorShape is required to be at least 2D
Microsoft::MSR::CNTK::SmallVector<size_t> tensorViewShape(std::max<size_t>(2, viewShape.NumAxes()));
size_t minRankSize = preserveRank ? viewShape.NumAxes() : 2;
Microsoft::MSR::CNTK::SmallVector<size_t> tensorViewShape(std::max<size_t>(minRankSize, viewShape.NumAxes()));
for (size_t i = 0; i < tensorViewShape.size(); ++i)
tensorViewShape[i] = (i < viewShape.NumAxes()) ? viewShape[i] : 1;
@ -241,4 +243,74 @@ namespace CNTK
AddConfigString(s, value, numIndentationSpaces);
s << std::endl;
}
template <typename T>
inline std::vector<DictionaryValue> AsDictionaryValueVector(const std::vector<T>& basicElementTypeVector)
{
static_assert(std::is_same<T, bool>::value ||
std::is_same<T, size_t>::value ||
std::is_same<T, float>::value ||
std::is_same<T, double>::value, "Unsupported ValueType");
std::vector<DictionaryValue> dictionaryValueVector;
for (auto value : basicElementTypeVector)
dictionaryValueVector.push_back(value);
return dictionaryValueVector;
}
template <typename T>
inline std::vector<T> AsBasicElementTypeVector(const std::vector<DictionaryValue>& dictionaryValueVector)
{
static_assert(std::is_same<T, bool>::value ||
std::is_same<T, size_t>::value ||
std::is_same<T, float>::value ||
std::is_same<T, double>::value, "Unsupported ValueType");
std::vector<T> basicElementTypeVector;
for (auto value : dictionaryValueVector)
basicElementTypeVector.push_back(value.GetValue<T>());
return basicElementTypeVector;
}
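These two helpers round-trip between typed vectors and the DictionaryValue vectors used in reader and operator configurations (for example the transform list in the CIFAR minibatch-source config below); an illustrative round trip, assuming the conversions behave as sketched:

std::vector<bool> sharing = { true, false, true };
std::vector<DictionaryValue> asDict = AsDictionaryValueVector(sharing);
std::vector<bool> roundTripped = AsBasicElementTypeVector<bool>(asDict); // expected to equal 'sharing'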
inline PoolingType AsPoolingType(Microsoft::MSR::CNTK::PoolKind cntkPoolingKind)
{
switch (cntkPoolingKind)
{
case Microsoft::MSR::CNTK::PoolKind::Average:
return PoolingType::Average;
case Microsoft::MSR::CNTK::PoolKind::Max:
return PoolingType::Max;
default:
LogicError("Unknown pooling type");
}
}
inline Microsoft::MSR::CNTK::PoolKind AsCNTKPoolKind(PoolingType poolingType)
{
switch (poolingType)
{
case PoolingType::Average:
return Microsoft::MSR::CNTK::PoolKind::Average;
case PoolingType::Max:
return Microsoft::MSR::CNTK::PoolKind::Max;
default:
LogicError("Unknown pooling type");
}
}
inline std::pair<NDShape, NDShape> GetConvolutionOutputMapCountAndKernelShape(const NDShape& convolutionMapShape, const NDShape& operandShape)
{
auto outputMapCount = convolutionMapShape.SubShape(0, convolutionMapShape.NumAxes() - operandShape.NumAxes());
NDShape paddedOutputMapCount(operandShape.NumAxes(), 1);
for (size_t i = 0; i < outputMapCount.NumAxes(); ++i)
paddedOutputMapCount[paddedOutputMapCount.NumAxes() - 1 - i] = outputMapCount[outputMapCount.NumAxes() - 1 - i];
//for (size_t i = 0; i < outputMapCount.NumAxes(); ++i)
// paddedOutputMapCount[i] = outputMapCount[i];
NDShape kernelShape = convolutionMapShape.SubShape(outputMapCount.NumAxes());
return{ paddedOutputMapCount, kernelShape };
}
}
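To make the intended split concrete (an illustration, not part of the change): for the CIFAR layers below, a convolution map of shape { 16, 3, 3, 3 } over a { 32, 32, 3 } operand separates the leading output-map axis from the kernel shape, and the output-map count is padded to the operand rank from the trailing axis:

auto split = GetConvolutionOutputMapCountAndKernelShape(NDShape({ 16, 3, 3, 3 }), NDShape({ 32, 32, 3 }));
// split.first  -> { 1, 1, 16 }  (padded output-map count)
// split.second -> { 3, 3, 3 }   (kernelWidth, kernelHeight, inputChannels)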

View file

@ -433,7 +433,18 @@ private:
{
if (HasMBLayout())
LogicError("%ls: Minibatch data cannot be interpreted as a single 2D tensor.", NodeDescription().c_str());
else if (m_sampleLayout.GetRank() < 1 || m_sampleLayout.GetRank() > 2) // note: scalars are not stored as tensors of rank 0, but rather as 1-dim vectors. TODO: clean this up some day
bool notFlattenableTo2D = false;
for (size_t i = 2; i < m_sampleLayout.GetRank(); ++i)
{
if (!m_sampleLayout.CanFlatten(i))
{
notFlattenableTo2D = true;
break;
}
}
if (m_sampleLayout.GetRank() < 1 || ((m_sampleLayout.GetRank() > 2) && notFlattenableTo2D)) // note: scalars are not stored as tensors of rank 0, but rather as 1-dim vectors. TODO: clean this up some day
LogicError("%ls: Sample [%s] is not a column vector or matrix (1D or 2D tensor).", NodeDescription().c_str(), string(m_sampleLayout).c_str());
}
public:
@ -445,7 +456,11 @@ public:
size_t GetAsMatrixNumCols() const
{
CheckTensorIsMatrix();
return m_sampleLayout.GetRank() > 1 ? m_sampleLayout[1] : 1; // a column vector is also a Matrix
auto flattenedLayout = m_sampleLayout;
if (flattenedLayout.GetRank() > 2)
flattenedLayout.FlattenTo2DInPlace(1, "GetAsMatrixNumCols()");
return flattenedLayout.GetRank() > 1 ? flattenedLayout[1] : 1; // a column vector is also a Matrix
}
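The effect is that a higher-rank sample layout is now accepted as long as its trailing axes can be merged; for instance (expected behavior, not code from this change), a [13 x 5 x 4] layout flattens to [13 x 20], so GetAsMatrixNumCols() would return 20.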
// setting/updating the dimensions of the node

View file

@ -139,6 +139,16 @@ public:
fstream << "PoolKind: " << (int)m_poolKind << "\n";
}
TensorShape KernelShape() const { return m_kernelShape; }
TensorShape Strides() const { return m_stride; }
std::vector<bool> Sharing() const { return m_sharing; }
std::vector<bool> AutoPad() const { return m_autoPad; }
TensorShape LowerPad() const { return m_lowerPad; }
TensorShape UpperPad() const { return m_upperPad; }
bool Transpose() const { return m_transpose; }
size_t MaxTempMemSizeInSamples() const { return m_maxTempMemSizeInSamples; }
PoolKind PoolingKind() const { return m_poolKind; }
protected:
TensorShape m_kernelShape;
TensorShape m_mapCount;

View file

@ -463,6 +463,8 @@ public:
Base::AllocateGradientMatricesForInputs(matrixPool);
}
size_t OutputRank() const { return m_outputRank; }
private:
size_t m_outputRank;
};
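The accessor exposes the rank that the V2 Times operator now takes as its new numOutputAxes argument (see the FeedForwardTests change below); a minimal sketch with hypothetical W and x variables:

// numOutputAxes = 1 keeps the previous plain matrix-product behavior.
auto z = Times(W, x, /*numOutputAxes =*/ 1, L"timesOp");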

View file

@ -1872,6 +1872,12 @@ public:
m_blendTimeConst = std::numeric_limits<double>::infinity();
}
double NormalizationTimeConstant() const { return m_normTimeConst; }
double BlendTimeConstant() const { return m_blendTimeConst; }
bool Spatial() const { return m_spatial; }
double Epsilon() const { return m_epsilon; }
bool UseCNTKEngine() const { return m_useCntkEngine; }
private:
// Old versioning - do not use. Do not remove until we're sure there are no old models around.
struct VersionInfo

View file

@ -10,15 +10,16 @@ if [[ "$CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY" == "" || ! -d "$CNTK_EXTERNAL_T
fi
if [ "$OS" == "Windows_NT" ]; then
DataSourceDir=`cygpath -au $CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY`/Image/MNIST/v0
DataSourceDir=`cygpath -au $CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY`/Image
else
DataSourceDir=$CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY/Image/MNIST/v0
DataSourceDir=$CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY/Image
fi
# Copy the test data to the test run directory
DataDir=$TEST_RUN_DIR/TestData
mkdir $DataDir
cp -R $DataSourceDir/Train-28x28_cntk_text.txt $DataDir || exit $?
cp -R $DataSourceDir/MNIST/v0/Train-28x28_cntk_text.txt $DataDir || exit $?
cp -R $DataSourceDir/CIFAR/v0/cifar-10-batches-py $DataDir || exit $?
cp -R $TEST_DIR/../../../../Examples/Other/Simple2d/Data/SimpleDataTrain_cntk_text.txt $DataDir || exit $?
pushd $DataDir

View file

@ -0,0 +1,175 @@
#include "CNTKLibrary.h"
#include <functional>
#include "Common.h"
#include "Image.h"
using namespace CNTK;
MinibatchSourcePtr CreateCifarMinibatchSource(size_t epochSize)
{
size_t imageHeight = 32;
size_t imageWidth = 32;
size_t numChannels = 3;
size_t numClasses = 10;
auto mapFilePath = L"cifar-10-batches-py/train_map.txt";
auto meanFilePath = L"cifar-10-batches-py/CIFAR-10_mean.xml";
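// Per-image transforms applied by the ImageDeserializer: random crop (ratio 0.8), scale to 32x32x3, then mean subtraction.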
Dictionary cropTransformConfig;
cropTransformConfig[L"type"] = L"Crop";
cropTransformConfig[L"cropType"] = L"Random";
cropTransformConfig[L"cropRatio"] = L"0.8";
cropTransformConfig[L"jitterType"] = L"uniRatio";
Dictionary scaleTransformConfig;
scaleTransformConfig[L"type"] = L"Scale";
scaleTransformConfig[L"width"] = imageWidth;
scaleTransformConfig[L"height"] = imageHeight;
scaleTransformConfig[L"channels"] = numChannels;
scaleTransformConfig[L"interpolations"] = L"linear";
Dictionary meanTransformConfig;
meanTransformConfig[L"type"] = L"Mean";
meanTransformConfig[L"meanFile"] = meanFilePath;
std::vector<DictionaryValue> allTransforms = { cropTransformConfig, scaleTransformConfig, meanTransformConfig };
Dictionary featuresStreamConfig;
featuresStreamConfig[L"transforms"] = allTransforms;
Dictionary labelsStreamConfig;
labelsStreamConfig[L"labelDim"] = numClasses;
Dictionary inputStreamsConfig;
inputStreamsConfig[L"features"] = featuresStreamConfig;
inputStreamsConfig[L"labels"] = labelsStreamConfig;
Dictionary deserializerConfiguration;
deserializerConfiguration[L"type"] = L"ImageDeserializer";
deserializerConfiguration[L"module"] = L"ImageReader";
deserializerConfiguration[L"file"] = mapFilePath;
deserializerConfiguration[L"input"] = inputStreamsConfig;
Dictionary minibatchSourceConfiguration;
minibatchSourceConfiguration[L"epochSize"] = epochSize;
minibatchSourceConfiguration[L"deserializers"] = std::vector<DictionaryValue>({ deserializerConfiguration });
return CreateCompositeMinibatchSource(minibatchSourceConfiguration);
}
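// Builds the fixed 1x1 projection kernel used on ResNet shortcut paths: identity on the first inputDim
// channels and zero elsewhere, embedding an inputDim-channel tensor into outputDim channels.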
Constant GetProjectionMap(size_t outputDim, size_t inputDim, const DeviceDescriptor& device)
{
if (inputDim > outputDim)
throw std::runtime_error("Can only project from lower to higher dimensionality");
std::vector<float> projectionMapValues(inputDim * outputDim);
for (size_t i = 0; i < inputDim; ++i)
projectionMapValues[(i * outputDim) + i] = 1.0f;
auto projectionMap = MakeSharedObject<NDArrayView>(DataType::Float, NDShape({ outputDim, 1, 1, inputDim }), device);
projectionMap->CopyFrom(NDArrayView(NDShape({ outputDim, 1, 1, inputDim }), projectionMapValues));
return Constant(projectionMap);
}
FunctionPtr ResNetClassifier(Variable input, size_t numOutputClasses, const DeviceDescriptor& device, const std::wstring& outputName)
{
double convWScale = 7.07;
double convBValue = 0;
double fc1WScale = 0.4;
double fc1BValue = 0;
double scValue = 1;
size_t bnTimeConst = 4096;
size_t kernelWidth = 3;
size_t kernelHeight = 3;
double conv1WScale = 0.26;
size_t cMap1 = 16;
auto conv1 = ConvBNReLULayer(input, cMap1, kernelWidth, kernelHeight, 1, 1, conv1WScale, convBValue, scValue, bnTimeConst, device);
auto rn1_1 = ResNetNode2(conv1, cMap1, kernelWidth, kernelHeight, convWScale, convBValue, scValue, bnTimeConst, device);
auto rn1_2 = ResNetNode2(rn1_1, cMap1, kernelWidth, kernelHeight, convWScale, convBValue, scValue, bnTimeConst, device);
auto rn1_3 = ResNetNode2(rn1_2, cMap1, kernelWidth, kernelHeight, convWScale, convBValue, scValue, bnTimeConst, device);
size_t cMap2 = 32;
auto rn2_1_wProj = GetProjectionMap(cMap2, cMap1, device);
auto rn2_1 = ResNetNode2Inc(rn1_3, cMap2, kernelWidth, kernelHeight, convWScale, convBValue, scValue, bnTimeConst, rn2_1_wProj, device);
auto rn2_2 = ResNetNode2(rn2_1, cMap2, kernelWidth, kernelHeight, convWScale, convBValue, scValue, bnTimeConst, device);
auto rn2_3 = ResNetNode2(rn2_2, cMap2, kernelWidth, kernelHeight, convWScale, convBValue, scValue, bnTimeConst, device);
size_t cMap3 = 64;
auto rn3_1_wProj = GetProjectionMap(cMap3, cMap2, device);
auto rn3_1 = ResNetNode2Inc(rn2_3, cMap3, kernelWidth, kernelHeight, convWScale, convBValue, scValue, bnTimeConst, rn3_1_wProj, device);
auto rn3_2 = ResNetNode2(rn3_1, cMap3, kernelWidth, kernelHeight, convWScale, convBValue, scValue, bnTimeConst, device);
auto rn3_3 = ResNetNode2(rn3_2, cMap3, kernelWidth, kernelHeight, convWScale, convBValue, scValue, bnTimeConst, device);
// Global average pooling
size_t poolW = 8;
size_t poolH = 8;
size_t poolhStride = 1;
size_t poolvStride = 1;
//size_t numInputChannels = rn3_3->Output().Shape()[rn3_3->Output().Shape().NumAxes() - 1];
auto pool = Pooling(rn3_3, PoolingType::Average, { poolW, poolH, 1 }, { poolhStride, poolvStride, 1 });
// Output DNN layer
auto outTimesParams = Parameter(NDArrayView::RandomNormal<float>({ numOutputClasses, 1, 1, cMap3 }, 0.0, fc1WScale, 1, device));
auto outBiasParams = Parameter({ numOutputClasses }, (float)fc1BValue, device);
return Plus(Times(outTimesParams, pool), outBiasParams, outputName);
}
void TrainResNetCifarClassifier(const DeviceDescriptor& device, bool testSaveAndReLoad)
{
auto minibatchSource = CreateCifarMinibatchSource(SIZE_MAX);
auto streamInfos = minibatchSource->StreamInfos();
auto imageStreamInfo = std::find_if(streamInfos.begin(), streamInfos.end(), [](const StreamInfo& streamInfo) { return (streamInfo.m_name == L"features"); });
auto labelStreamInfo = std::find_if(streamInfos.begin(), streamInfos.end(), [](const StreamInfo& streamInfo) { return (streamInfo.m_name == L"labels"); });
auto inputImageShape = imageStreamInfo->m_sampleLayout;
// Change the input shape from HWC to CHW form
inputImageShape = { inputImageShape[1], inputImageShape[2], inputImageShape[0] };
const size_t numOutputClasses = labelStreamInfo->m_sampleLayout[0];
Variable imageInput(inputImageShape, imageStreamInfo->m_elementType, L"Images");
auto classifierOutputFunction = ResNetClassifier(imageInput, numOutputClasses, device, L"classifierOutput");
Variable classifierOutput = classifierOutputFunction;
auto labelsVar = Variable({ numOutputClasses }, labelStreamInfo->m_elementType, L"Labels");
auto trainingLossFunction = CrossEntropyWithSoftmax(classifierOutputFunction, labelsVar, L"lossFunction");
Variable trainingLoss = trainingLossFunction;
auto predictionFunction = ClassificationError(classifierOutputFunction, labelsVar, L"predictionError");
Variable prediction = predictionFunction;
auto imageClassifier = Combine({ trainingLossFunction, predictionFunction, classifierOutputFunction }, L"ImageClassifier");
if (testSaveAndReLoad)
SaveAndReloadModel<float>(imageClassifier, { &imageInput, &labelsVar, &trainingLoss, &prediction, &classifierOutput }, device);
double learningRatePerSample = 0.0078125;
Trainer trainer(imageClassifier, trainingLoss, { SGDLearner(imageClassifier->Parameters(), learningRatePerSample) });
const size_t minibatchSize = 32;
size_t numMinibatchesToTrain = 100;
std::unordered_map<StreamInfo, std::pair<size_t, size_t>> minibatchSizeLimits = { { *imageStreamInfo, std::make_pair((size_t)0, minibatchSize) }, { *labelStreamInfo, std::make_pair((size_t)0, minibatchSize) } };
size_t outputFrequencyInMinibatches = 20;
for (size_t i = 0; i < numMinibatchesToTrain; ++i)
{
auto minibatchData = minibatchSource->GetNextMinibatch(minibatchSizeLimits, device);
trainer.TrainMinibatch({ { imageInput, minibatchData[*imageStreamInfo].m_data }, { labelsVar, minibatchData[*labelStreamInfo].m_data } }, device);
if ((i % outputFrequencyInMinibatches) == 0)
{
float trainLossValue = PrevMinibatchTrainingLossValue(trainer);
printf("Minibatch %d: CrossEntropy loss = %.8g\n", (int)i, trainLossValue);
}
}
}
void TestCifarResnet()
{
TrainResNetCifarClassifier(DeviceDescriptor::GPUDevice(0), true /*testSaveAndReLoad*/);
}

View file

@ -101,5 +101,14 @@ inline CNTK::FunctionPtr FullyConnectedDNNLayer(CNTK::Variable input, size_t out
return nonLinearity(plusFunction);
}
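// Copies the previous minibatch's (possibly GPU-resident) scalar training-loss value into a CPU-side float.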
inline float PrevMinibatchTrainingLossValue(const CNTK::Trainer& trainer)
{
float trainLossValue = 0.0;
auto prevMBTrainingLossValue = trainer.PreviousMinibatchTrainingLossValue()->Data();
CNTK::NDArrayView cpuTrainLossValue(prevMBTrainingLossValue->Shape(), &trainLossValue, 1, CNTK::DeviceDescriptor::CPUDevice());
cpuTrainLossValue.CopyFrom(*prevMBTrainingLossValue);
return trainLossValue;
}
#pragma warning(pop)

View file

@ -18,7 +18,7 @@ FunctionPtr FullyConnectedFeedForwardClassifierNet(Variable input,
classifierRoot = FullyConnectedDNNLayer(classifierRoot, hiddenLayerDim, device, nonLinearity);
auto outputTimesParam = Parameter(NDArrayView::RandomUniform<float>({ numOutputClasses, hiddenLayerDim }, -0.5, 0.5, 1, device));
return Times(outputTimesParam, classifierRoot, outputName);
return Times(outputTimesParam, classifierRoot, 1, outputName);
}
std::wstring s_tempModelPath = L"feedForward.net";

View file

@ -0,0 +1,78 @@
#include "CNTKLibrary.h"
using namespace CNTK;
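// Convolution + batch-normalization building blocks shared by the CIFAR-10 ResNet test (CifarResNet.cpp above).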
inline FunctionPtr ConvBNLayer(Variable input, size_t outFeatureMapCount, size_t kernelWidth, size_t kernelHeight, size_t hStride, size_t vStride, double wScale, double bValue, double scValue, size_t bnTimeConst, const DeviceDescriptor& device)
{
size_t numInputChannels = input.Shape()[input.Shape().NumAxes() - 1];
auto convParams = Parameter(NDArrayView::RandomNormal<float>({ outFeatureMapCount, kernelWidth, kernelHeight, numInputChannels }, 0.0, wScale, 1, device));
auto convFunction = Convolution(convParams, input, { hStride, vStride, numInputChannels });
auto biasParams = Parameter({ outFeatureMapCount }, (float)bValue, device);
auto scaleParams = Parameter({ outFeatureMapCount }, (float)scValue, device);
auto runningMean = Constant({ outFeatureMapCount }, 0.0f, device);
auto runningInvStd = Constant({ outFeatureMapCount }, 0.0f, device);
return BatchNormalization(convFunction, scaleParams, biasParams, runningMean, runningInvStd, true /*spatial*/, (double)bnTimeConst, 0.0, 0.000000001 /* epsilon */);
}
inline FunctionPtr ConvBNReLULayer(Variable input, size_t outFeatureMapCount, size_t kernelWidth, size_t kernelHeight, size_t hStride, size_t vStride, double wScale, double bValue, double scValue, size_t bnTimeConst, const DeviceDescriptor& device)
{
auto convBNFunction = ConvBNLayer(input, outFeatureMapCount, kernelWidth, kernelHeight, hStride, vStride, wScale, bValue, scValue, bnTimeConst, device);
return ReLU(convBNFunction);
}
inline FunctionPtr ProjLayer(Variable wProj, Variable input, size_t hStride, size_t vStride, double bValue, double scValue, size_t bnTimeConst, const DeviceDescriptor& device)
{
size_t outFeatureMapCount = wProj.Shape()[0];
auto b = Parameter({ outFeatureMapCount }, (float)bValue, device);
auto sc = Parameter({ outFeatureMapCount }, (float)scValue, device);
auto m = Constant({ outFeatureMapCount }, 0.0f, device);
auto isd = Constant({ outFeatureMapCount }, 0.0f, device);
size_t numInputChannels = input.Shape()[input.Shape().NumAxes() - 1];
auto c = Convolution(wProj, input, { hStride, vStride, numInputChannels }, { true }, { false });
return BatchNormalization(c, sc, b, m, isd, true /*spatial*/, (double)bnTimeConst);
}
inline FunctionPtr ResNetNode2(Variable input, size_t outFeatureMapCount, size_t kernelWidth, size_t kernelHeight, double wScale, double bValue, double scValue, size_t bnTimeConst, const DeviceDescriptor& device)
{
auto c1 = ConvBNReLULayer(input, outFeatureMapCount, kernelWidth, kernelHeight, 1, 1, wScale, bValue, scValue, bnTimeConst, device);
auto c2 = ConvBNLayer(c1, outFeatureMapCount, kernelWidth, kernelHeight, 1, 1, wScale, bValue, scValue, bnTimeConst, device);
auto p = Plus(c2, input);
return ReLU(p);
}
inline FunctionPtr ResNetNode2Inc(Variable input, size_t outFeatureMapCount, size_t kernelWidth, size_t kernelHeight, double wScale, double bValue, double scValue, size_t bnTimeConst, Variable wProj, const DeviceDescriptor& device)
{
auto c1 = ConvBNReLULayer(input, outFeatureMapCount, kernelWidth, kernelHeight, 2, 2, wScale, bValue, scValue, bnTimeConst, device);
auto c2 = ConvBNLayer(c1, outFeatureMapCount, kernelWidth, kernelHeight, 1, 1, wScale, bValue, scValue, bnTimeConst, device);
auto cProj = ProjLayer(wProj, input, 2, 2, bValue, scValue, bnTimeConst, device);
auto p = Plus(c2, cProj);
return ReLU(p);
}
// Standard building block for ResNet with identity shortcut(option A).
inline FunctionPtr ResNetNode2A(Variable input, size_t outFeatureMapCount, size_t kernelWidth, size_t kernelHeight, double wScale, double bValue, double scValue, size_t bnTimeConst, const DeviceDescriptor& device)
{
auto conv1 = ConvBNReLULayer(input, outFeatureMapCount, kernelWidth, kernelHeight, 1, 1, wScale, bValue, scValue, bnTimeConst, device);
auto conv2 = ConvBNLayer(conv1, outFeatureMapCount, kernelWidth, kernelHeight, 1, 1, wScale, bValue, scValue, bnTimeConst, device);
// Identity shortcut followed by ReLU.
return ReLU(Plus(conv2, input));
}
// Standard building block for ResNet with padding(option B).
inline FunctionPtr ResNetNode2BInc(Variable input, size_t outFeatureMapCount, size_t kernelWidth, size_t kernelHeight, double wScale, double bValue, double scValue, size_t bnTimeConst, const DeviceDescriptor& device)
{
auto conv1 = ConvBNReLULayer(input, outFeatureMapCount, kernelWidth, kernelHeight, 2, 2, wScale, bValue, scValue, bnTimeConst, device);
auto conv2 = ConvBNLayer(conv1, outFeatureMapCount, kernelWidth, kernelHeight, 1, 1, wScale, bValue, scValue, bnTimeConst, device);
// Projection convolution layer.
auto cProj = ConvBNLayer(input, outFeatureMapCount, 1, 1, 2, 2, wScale, bValue, scValue, bnTimeConst, device);
return ReLU(Plus(conv2, cProj));
}

View file

@ -1,20 +1,27 @@
#include "CNTKLibrary.h"
#include <functional>
using namespace CNTK;
void NDArrayViewTests();
void TensorTests();
void FeedForwardTests();
void RecurrentFunctionTests();
void TrainerTests();
void TestCifarResnet();
int main()
{
NDArrayViewTests();
TensorTests();
FeedForwardTests();
RecurrentFunctionTests();
TrainerTests();
TestCifarResnet();
fprintf(stderr, "\nCNTKv2Library tests: Passed\n");
fflush(stderr);
}

View file

@ -33,16 +33,6 @@ MinibatchSourcePtr CreateTextMinibatchSource(const std::wstring& filePath, size_
return CreateCompositeMinibatchSource(minibatchSourceConfiguration);
}
float PrevMinibatchTrainingLossValue(const Trainer& trainer)
{
float trainLossValue = 0.0;
auto prevMBTrainingLossValue = trainer.PreviousMinibatchTrainingLossValue()->Data();
NDArrayView cpuTrainLossValue(prevMBTrainingLossValue->Shape(), &trainLossValue, 1, DeviceDescriptor::CPUDevice());
cpuTrainLossValue.CopyFrom(*prevMBTrainingLossValue);
return trainLossValue;
}
void TrainSimpleFeedForwardClassifer(const DeviceDescriptor& device)
{
const size_t inputDim = 2;
@ -50,9 +40,23 @@ void TrainSimpleFeedForwardClassifer(const DeviceDescriptor& device)
const size_t hiddenLayerDim = 50;
const size_t numHiddenLayers = 2;
const size_t minibatchSize = 25;
const size_t numSamplesPerSweep = 10000;
const size_t numSweepsToTrainWith = 2;
const size_t numMinibatchesToTrain = (numSamplesPerSweep * numSweepsToTrainWith) / minibatchSize;
auto minibatchSource = CreateTextMinibatchSource(L"SimpleDataTrain_cntk_text.txt", (size_t)2, (size_t)2, 0);
auto streamInfos = minibatchSource->StreamInfos();
auto featureStreamInfo = std::find_if(streamInfos.begin(), streamInfos.end(), [](const StreamInfo& streamInfo) { return (streamInfo.m_name == L"features"); });
auto labelStreamInfo = std::find_if(streamInfos.begin(), streamInfos.end(), [](const StreamInfo& streamInfo) { return (streamInfo.m_name == L"labels"); });
std::unordered_map<StreamInfo, std::pair<NDArrayViewPtr, NDArrayViewPtr>> inputMeansAndInvStdDevs = { { *featureStreamInfo, { nullptr, nullptr } } };
ComputeInputPerDimMeansAndInvStdDevs(minibatchSource, inputMeansAndInvStdDevs);
auto nonLinearity = std::bind(Sigmoid, _1, L"");
Variable input({ inputDim }, DataType::Float, L"features");
auto classifierOutput = FullyConnectedDNNLayer(input, hiddenLayerDim, device, nonLinearity);
auto normalizedinput = PerDimMeanVarianceNormalize(input, inputMeansAndInvStdDevs[*featureStreamInfo].first, inputMeansAndInvStdDevs[*featureStreamInfo].second);
auto classifierOutput = FullyConnectedDNNLayer(normalizedinput, hiddenLayerDim, device, nonLinearity);
for (size_t i = 1; i < numHiddenLayers; ++i)
classifierOutput = FullyConnectedDNNLayer(classifierOutput, hiddenLayerDim, device, nonLinearity);
@ -66,33 +70,23 @@ void TrainSimpleFeedForwardClassifer(const DeviceDescriptor& device)
auto oneHiddenLayerClassifier = CNTK::Combine({ trainingLoss, prediction, classifierOutput }, L"classifierModel");
const size_t minibatchSize = 25;
const size_t numSamplesPerSweep = 10000;
const size_t numSweepsToTrainWith = 2;
const size_t numMinibatchesToTrain = (numSamplesPerSweep * numSweepsToTrainWith) / minibatchSize;
auto minibatchSource = CreateTextMinibatchSource(L"SimpleDataTrain_cntk_text.txt", (size_t)2, (size_t)2, numSamplesPerSweep);
auto streamInfos = minibatchSource->StreamInfos();
auto featureStreamInfo = std::find_if(streamInfos.begin(), streamInfos.end(), [](const StreamInfo& streamInfo) { return (streamInfo.m_name == L"features"); });
auto labelStreamInfo = std::find_if(streamInfos.begin(), streamInfos.end(), [](const StreamInfo& streamInfo) { return (streamInfo.m_name == L"labels"); });
double learningRatePerSample = 0.02;
minibatchSource = CreateTextMinibatchSource(L"SimpleDataTrain_cntk_text.txt", (size_t)2, (size_t)2, SIZE_MAX);
Trainer trainer(oneHiddenLayerClassifier, trainingLoss, { SGDLearner(oneHiddenLayerClassifier->Parameters(), learningRatePerSample) });
std::unordered_map<StreamInfo, std::pair<size_t, ValuePtr>> minibatchData = { { *featureStreamInfo, { minibatchSize, nullptr } }, { *labelStreamInfo, { minibatchSize, nullptr } } };
std::unordered_map<StreamInfo, std::pair<size_t, size_t>> minibatchSizeLimits = { { *featureStreamInfo, std::make_pair((size_t)0, minibatchSize) }, { *labelStreamInfo, std::make_pair((size_t)0, minibatchSize) } };
size_t outputFrequencyInMinibatches = 20;
for (size_t i = 0; i < numMinibatchesToTrain; ++i)
{
minibatchSource->GetNextMinibatch(minibatchData);
trainer.TrainMinibatch({ { input, minibatchData[*featureStreamInfo].second }, { labels, minibatchData[*labelStreamInfo].second } }, device);
auto minibatchData = minibatchSource->GetNextMinibatch(minibatchSizeLimits, device);
trainer.TrainMinibatch({ { input, minibatchData[*featureStreamInfo].m_data }, { labels, minibatchData[*labelStreamInfo].m_data } }, device);
if ((i % outputFrequencyInMinibatches) == 0)
{
float trainLossValue = PrevMinibatchTrainingLossValue(trainer);
printf("Minibatch %d: CrossEntropy loss = %.8g\n", (int)i, trainLossValue);
printf("Minibatch %d: CrossEntropy loss = %.8g\n", (int)i, trainLossValue);
}
}
}
}
void TrainMNISTClassifier(const DeviceDescriptor& device)
{
@ -118,7 +112,7 @@ void TrainMNISTClassifier(const DeviceDescriptor& device)
const size_t numSweepsToTrainWith = 3;
const size_t numMinibatchesToTrain = (numSamplesPerSweep * numSweepsToTrainWith) / minibatchSize;
auto minibatchSource = CreateTextMinibatchSource(L"Train-28x28_cntk_text.txt", (size_t)784, (size_t)10, numSamplesPerSweep);
auto minibatchSource = CreateTextMinibatchSource(L"Train-28x28_cntk_text.txt", (size_t)784, (size_t)10, SIZE_MAX);
auto streamInfos = minibatchSource->StreamInfos();
auto featureStreamInfo = std::find_if(streamInfos.begin(), streamInfos.end(), [](const StreamInfo& streamInfo) {
@ -130,17 +124,17 @@ void TrainMNISTClassifier(const DeviceDescriptor& device)
double learningRatePerSample = 0.003125;
Trainer trainer(oneHiddenLayerClassifier, trainingLoss, { SGDLearner(oneHiddenLayerClassifier->Parameters(), learningRatePerSample) });
std::unordered_map<StreamInfo, std::pair<size_t, ValuePtr>> minibatchData = { { *featureStreamInfo, { minibatchSize, nullptr } }, { *labelStreamInfo, { minibatchSize, nullptr } } };
std::unordered_map<StreamInfo, std::pair<size_t, size_t>> minibatchSizeLimits = { { *featureStreamInfo, std::make_pair((size_t)0, minibatchSize) }, { *labelStreamInfo, std::make_pair((size_t)0, minibatchSize) } };
size_t outputFrequencyInMinibatches = 20;
for (size_t i = 0; i < numMinibatchesToTrain; ++i)
{
minibatchSource->GetNextMinibatch(minibatchData);
trainer.TrainMinibatch({ { input, minibatchData[*featureStreamInfo].second }, { labels, minibatchData[*labelStreamInfo].second } }, device);
auto minibatchData = minibatchSource->GetNextMinibatch(minibatchSizeLimits, device);
trainer.TrainMinibatch({ { input, minibatchData[*featureStreamInfo].m_data }, { labels, minibatchData[*labelStreamInfo].m_data } }, device);
if ((i % outputFrequencyInMinibatches) == 0)
{
float trainLossValue = PrevMinibatchTrainingLossValue(trainer);
printf("Minibatch %d: CrossEntropy loss = %.8g\n", (int)i, trainLossValue);
printf("Minibatch %d: CrossEntropy loss = %.8g\n", (int)i, trainLossValue);
}
}
}

View file

@ -109,6 +109,7 @@
</ClCompile>
</ItemDefinitionGroup>
<ItemGroup>
<ClCompile Include="CifarResNet.cpp" />
<ClCompile Include="FeedForwardTests.cpp" />
<ClCompile Include="Main.cpp" />
<ClCompile Include="NDArrayViewTests.cpp" />
@ -118,6 +119,7 @@
</ItemGroup>
<ItemGroup>
<ClInclude Include="Common.h" />
<ClInclude Include="Image.h" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">

View file

@ -33,10 +33,16 @@
<ClCompile Include="TrainerTests.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="CifarResNet.cpp">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="Common.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="Image.h">
<Filter>Header Files</Filter>
</ClInclude>
</ItemGroup>
</Project>